first commit

2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: ['https://github.com/lyuwenyu/cvperception/assets/17582080/2b4bfcd5-5c0f-45fd-badf-3f6e5b0249ac'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,21 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: lyuwenyu
+
+---
+
+**Star RTDETR**
+请先在RTDETR主页点击**star**以支持本项目
+Star RTDETR to help more people discover this project. 
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is. 
+If applicable, add screenshots to help explain your problem. 
+
+**To Reproduce**
+Steps to reproduce the behavior.
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,172 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+
+.DS_Store
+*.ipynb
+*.pth 
+*.pdparams
+*.onnx
+test.py
+rtdetr_pytorch/output/
+rtdetr_pytorch/dataset/
+rtdetrv2_pytorch/output/
+rtdetrv2_pytorch/dataset/
--- a/201
+++ b/201
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,140 @@
+English | [简体中文](README_cn.md)
+
+
+<h2 align="center">RT-DETR: DETRs Beat YOLOs on Real-time Object Detection</h2>
+<p align="center">
+    <!-- <a href="https://github.com/lyuwenyu/RT-DETR/blob/main/LICENSE">
+        <img alt="license" src="https://img.shields.io/badge/LICENSE-Apache%202.0-blue">
+    </a> -->
+    <a href="https://github.com/lyuwenyu/RT-DETR/blob/main/LICENSE">
+        <img alt="license" src="https://img.shields.io/github/license/lyuwenyu/RT-DETR">
+    </a>
+    <a href="https://github.com/lyuwenyu/RT-DETR/pulls">
+        <img alt="prs" src="https://img.shields.io/github/issues-pr/lyuwenyu/RT-DETR">
+    </a>
+    <a href="https://github.com/lyuwenyu/RT-DETR/issues">
+        <img alt="issues" src="https://img.shields.io/github/issues/lyuwenyu/RT-DETR?color=pink">
+    </a>
+    <a href="https://github.com/lyuwenyu/RT-DETR">
+        <img alt="stars" src="https://img.shields.io/github/stars/lyuwenyu/RT-DETR">
+    </a>
+    <a href="https://arxiv.org/abs/2304.08069">
+        <img alt="arXiv" src="https://img.shields.io/badge/arXiv-2304.08069-red">
+    </a>
+    <a href="mailto:lyuwenyu@foxmail.com">
+        <img alt="email" src="https://img.shields.io/badge/contact_me-email-yellow">
+    </a>
+</p>
+
+---
+
+
+This is the official implementation of papers 
+- [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)
+- [RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140)
+
+
+<details>
+<summary>Fig</summary>
+
+<table><tr>
+<td><img src=https://github.com/lyuwenyu/RT-DETR/assets/77494834/0ede1dc1-a854-43b6-9986-cf9090f11a61 border=0 width=500></td>
+<td><img src=https://github.com/user-attachments/assets/437877e9-1d4f-4d30-85e8-aafacfa0ec56 border=0 width=500></td>
+</tr></table>
+</details>
+
+
+## 🚀 Updates
+- \[2025.11.18\] Release the **newest** member of the RT-DETR family: [RT-DETRv4:Painlessly Furthering Real-Time Object Detection with Vision Foundation Models](https://github.com/RT-DETRs/RT-DETRv4).
+By harnessing the rapidly evolving capabilities of Vision Foundation Models (VFMs), we boost lightweight detectors and, without incurring any extra inference latency, significantly improve the performance of the full-size model.
+- \[2024.11.28\] Add torch tool for parameters and flops statistics. see [run_profile.py](./rtdetrv2_pytorch/tools/run_profile.py)
+- \[2024.10.10\] Add sliced inference support for small object detection. [#468](https://github.com/lyuwenyu/RT-DETR/pull/468)
+- \[2024.09.23\] Add ✅[Regnet and DLA34](https://github.com/lyuwenyu/RT-DETR/tree/main/rtdetr_pytorch) for RTDETR.
+- \[2024.08.27\] Add hubconf.py file to support torch hub.
+- \[2024.08.22\] Improve the performance of ✅ [RT-DETRv2-S](./rtdetrv2_pytorch/) to 48.1 mAP (<font color=green>+1.6</font> compared to RT-DETR-R18).
+- \[2024.07.24\] Release ✅ [RT-DETRv2](./rtdetrv2_pytorch/)!
+- \[2024.02.27\] Our work has been accepted to CVPR 2024!
+- \[2024.01.23\] Fix difference on data augmentation with paper in rtdetr_pytorch [#84](https://github.com/lyuwenyu/RT-DETR/commit/5dc64138e439247b4e707dd6cebfe19d8d77f5b1).
+- \[2023.11.07\] Add pytorch ✅ *rtdetr_r34vd* for requests [#107](https://github.com/lyuwenyu/RT-DETR/issues/107), [#114](https://github.com/lyuwenyu/RT-DETR/issues/114).
+- \[2023.11.05\] Upgrade the logic of `remap_mscoco_category` to facilitate training of custom datasets, see details in [*Train custom data*](./rtdetr_pytorch/) part. [#81](https://github.com/lyuwenyu/RT-DETR/commit/95fc522fd7cf26c64ffd2ad0c622c392d29a9ebf).
+- \[2023.10.23\] Add [*discussion for deployments*](https://github.com/lyuwenyu/RT-DETR/issues/95), supported onnxruntime, TensorRT, openVINO.
+- \[2023.10.12\] Add tuning code for the pytorch version; now you can tune rtdetr based on pretrained weights.
+- \[2023.09.19\] Upload ✅ [*pytorch weights*](https://github.com/lyuwenyu/RT-DETR/issues/42) convert from paddle version.
+- \[2023.08.24\] Release RT-DETR-R18 pretrained models on objects365. *49.2 mAP* and *217 FPS*.
+- \[2023.08.22\] Upload ✅ [*rtdetr_pytorch*](./rtdetr_pytorch/) source code. Please enjoy it!
+- \[2023.08.15\] Release RT-DETR-R101 pretrained models on objects365. *56.2 mAP* and *74 FPS*.
+- \[2023.07.30\] Release RT-DETR-R50 pretrained models on objects365. *55.3 mAP* and *108 FPS*.
+- \[2023.07.28\] Fix some bugs, and add some comments. [1](https://github.com/lyuwenyu/RT-DETR/pull/14), [2](https://github.com/lyuwenyu/RT-DETR/commit/3b5cbcf8ae3b907e6b8bb65498a6be7c6736eabc).
+- \[2023.07.13\] Upload ✅ [*training logs on coco*](https://github.com/lyuwenyu/RT-DETR/issues/8).
+- \[2023.05.17\] Release RT-DETR-R18, RT-DETR-R34, RT-DETR-R50-m (example of a scaled model).
+- \[2023.04.17\] Release RT-DETR-R50, RT-DETR-R101, RT-DETR-L, RT-DETR-X.
+
+## 📣 News
+- RTDETR and RTDETRv2 are now available in Hugging Face Transformers. [#413](https://github.com/lyuwenyu/RT-DETR/issues/413), [#549](https://github.com/lyuwenyu/RT-DETR/issues/549)
+- RTDETR is now available in [ultralytics/ultralytics](https://docs.ultralytics.com/zh/models/rtdetr/).
+
+## 📍 Implementations
+- 🔥 RT-DETRv2
+  - paddle: [code&weight](./rtdetrv2_paddle/)
+  - pytorch: [code&weight](./rtdetrv2_pytorch/)
+- 🔥 RT-DETR 
+  - paddle: [code&weight](./rtdetr_paddle)
+  - pytorch: [code&weight](./rtdetr_pytorch)
+
+
+| Model | Input shape | Dataset | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) |
+|:---:|:---:| :---:|:---:|:---:|:---:|:---:|:---:|
+| RT-DETR-R18 | 640 | COCO | 46.5 | 63.8 | 20 | 60 | 217 |
+| RT-DETR-R34 | 640 | COCO | 48.9 | 66.8 | 31 | 92 | 161 |
+| RT-DETR-R50-m | 640 | COCO | 51.3 | 69.6 | 36 | 100 | 145 |
+| RT-DETR-R50 |  640 | COCO | 53.1 | 71.3 | 42 | 136 | 108 |
+| RT-DETR-R101 | 640 | COCO | 54.3 | 72.7 | 76 | 259 | 74 |
+| RT-DETR-HGNetv2-L | 640 | COCO | 53.0 | 71.6 | 32 | 110 | 114 |
+| RT-DETR-HGNetv2-X | 640 | COCO | 54.8 | 73.1 | 67 | 234 | 74 |
+| RT-DETR-R18 | 640 | COCO + Objects365 | **49.2** | **66.6** | 20 | 60 | **217** |
+| RT-DETR-R50 | 640 | COCO + Objects365 | **55.3** | **73.4** | 42 | 136 | **108** |
+| RT-DETR-R101 | 640 | COCO + Objects365 | **56.2** | **74.6** | 76 | 259 | **74** |
+| **RT-DETRv2-S** | 640 | COCO  | **48.1** <font color=green>(+1.6)</font> | **65.1** | 20 | 60 | 217 |
+| **RT-DETRv2-M**<sup>*</sup> | 640 | COCO  | **49.9** <font color=green>(+1.0)</font> | **67.5** | 31 | 92 | 161 |
+| **RT-DETRv2-M** | 640 | COCO | **51.9** <font color=green>(+0.6)</font> | **69.9** | 36 | 100 | 145 |
+| **RT-DETRv2-L** | 640 | COCO | **53.4** <font color=green>(+0.3)</font> | **71.6** | 42 | 136 | 108 |
+| **RT-DETRv2-X** | 640 | COCO | 54.3 | **72.8** <font color=green>(+0.1)</font>  | 76 | 259| 74 |
+
+**Notes:**
+- `COCO + Objects365` in the table means finetuned model on COCO using pretrained weights trained on Objects365.
+
+
+## 🦄 Performance
+
+### 🏕️ Complex Scenarios
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/52743892-68c8-4e53-b782-9f89221739e4" width=500 >
+</div>
+
+### 🌋 Difficult Conditions
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/213cf795-6da6-4261-8549-11947292d3cb" width=500 >
+</div>
+
+## Citation
+If you use `RT-DETR` or `RTDETRv2` in your work, please use the following BibTeX entries:
+```
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Yian Zhao and Wenyu Lv and Shangliang Xu and Jinman Wei and Guanzhong Wang and Qingqing Dang and Yi Liu and Jie Chen},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{lv2024rtdetrv2improvedbaselinebagoffreebies,
+      title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, 
+      author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu},
+      year={2024},
+      eprint={2407.17140},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2407.17140}, 
+}
+```
--- a/README_cn.md
+++ b/README_cn.md
@@ -0,0 +1,64 @@
+简体中文 | [English](README.md)
+
+# RT-DETR 
+
+文章"[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)"和"[RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140)"的官方实现.
+
+<details>
+<summary>Fig</summary>
+
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/0ede1dc1-a854-43b6-9986-cf9090f11a61" width=500 >
+</div>
+
+</details>
+
+
+## 最新动态
+- 发布RT-DETRv2系列模型
+- 发布RT-DETR-R50, RT-DETR-R101模型
+- 发布RT-DETR-R50-m模型（scale模型的范例）
+- 发布RT-DETR-R34, RT-DETR-R18模型
+- 发布RT-DETR-L, RT-DETR-X模型
+
+
+## 代码仓库
+- 🔥 RT-DETRv2
+  - paddle: [code&weight](./rtdetrv2_paddle/)
+  - pytorch: [code&weight](./rtdetrv2_pytorch/)
+- 🔥 RT-DETR 
+  - paddle: [code&weight](./rtdetr_paddle)
+  - pytorch: [code&weight](./rtdetr_pytorch)
+
+
+## 简介
+<!-- We propose a **R**eal-**T**ime **DE**tection **TR**ansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge. Specifically, we design an efficient hybrid encoder to efficiently process multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexibly adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS.  -->
+RT-DETR是第一个实时端到端目标检测器。具体而言，我们设计了一个高效的混合编码器，通过解耦尺度内交互和跨尺度融合来高效处理多尺度特征，并提出了IoU感知的查询选择机制，以优化解码器查询的初始化。此外，RT-DETR支持通过使用不同的解码器层来灵活调整推理速度，而不需要重新训练，这有助于实时目标检测器的实际应用。RT-DETR-R50在COCO val2017上实现了53.1%的AP，在T4 GPU上实现了108FPS，RT-DETR-R101实现了54.3%的AP和74FPS，在速度和精度方面都优于相同规模的所有YOLO检测器。使用Objects365预训练之后, RT-DETR-R50 和 RT-DETR-R101 分别实现了 55.3% 和 56.2% AP的精度.
+若要了解更多细节，请参考我们的论文[paper](https://arxiv.org/abs/2304.08069).
+
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/c211a164-ddce-4084-8b71-fb73f29f363b" width=500 >
+</div>
+
+## 引用RT-DETR
+如果需要在你的研究中使用RT-DETR，请通过以下方式引用我们的论文：
+```
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Yian Zhao and Wenyu Lv and Shangliang Xu and Jinman Wei and Guanzhong Wang and Qingqing Dang and Yi Liu and Jie Chen},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{lv2024rtdetrv2improvedbaselinebagoffreebies,
+      title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, 
+      author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu},
+      year={2024},
+      eprint={2407.17140},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2407.17140}, 
+}
+```
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -0,0 +1,59 @@
+# 论文测速使用的部分代码和工具
+
+
+## 测试YOLO系列的速度 [in progress]
+以[yolov8](https://github.com/ultralytics/ultralytics)为例
+
+<details open>
+<summary>1. 转onnx </summary>  
+
+执行`yolov8_onnx.py`中的`export_onnx`函数，新增代码主要涉及输出格式的转换
+</details>
+
+
+<details>
+<summary>2. 插入nms </summary>
+
+使用`utils.py`中的`yolo_insert_nms`函数，导出onnx模型后使用[Netron](https://netron.app/)查看结构. <img width="924" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/cb466483-d3a3-4f23-a68d-7ab8825059c8">
+</details>
+
+
+<details>
+<summary>3. 转tensorrt </summary>
+
+可以使用`trtexec.md`中的脚本转换，或者使用`utils.py`中的Python代码转换
+```bash
+# trtexec -h
+trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
+```
+</details>
+
+
+<details>
+<summary>4. trtexec测速 </summary>
+
+可以使用`trtexec.md`中的脚本转换，去掉`--buildOnly`参数
+
+</details>
+
+
+
+<details>
+<summary>5. profile分析（可选） </summary>
+
+在4的基础之上加以下命令
+```bash
+nsys profile --force-overwrite=true  -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms 
+```
+可以使用nsys可视化分析
+<img width="1090" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/507d8bde-9e7c-4ae5-b571-976c540ef2c6">
+
+</details>
+
+
+<details>
+<summary>6. Python测速或者部署   </summary>
+
+在Coco val数据集上测模型的平均速度使用`trtinfer.py`中的代码推理
+
+</details>
--- a/benchmark/dataset.py
+++ b/benchmark/dataset.py
@@ -0,0 +1,102 @@
+'''by lyuwenyu
+'''
+
+import os
+import glob
+from PIL import Image
+
+import torch
+import torch.utils.data as data
+import torchvision
+import torchvision.transforms as T 
+import torchvision.transforms.functional as F 
+
+
+class ToTensor(T.ToTensor):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def __call__(self, pic):
+        if isinstance(pic, torch.Tensor):
+            return pic 
+        return super().__call__(pic)
+
+class PadToSize(T.Pad):
+    def __init__(self, size, fill=0, padding_mode='constant'):
+        super().__init__(0, fill, padding_mode)
+        self.size = size
+        self.fill = fill
+
+    def __call__(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be padded.
+
+        Returns:
+            PIL Image or Tensor: Padded image.
+        """
+        w, h = F.get_image_size(img)
+        padding = (0, 0, self.size[0] - w, self.size[1] - h)
+        return F.pad(img, padding, self.fill, self.padding_mode)
+
+
+class Dataset(data.Dataset):
+    def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None:
+        super().__init__()
+
+        self.device = device
+        self.size = 640
+
+        self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg')))
+
+        if preprocess is None: 
+            self.preprocess = T.Compose([
+                    T.Resize(size=639, max_size=640),
+                    PadToSize(size=(640, 640), fill=114),
+                    ToTensor(),
+                    T.ConvertImageDtype(torch.float),
+            ])
+        else:
+            self.preprocess = preprocess
+
+    def __len__(self, ):
+        return len(self.im_path_list)
+
+    def __getitem__(self, index):
+        # im = Image.open(self.img_path_list[index]).convert('RGB')
+        im = torchvision.io.read_file(self.im_path_list[index])
+        im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device)
+        _, h, w = im.shape # c,h,w
+
+        im = self.preprocess(im)
+
+        blob = {
+            'image': im, 
+            'im_shape': torch.tensor([self.size, self.size]).to(im.device),
+            'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device),
+            'orig_size': torch.tensor([w, h]).to(im.device),
+        }
+
+        return blob
+
+    @staticmethod
+    def post_process():
+        pass
+
+    @staticmethod
+    def collate_fn():
+        pass
+
+
+def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''):
+    '''show result
+    Keys:
+        'num_dets', 'det_boxes', 'det_scores', 'det_classes'
+    '''    
+    for i in range(blob['image'].shape[0]):
+        det_scores = outputs['det_scores'][i]
+        det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold]
+        
+        im = (blob['image'][i] * 255).to(torch.uint8)
+        im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2)
+        Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg')
--- a/benchmark/trtexec.md
+++ b/benchmark/trtexec.md
@@ -0,0 +1,13 @@
+
+```bash
+# build tensorrt engine 
+trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
+
+# using dynamic shapes
+# --explicitBatch --minShapes=image:1x3x640x640 --optShapes=image:8x3x640x640  --maxShapes=image:16x3x640x640 --shapes=image:8x3x640x640
+
+# timeline 
+nsys profile --force-overwrite=true  -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms  trtexec --loadEngine=./yolov8l_w_nms.engine --fp16 --avgRuns=10 --loadInputs='image:input_tensor.bin'
+
+# https://forums.developer.nvidia.com/t/about-loadinputs-in-trtexec/218880
+```
--- a/benchmark/trtinfer.py
+++ b/benchmark/trtinfer.py
@@ -0,0 +1,153 @@
+'''by lyuwenyu
+'''
+
+import time 
+import contextlib
+from collections import namedtuple, OrderedDict
+
+import torch
+import numpy as np
+import tensorrt as trt
+
+from utils import TimeProfiler
+
+class TRTInference(object):
+    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
+        self.engine_path = engine_path
+        self.device = device
+        self.backend = backend
+        self.max_batch_size = max_batch_size
+        
+        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)  
+
+        self.engine = self.load_engine(engine_path)
+
+        self.context = self.engine.create_execution_context()
+
+        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
+        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())
+
+        self.input_names = self.get_input_names()
+        self.output_names = self.get_output_names()
+        
+        if self.backend == 'cuda':
+            self.stream = cuda.Stream()
+
+        self.time_profile = TimeProfiler()
+
+    def init(self, ):
+        self.dynamic = False 
+
+    def load_engine(self, path):
+        '''load engine
+        '''
+        trt.init_libnvinfer_plugins(self.logger, '')
+        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
+    
+    def get_input_names(self, ):
+        names = []
+        for _, name in enumerate(self.engine):
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                names.append(name)
+        return names
+    
+    def get_output_names(self, ):
+        names = []
+        for _, name in enumerate(self.engine):
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
+                names.append(name)
+        return names
+
+    def get_bindings(self, engine, context, max_batch_size=32, device=None):
+        '''build binddings
+        '''
+        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
+        bindings = OrderedDict()
+        # max_batch_size = 1
+
+        for i, name in enumerate(engine):
+            shape = engine.get_tensor_shape(name)
+            dtype = trt.nptype(engine.get_tensor_dtype(name))
+
+            if shape[0] == -1:
+                dynamic = True 
+                shape[0] = max_batch_size
+                if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:  # dynamic
+                    context.set_input_shape(name, shape)
+
+            if self.backend == 'cuda':
+                if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                    data = np.random.randn(*shape).astype(dtype)
+                    ptr = cuda.mem_alloc(data.nbytes)
+                    bindings[name] = Binding(name, dtype, shape, data, ptr) 
+                else:
+                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
+                    ptr = cuda.mem_alloc(data.nbytes)
+                    bindings[name] = Binding(name, dtype, shape, data, ptr) 
+
+            else:
+                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
+                bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())
+
+        return bindings
+
+    def run_torch(self, blob):
+        '''torch input
+        '''
+        for n in self.input_names:
+            if self.bindings[n].shape != blob[n].shape:
+                self.context.set_input_shape(n, blob[n].shape) 
+                self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)
+
+        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
+        self.context.execute_v2(list(self.bindings_addr.values()))
+        outputs = {n: self.bindings[n].data for n in self.output_names}
+
+        return outputs
+
+
+    def async_run_cuda(self, blob):
+        '''numpy input
+        '''
+        for n in self.input_names:
+            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)
+        
+        bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
+        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)
+        
+        outputs = {}
+        for n in self.output_names:
+            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
+            outputs[n] = self.bindings[n].data
+        
+        self.stream.synchronize()
+        
+        return outputs
+    
+    def __call__(self, blob):
+        if self.backend == 'torch':
+            return self.run_torch(blob)
+
+        elif self.backend == 'cuda':
+            return self.async_run_cuda(blob)
+
+    def synchronize(self, ):
+        if self.backend == 'torch' and torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        elif self.backend == 'cuda':
+            self.stream.synchronize()
+    
+    def warmup(self, blob, n):
+        for _ in range(n):
+            _ = self(blob)
+
+    def speed(self, blob, n):
+        self.time_profile.reset()
+        for _ in range(n):
+            with self.time_profile:
+                _ = self(blob)
+
+        return self.time_profile.total / n 
+
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -0,0 +1,83 @@
+'''by lyuwenyu
+'''
+
+import time 
+import contextlib
+import numpy as np
+from PIL import Image
+from collections import OrderedDict
+
+import onnx
+import torch 
+import onnx_graphsurgeon
+
+
+def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'):
+    '''--loadInputs='image:input_tensor.bin'
+    '''
+    im = Image.open(path).resize(size)
+    data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255.
+    data.tofile(output_name)
+
+
+def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False):
+    '''
+    http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html
+    https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py
+    '''
+    onnx_model = onnx.load(path)
+
+    if simplify:
+        from onnxsim import simplify
+        onnx_model, _ = simplify(onnx_model,  overwrite_input_shapes={'image': [1, 3, 640, 640]})
+
+    graph = onnx_graphsurgeon.import_onnx(onnx_model)
+    graph.toposort()
+    graph.fold_constants()
+    graph.cleanup()
+
+    topk = max_output_boxes
+    attrs = OrderedDict(plugin_version='1',
+                        background_class=-1,
+                        max_output_boxes=topk,
+                        score_threshold=score_threshold,
+                        iou_threshold=iou_threshold,
+                        score_activation=False,
+                        box_coding=0, )
+
+    outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]),
+               onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]),
+               onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]),
+               onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])]
+
+    graph.layer(op='EfficientNMS_TRT', 
+                name="batched_nms", 
+                inputs=[graph.outputs[0], 
+                        graph.outputs[1]], 
+                outputs=outputs, 
+                attrs=attrs, )
+
+    graph.outputs = outputs
+    graph.cleanup().toposort()
+
+    onnx.save(onnx_graphsurgeon.export_onnx(graph), f'yolo_w_nms.onnx')
+
+
+class TimeProfiler(contextlib.ContextDecorator):
+    def __init__(self, ):
+        self.total = 0
+        
+    def __enter__(self, ):
+        self.start = self.time()
+        return self 
+    
+    def __exit__(self, type, value, traceback):
+        self.total += self.time() - self.start
+    
+    def reset(self, ):
+        self.total = 0
+    
+    def time(self, ):
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        return time.time()
--- a/benchmark/yolov8_onnx.py
+++ b/benchmark/yolov8_onnx.py
@@ -0,0 +1,73 @@
+'''by lyuwenyu
+'''
+
+import torch 
+import torchvision
+
+import numpy as np 
+import onnxruntime as ort 
+
+from utils import yolo_insert_nms
+
+class YOLOv8(torch.nn.Module):
+    # Wraps the network of an ultralytics YOLO checkpoint so that forward()
+    # returns (boxes, scores) instead of the raw prediction tensor.
+    def __init__(self, name) -> None:
+        super().__init__()
+        from ultralytics import YOLO
+        # Load a model
+        # build a new model from scratch
+        # model = YOLO(f'{name}.yaml')  
+
+        # load a pretrained model (recommended for training)
+        model = YOLO(f'{name}.pt')  
+        # Keep only the object held in the wrapper's `.model` attribute.
+        self.model = model.model
+
+    def forward(self, x):
+        '''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216
+        '''
+        # First element of the model output; the inline note gives its layout.
+        pred: torch.Tensor = self.model(x)[0] # n 84 8400,
+        # Move the per-prediction values to the last axis.
+        pred = pred.permute(0, 2, 1)
+        # The first 4 values are split off as the box; the rest become the scores.
+        nc = pred.shape[-1] - 4
+        boxes, scores = pred.split([4, nc], dim=-1)
+        # Convert boxes from center/size form to corner form.
+        boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
+
+        return boxes, scores
+
+
+
+def export_onnx(name='yolov8n'):
+    '''export onnx
+    '''
+    m = YOLOv8(name)
+
+    x = torch.rand(1, 3, 640, 640)
+    dynamic_axes = {
+        'image': {0: '-1'}
+    }
+    torch.onnx.export(m, x, f'{name}.onnx', 
+                      input_names=['image'], 
+                      output_names=['boxes', 'scores'], 
+                      opset_version=13, 
+                      dynamic_axes=dynamic_axes)
+
+    data = np.random.rand(1, 3, 640, 640).astype(np.float32)
+    sess = ort.InferenceSession(f'{name}.onnx')
+    _ = sess.run(output_names=None, input_feed={'image': data})
+
+
+if __name__ == '__main__':
+
+    # Export the named model to ONNX, then insert the NMS node into that file.
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--name', type=str, default='yolov8l')
+    parser.add_argument('--score_threshold', type=float, default=0.001)
+    parser.add_argument('--iou_threshold', type=float, default=0.7)
+    parser.add_argument('--max_output_boxes', type=int, default=300)
+    args = parser.parse_args()
+
+    export_onnx(name=args.name)
+    
+    yolo_insert_nms(path=f'{args.name}.onnx', 
+                    score_threshold=args.score_threshold, 
+                    iou_threshold=args.iou_threshold, 
+                    max_output_boxes=args.max_output_boxes, )
+
--- a/hubconf.py
+++ b/hubconf.py
@@ -0,0 +1,174 @@
+"""Copyright(c) 2024 lyuwenyu. All Rights Reserved.
+"""
+
+
+import os
+import sys
+from pathlib import Path
+from urllib.parse import urlparse
+
+# Put the bundled `rtdetrv2_pytorch` directory on sys.path so that the
+# `src.core` import below can be resolved.
+ROOT = Path(__file__).absolute().parent / 'rtdetrv2_pytorch'
+sys.path.append(str(ROOT))
+
+from src.core import YAMLConfig
+
+import torch
+import torch.nn as nn
+
+dependencies = ['torch', 'torchvision',]
+
+
+def _load_checkpoint(path: str, map_location='cpu'):
+    scheme = urlparse(str(path)).scheme
+    if not scheme:
+        state = torch.load(path, map_location=map_location)
+    else:
+        state = torch.hub.load_state_dict_from_url(path, map_location=map_location)
+    return state
+
+
+def _build_model(args, ):
+    """Build the deploy-mode model and postprocessor described by `args.config`.
+
+    When `args.resume` is non-empty the checkpoint is loaded first: the EMA
+    weights (`checkpoint['ema']['module']`) if present, otherwise
+    `checkpoint['model']`. The returned module maps
+    `(images, orig_target_sizes)` to the postprocessor's outputs.
+    """
+    cfg = YAMLConfig(args.config)
+
+    if args.resume:
+        checkpoint = _load_checkpoint(args.resume, map_location='cpu') 
+        if 'ema' in checkpoint:
+            state = checkpoint['ema']['module']
+        else:
+            state = checkpoint['model']
+
+        # NOTE load train mode state
+        cfg.model.load_state_dict(state)
+
+
+    class Model(nn.Module):
+        # Chains the deploy-mode network and postprocessor into a single module.
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+            
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            outputs = self.postprocessor(outputs, orig_target_sizes)
+            return outputs
+
+    return Model()
+
+
+# Maps each hub entry-point name to its YAML config (under ROOT) and the URL
+# of the checkpoint that is loaded when the model is built with weights.
+CONFIG = {
+    # rtdetr
+    'rtdetr_r18vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r18vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r34vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r34vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r50vd_m': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r50vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r50vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r101vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r101vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth',
+    },
+
+    # rtdetrv2
+    'rtdetrv2_r18vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth',
+    },
+    'rtdetrv2_r34vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth',
+    },
+    'rtdetrv2_r50vd_m': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth',
+    },
+    'rtdetrv2_r50vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth',
+    },
+    'rtdetrv2_r101vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth',
+    },
+}
+
+
+# rtdetr
+def rtdetr_r18vd(pretrained=True,):
+    args = type('Args', (), CONFIG['rtdetr_r18vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetr_r34vd(pretrained=True,):
+    args = type('Args', (), CONFIG['rtdetr_r34vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetr_r50vd_m(pretrained=True):
+    args = type('Args', (), CONFIG['rtdetr_r50vd_m'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetr_r50vd(pretrained=True):
+    args = type('Args', (), CONFIG['rtdetr_r50vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetr_r101vd(pretrained=True):
+    args = type('Args', (), CONFIG['rtdetr_r101vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+# rtdetrv2
+def rtdetrv2_r18vd(pretrained=True,):
+    args = type('Args', (), CONFIG['rtdetrv2_r18vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetrv2_r34vd(pretrained=True,):
+    args = type('Args', (), CONFIG['rtdetrv2_r34vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetrv2_r50vd_m(pretrained=True):
+    args = type('Args', (), CONFIG['rtdetrv2_r50vd_m'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetrv2_r50vd(pretrained=True):
+    args = type('Args', (), CONFIG['rtdetrv2_r50vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+def rtdetrv2_r101vd(pretrained=True):
+    args = type('Args', (), CONFIG['rtdetrv2_r101vd'])()
+    args.resume = args.resume if pretrained else ''
+    return _build_model(args, )
+
+
+# Short size-style aliases for the rtdetrv2 entry points defined above.
+rtdetrv2_s = rtdetrv2_r18vd
+rtdetrv2_m_r34 = rtdetrv2_r34vd
+rtdetrv2_m_r50 = rtdetrv2_r50vd_m
+rtdetrv2_l = rtdetrv2_r50vd
+rtdetrv2_x = rtdetrv2_r101vd
+
--- a/rtdetr_paddle/README.md
+++ b/rtdetr_paddle/README.md
@@ -0,0 +1,244 @@
+English | [简体中文](README_cn.md)
+
+## Model Zoo on COCO
+
+| Model | Epoch | Backbone  | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) |  T4 TensorRT FP16(FPS) | Weight | Config | Log
+|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---|
+| RT-DETR-R18 | 6x |  ResNet-18 | 640 | 46.5 | 63.8 | 20 | 60 | 217 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_dec3_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r18vd_6x_coco.yml) | [rtdetr_r18vd_dec3_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038864/rtdetr_r18vd_dec3_6x_coco_log.txt)
+| RT-DETR-R34 | 6x |  ResNet-34 | 640 | 48.9 | 66.8 | 31 | 92 | 161 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r34vd_dec4_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r34vd_6x_coco.yml) | [rtdetr_r34vd_dec4_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038861/rtdetr_r34vd_dec4_6x_coco_log.txt)
+| RT-DETR-R50-m | 6x |  ResNet-50 | 640 | 51.3 | 69.6 | 36 | 100 | 145 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_m_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml) | -
+| RT-DETR-R50 | 6x |  ResNet-50 | 640 | 53.1 | 71.3 | 42 | 136 | 108 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_6x_coco.yml) | [rtdetr_r50vd_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038669/rtdetr_r50vd_6x_coco_log.txt)
+| RT-DETR-R101 | 6x |  ResNet-101 | 640 | 54.3 | 72.7 | 76 | 259 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r101vd_6x_coco.yml) | [rtdetr_r101vd_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038707/rtdetr_r101vd_6x_coco_log.txt)
+| RT-DETR-L | 6x |  HGNetv2 | 640 | 53.0 | 71.6 | 32 | 110 | 114 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_l_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml) | [rtdetr_hgnetv2_l_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038753/rtdetr_hgnetv2_l_6x_coco_log.txt)
+| RT-DETR-X | 6x |  HGNetv2 | 640 | 54.8 | 73.1 | 67 | 234 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_x_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml) | [rtdetr_hgnetv2_x_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038795/rtdetr_hgnetv2_x_6x_coco_log.txt)
+
+**Notes:**
+- RT-DETR uses 4 GPUs for training.
+- RT-DETR was trained on COCO train2017 and evaluated on val2017.
+
+
+## Model Zoo on Objects365
+| Model | Epoch | Dataset | Input shape | $AP^{val}$ | $AP^{val}_{50}$ | T4 TensorRT FP16(FPS) | Weight | Log
+|:---:|:---:|:---:| :---:|:---:|:---:|:---:|:---:|:---:|
+RT-DETR-R18 | 1x | Objects365 | 640 | 22.9 | 31.2 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_1x_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12394706/rtdetr_r18vd_1x_objects365_log.txt)
+RT-DETR-R18 | 5x | COCO + Objects365 | 640 | **49.2** | **66.6** | **217** | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_5x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12416808/rtdetr_r18vd_5x_coco_objects365_log.txt)
+RT-DETR-R50 | 1x | Objects365 | 640 | 35.1 | 46.2 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_1x_objects365.pdparams) |[log.txt](https://github.com/lyuwenyu/RT-DETR/files/12193246/rtdetr_r50vd_1x_objects365_log.txt)
+RT-DETR-R50 | 2x | COCO + Objects365 | 640 | **55.3** | **73.4** | **108** | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_2x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12208338/rtdetr_r50vd_2x_coco_objects365_log.txt)
+RT-DETR-R101 | 1x | Objects365 | 640 | 36.8 | 48.3 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_1x_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12340691/rtdetr_r101vd_1x_objects365_log.txt)
+RT-DETR-R101 | 2x | COCO + Objects365 | 640 | **56.2** | **74.6** | **74** |[download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_2x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12340672/rtdetr_r101vd_2x_coco_objects365_log.txt)
+
+
+**Notes:**
+- `COCO + Objects365` in the table means the model was fine-tuned on COCO starting from weights pretrained on Objects365.
+
+
+
+## Quick start
+
+<details open>
+<summary>Install requirements</summary>
+
+<!-- - PaddlePaddle == 2.4.2 -->
+```bash
+pip install -r requirements.txt
+```
+
+</details>
+
+<details>
+<summary>Compile (optional)</summary>
+
+```bash
+cd ./ppdet/modeling/transformers/ext_op/
+
+python setup_ms_deformable_attn_op.py install
+```
+See [details](./ppdet/modeling/transformers/ext_op/)
+</details>
+
+
+<details>
+<summary>Data preparation</summary>
+
+- Download and extract COCO 2017 train and val images.
+```
+path/to/coco/
+  annotations/  # annotation json files
+  train2017/    # train images
+  val2017/      # val images
+```
+- Modify config [`dataset_dir`](configs/datasets/coco_detection.yml)
+</details>
+
+
+<details>
+<summary>Training & Evaluation & Testing</summary>
+
+- Training on a Single GPU:
+
+```shell
+# training on single-GPU
+export CUDA_VISIBLE_DEVICES=0
+python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --eval
+```
+
+- Training on Multiple GPUs:
+
+```shell
+# training on multi-GPU
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --fleet --eval
+```
+
+- Evaluation:
+
+```shell
+python tools/eval.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams
+```
+
+- Inference:
+
+```shell
+python tools/infer.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams \
+              --infer_img=./demo/000000570688.jpg
+```
+
+</details>
+
+
+## Finetune
+<details>
+<summary>Details</summary>
+
+1. Prepare the data in COCO format.
+```
+path/to/custom/data/
+    annotations/  # annotation json files
+    train/    # train images
+    val/      # val images
+```
+2. Modify dataset config [`dataset_dir`, `image_dir`, `anno_path`](configs/datasets/coco_detection.yml)
+
+3. Set [`pretrain_weights`](configs/rtdetr/_base_/rtdetr_r50vd.yml) in the model config to the URL of the COCO-pretrained weights listed in the model zoo.
+
+```bash
+# or override it on the command line
+
+fleetrun --gpus=0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml -o pretrain_weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams --eval
+```
+</details>
+
+
+
+## Deploy
+
+<details open>
+<summary>1. Export model </summary>
+
+```shell
+python tools/export_model.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams trt=True \
+              --output_dir=output_inference
+```
+
+</details>
+
+<details>
+<summary>2. Convert to ONNX </summary>
+
+- Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX
+
+```shell
+pip install onnx==1.13.0
+pip install paddle2onnx==1.0.5
+```
+
+- Convert:
+
+```shell
+paddle2onnx --model_dir=./output_inference/rtdetr_r50vd_6x_coco/ \
+            --model_filename model.pdmodel  \
+            --params_filename model.pdiparams \
+            --opset_version 16 \
+            --save_file rtdetr_r50vd_6x_coco.onnx
+```
+</details>
+
+<details>
+<summary>3. Convert to TensorRT </summary>
+
+- TensorRT version >= 8.5.1
+- For inference, refer to [Benchmark](../benchmark)
+
+```shell
+trtexec --onnx=./rtdetr_r50vd_6x_coco.onnx \
+        --workspace=4096 \
+        --shapes=image:1x3x640x640 \
+        --saveEngine=rtdetr_r50vd_6x_coco.trt \
+        --avgRuns=100 \
+        --fp16
+```
+
+-
+</details>
+
+
+## Others
+
+<details>
+<summary>1. Parameters and FLOPs </summary>
+
+1. Find Paddle's [`dynamic_flops.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/hapi/dynamic_flops.py#L28) in your local installation and modify its source as follows:
+
+```python
+# eg. /path/to/anaconda3/lib/python3.8/site-packages/paddle/hapi/dynamic_flops.py
+
+def flops(net, input_size, inputs=None, custom_ops=None, print_detail=False):
+    if isinstance(net, nn.Layer):
+        # If net is a dy2stat model, net.forward is StaticFunction instance,
+        # we set net.forward to original forward function.
+        _, net.forward = unwrap_decorators(net.forward)
+
+        # by lyuwenyu
+        if inputs is None:
+            inputs = paddle.randn(input_size)
+
+        return dynamic_flops(
+            net, inputs=inputs, custom_ops=custom_ops, print_detail=print_detail
+        )
+    elif isinstance(net, paddle.static.Program):
+        return static_flops(net, print_detail=print_detail)
+    else:
+        warnings.warn(
+            "Your model must be an instance of paddle.nn.Layer or paddle.static.Program."
+        )
+        return -1
+```
+
+2. Run the code below
+
+```python
+import paddle
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.core.workspace import create
+
+cfg_path = './configs/rtdetr/rtdetr_r50vd_6x_coco.yml'
+cfg = load_config(cfg_path)
+model = create(cfg.architecture)
+
+blob = {
+    'image': paddle.randn([1, 3, 640, 640]),
+    'im_shape': paddle.to_tensor([[640, 640]]),
+    'scale_factor': paddle.to_tensor([[1., 1.]])
+}
+paddle.flops(model, None, blob, custom_ops=None, print_detail=False)
+
+# Output
+# Total Flops: 68348108800     Total Params: 41514204
+
+```
+
+
+</details>
--- a/rtdetr_paddle/README_cn.md
+++ b/rtdetr_paddle/README_cn.md
@@ -0,0 +1,202 @@
+简体中文 | [English](README.md)
+
+## 模型
+
+| Model | Epoch | backbone  | input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) |  T4 TensorRT FP16(FPS) | Pretrained Model | config |
+|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|
+| RT-DETR-R18 | 6x |  ResNet-18 | 640 | 46.5 | 63.8 | 20 | 60 | 217 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_dec3_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r18vd_6x_coco.yml)
+| RT-DETR-R34 | 6x |  ResNet-34 | 640 | 48.9 | 66.8 | 31 | 92 | 161 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r34vd_dec4_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r34vd_6x_coco.yml)
+| RT-DETR-R50-m | 6x |  ResNet-50 | 640 | 51.3 | 69.6 | 36 | 100 | 145 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_m_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml)
+| RT-DETR-R50 | 6x |  ResNet-50 | 640 | 53.1 | 71.3 | 42 | 136 | 108 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_6x_coco.yml)
+| RT-DETR-R101 | 6x |  ResNet-101 | 640 | 54.3 | 72.7 | 76 | 259 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r101vd_6x_coco.yml)
+| RT-DETR-L | 6x |  HGNetv2 | 640 | 53.0 | 71.6 | 32 | 110 | 114 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_l_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml)
+| RT-DETR-X | 6x |  HGNetv2 | 640 | 54.8 | 73.1 | 67 | 234 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_x_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml)
+
+
+**注意事项:**
+- RT-DETR 使用4个GPU训练。
+- RT-DETR 在COCO train2017上训练，并在val2017上评估。
+
+## 快速开始
+
+<details open>
+<summary>依赖包</summary>
+
+<!-- - PaddlePaddle == 2.4.2 -->
+```bash
+pip install -r requirements.txt
+```
+
+</details>
+
+<details>
+<summary>准备数据</summary>
+
+- 修改[配置文件`dataset_dir`](configs/datasets/coco_detection.yml)
+</details>
+
+
+<details>
+<summary>训练&评估</summary>
+
+- 单卡GPU上训练:
+
+```shell
+# training on single-GPU
+export CUDA_VISIBLE_DEVICES=0
+python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --eval
+```
+
+- 多卡GPU上训练:
+
+```shell
+# training on multi-GPU
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --fleet --eval
+```
+
+- 评估:
+
+```shell
+python tools/eval.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams
+```
+
+- 测试:
+
+```shell
+python tools/infer.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams \
+              --infer_img=./demo/000000570688.jpg
+```
+
+详情请参考[快速开始文档](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/docs/tutorials/GETTING_STARTED.md).
+
+</details>
+
+## 部署
+
+<details open>
+<summary>1. 导出模型 </summary>
+
+```shell
+python tools/export_model.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams trt=True \
+              --output_dir=output_inference
+```
+
+</details>
+
+<details>
+<summary>2. 转换模型至ONNX </summary>
+
+- 安装[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) 和 ONNX
+
+```shell
+pip install onnx==1.13.0
+pip install paddle2onnx==1.0.5
+```
+
+- 转换模型:
+
+```shell
+paddle2onnx --model_dir=./output_inference/rtdetr_r50vd_6x_coco/ \
+            --model_filename model.pdmodel  \
+            --params_filename model.pdiparams \
+            --opset_version 16 \
+            --save_file rtdetr_r50vd_6x_coco.onnx
+```
+</details>
+
+<details>
+<summary>3. 转换成TensorRT </summary>
+
+- 确保TensorRT的版本>=8.5.1
+- TRT推理可以参考[RT-DETR](https://github.com/lyuwenyu/RT-DETR)的部分代码或者其他网络资源
+
+```shell
+trtexec --onnx=./rtdetr_r50vd_6x_coco.onnx \
+        --workspace=4096 \
+        --shapes=image:1x3x640x640 \
+        --saveEngine=rtdetr_r50vd_6x_coco.trt \
+        --avgRuns=100 \
+        --fp16
+```
+
+-
+</details>
+
+
+## 其他
+
+<details>
+<summary>1. 参数量和计算量统计 </summary>
+
+1. 找到[本地安装paddle的flops源代码](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/hapi/dynamic_flops.py#L28), 并修改为
+
+```python
+# anaconda3/lib/python3.8/site-packages/paddle/hapi/dynamic_flops.py
+def flops(net, input_size, inputs=None, custom_ops=None, print_detail=False):
+    if isinstance(net, nn.Layer):
+        # If net is a dy2stat model, net.forward is StaticFunction instance,
+        # we set net.forward to original forward function.
+        _, net.forward = unwrap_decorators(net.forward)
+
+        # by lyuwenyu
+        if inputs is None:
+            inputs = paddle.randn(input_size)
+
+        return dynamic_flops(
+            net, inputs=inputs, custom_ops=custom_ops, print_detail=print_detail
+        )
+    elif isinstance(net, paddle.static.Program):
+        return static_flops(net, print_detail=print_detail)
+    else:
+        warnings.warn(
+            "Your model must be an instance of paddle.nn.Layer or paddle.static.Program."
+        )
+        return -1
+```
+
+2. 使用以下代码片段实现参数量和计算量的统计
+
+```python
+import paddle
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.core.workspace import create
+
+cfg_path = './configs/rtdetr/rtdetr_r50vd_6x_coco.yml'
+cfg = load_config(cfg_path)
+model = create(cfg.architecture)
+
+blob = {
+    'image': paddle.randn([1, 3, 640, 640]),
+    'im_shape': paddle.to_tensor([[640, 640]]),
+    'scale_factor': paddle.to_tensor([[1., 1.]])
+}
+paddle.flops(model, None, blob, custom_ops=None, print_detail=False)
+```
+</details>
+
+
+<details open>
+<summary>2. YOLOs端到端速度测速 </summary>
+
+- 可以参考[RT-DETR](https://github.com/lyuwenyu/RT-DETR) benchmark部分或者其他网络资源
+
+</details>
+
+
+
+## 引用RT-DETR
+如果需要在你的研究中使用RT-DETR，请通过以下方式引用我们的论文：
+```
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
--- a/rtdetr_paddle/configs/datasets/coco_detection.yml
+++ b/rtdetr_paddle/configs/datasets/coco_detection.yml
@@ -0,0 +1,21 @@
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  name: COCODataSet
+  image_dir: train2017
+  anno_path: annotations/instances_train2017.json
+  dataset_dir: dataset/coco
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  name: COCODataSet
+  image_dir: val2017
+  anno_path: annotations/instances_val2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+
+TestDataset:
+  name: ImageFolder
+  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
--- a/rtdetr_paddle/configs/datasets/voc.yml
+++ b/rtdetr_paddle/configs/datasets/voc.yml
@@ -0,0 +1,21 @@
+metric: VOC
+map_type: 11point
+num_classes: 20
+
+TrainDataset:
+  name: VOCDataSet
+  dataset_dir: dataset/voc
+  anno_path: trainval.txt
+  label_list: label_list.txt
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']
+
+EvalDataset:
+  name: VOCDataSet
+  dataset_dir: dataset/voc
+  anno_path: test.txt
+  label_list: label_list.txt
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']
+
+TestDataset:
+  name: ImageFolder
+  anno_path: dataset/voc/label_list.txt
--- a/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml
+++ b/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml
@@ -0,0 +1,19 @@
+epoch: 72
+
+LearningRate:
+  base_lr: 0.0001
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 1.0
+    milestones: [100]
+    use_warmup: true
+  - !LinearWarmup
+    start_factor: 0.001
+    steps: 2000
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
--- a/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml
+++ b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml
@@ -0,0 +1,71 @@
+architecture: DETR
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams
+norm_type: sync_bn
+use_ema: True
+ema_decay: 0.9999
+ema_decay_type: "exponential"
+ema_filter_no_grad: True
+hidden_dim: 256
+use_focal_loss: True
+eval_size: [640, 640] # h, w
+
+
+DETR:
+  backbone: ResNet
+  neck: HybridEncoder
+  transformer: RTDETRTransformer
+  detr_head: DINOHead
+  post_process: DETRPostProcess
+
+ResNet:
+  # index 0 stands for res2
+  depth: 50
+  variant: d
+  norm_type: bn
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  lr_mult_list: [0.1, 0.1, 0.1, 0.1]
+  num_stages: 4
+  freeze_stem_only: True
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 1.0
+
+
+RTDETRTransformer:
+  num_queries: 300
+  position_embed_type: sine
+  feat_strides: [8, 16, 32]
+  num_levels: 3
+  nhead: 8
+  num_decoder_layers: 6
+  dim_feedforward: 1024
+  dropout: 0.0
+  activation: relu
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+  learnt_init_query: False
+
+DINOHead:
+  loss:
+    name: DINOLoss
+    loss_coeff: {class: 1, bbox: 5, giou: 2}
+    aux_loss: True
+    use_vfl: True
+    matcher:
+      name: HungarianMatcher
+      matcher_coeff: {class: 2, bbox: 5, giou: 2}
+
+DETRPostProcess:
+  num_top_queries: 300
--- a/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml
+++ b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml
@@ -0,0 +1,43 @@
+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {prob: 0.8}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {prob: 0.8}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - NormalizeBox: {}
+    - BboxXYXY2XYWH: {}
+    - Permute: {}
+  batch_size: 4
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+
+EvalReader:
+  sample_transforms:
+    - Decode: {}
+    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} # target_size: (h, w)
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+  batch_size: 4
+  shuffle: false
+  drop_last: false
+
+
+TestReader:
+  inputs_def:
+    image_shape: [3, 640, 640]
+  sample_transforms:
+    - Decode: {}
+    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml
@@ -0,0 +1,24 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_hgnetv2_l_6x_coco/model_final
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PPHGNetV2_L_ssld_pretrained.pdparams
+find_unused_parameters: True
+log_iter: 200
+
+
+DETR:
+  backbone: PPHGNetV2
+
+PPHGNetV2:
+  arch: 'L'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+  lr_mult_list: [0., 0.05, 0.05, 0.05, 0.05]
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml
@@ -0,0 +1,40 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_hgnetv2_x_6x_coco/model_final
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PPHGNetV2_X_ssld_pretrained.pdparams
+find_unused_parameters: True
+log_iter: 200
+
+
+
+DETR:
+  backbone: PPHGNetV2
+
+
+PPHGNetV2:
+  arch: 'X'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+  lr_mult_list: [0., 0.01, 0.01, 0.01, 0.01]
+
+
+HybridEncoder:
+  hidden_dim: 384
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 384
+    nhead: 8
+    dim_feedforward: 2048
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 1.0
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
@@ -0,0 +1,37 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r101vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_ssld_pretrained.pdparams
+
+ResNet:
+  # index 0 stands for res2
+  depth: 101
+  variant: d
+  norm_type: bn
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  lr_mult_list: [0.01, 0.01, 0.01, 0.01]
+  num_stages: 4
+  freeze_stem_only: True
+
+HybridEncoder:
+  hidden_dim: 384
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 384
+    nhead: 8
+    dim_feedforward: 2048
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 1.0
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
@@ -0,0 +1,38 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r18_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams
+ResNet:
+  depth: 18
+  variant: d
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: false
+  norm_decay: 0.
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 0.5
+  depth_mult: 1.0
+
+RTDETRTransformer:
+  eval_idx: -1
+  num_decoder_layers: 3
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
@@ -0,0 +1,38 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r34vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams
+ResNet:
+  depth: 34
+  variant: d
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: false
+  norm_decay: 0.
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 0.5
+  depth_mult: 1.0
+
+RTDETRTransformer:
+  eval_idx: -1
+  num_decoder_layers: 4
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
@@ -0,0 +1,11 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r50vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
--- a/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
@@ -0,0 +1,28 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r50vd_m_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 0.5
+  depth_mult: 1.0
+
+RTDETRTransformer:
+  eval_idx: 2 # use the 3rd decoder layer for evaluation
--- a/rtdetr_paddle/configs/runtime.yml
+++ b/rtdetr_paddle/configs/runtime.yml
@@ -0,0 +1,16 @@
+use_gpu: true
+use_xpu: false
+use_mlu: false
+use_npu: false
+log_iter: 20
+save_dir: output
+snapshot_epoch: 1
+print_flops: false
+print_params: false
+
+# Exporting the model
+export:
+  post_process: True  # Whether post-processing is included in the network when export model.
+  nms: True           # Whether NMS is included in the network when export model.
+  benchmark: False    # Used for testing model performance; if set to `True`, post-process and NMS will not be exported.
+  fuse_conv_bn: False
--- a/rtdetr_paddle/dataset/coco/download_coco.py
+++ b/rtdetr_paddle/dataset/coco/download_coco.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os.path as osp
+import logging
+# add python path of PaddleDetection to sys.path
+parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.utils.download import download_dataset
+
+logging.basicConfig(level=logging.INFO)
+
+download_path = osp.split(osp.realpath(sys.argv[0]))[0]
+download_dataset(download_path, 'coco')
--- a/rtdetr_paddle/dataset/voc/create_list.py
+++ b/rtdetr_paddle/dataset/voc/create_list.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os.path as osp
+import logging
+# add python path of PaddleDetection to sys.path
+parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.utils.download import create_voc_list
+
+logging.basicConfig(level=logging.INFO)
+
+voc_path = osp.split(osp.realpath(sys.argv[0]))[0]
+create_voc_list(voc_path)
--- a/rtdetr_paddle/dataset/voc/download_voc.py
+++ b/rtdetr_paddle/dataset/voc/download_voc.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os.path as osp
+import logging
+# add python path of PaddleDetection to sys.path
+parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.utils.download import download_dataset
+
+logging.basicConfig(level=logging.INFO)
+
+download_path = osp.split(osp.realpath(sys.argv[0]))[0]
+download_dataset(download_path, 'voc')
--- a/rtdetr_paddle/dataset/voc/label_list.txt
+++ b/rtdetr_paddle/dataset/voc/label_list.txt
@@ -0,0 +1,20 @@
+aeroplane
+bicycle
+bird
+boat
+bottle
+bus
+car
+cat
+chair
+cow
+diningtable
+dog
+horse
+motorbike
+person
+pottedplant
+sheep
+sofa
+train
+tvmonitor
--- a/rtdetr_paddle/ppdet/init.py
+++ b/rtdetr_paddle/ppdet/init.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import (core, data, engine, modeling, optimizer, metrics, utils)
+
+
+# Expose version metadata when the sibling `version` module can be imported.
+# If it cannot, only a warning is written to stderr and `__version__` /
+# `__git_commit__` stay undefined; the package import itself still succeeds.
+try:
+    from .version import full_version as __version__
+    from .version import commit as __git_commit__
+except ImportError:
+    import sys
+    sys.stderr.write("Warning: import ppdet from source directory " \
+            "without installing, run 'python setup.py install' to " \
+            "install ppdet firstly\n")
--- a/rtdetr_paddle/ppdet/core/init.py
+++ b/rtdetr_paddle/ppdet/core/init.py
@@ -0,0 +1,15 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import config
--- a/rtdetr_paddle/ppdet/core/config/init.py
+++ b/rtdetr_paddle/ppdet/core/config/init.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/rtdetr_paddle/ppdet/core/config/schema.py
+++ b/rtdetr_paddle/ppdet/core/config/schema.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import inspect
+import importlib
+import re
+
+# `docstring_parser` is optional: if importing it fails, `doc_parse` becomes a
+# stub that accepts any arguments and returns None.
+try:
+    from docstring_parser import parse as doc_parse
+except Exception:
+
+    def doc_parse(*args):
+        pass
+
+
+# `typeguard` is optional as well: if importing it fails, `check_type` becomes
+# a no-op stub that never raises.
+try:
+    from typeguard import check_type
+except Exception:
+
+    def check_type(*args):
+        pass
+
+
+__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema']
+
+
+class SchemaValue(object):
+    def __init__(self, name, doc='', type=None):
+        super(SchemaValue, self).__init__()
+        self.name = name
+        self.doc = doc
+        self.type = type
+
+    def set_default(self, value):
+        self.default = value
+
+    def has_default(self):
+        return hasattr(self, 'default')
+
+
+class SchemaDict(dict):
+    def __init__(self, **kwargs):
+        super(SchemaDict, self).__init__()
+        self.schema = {}
+        self.strict = False
+        self.doc = ""
+        self.update(kwargs)
+
+    def __setitem__(self, key, value):
+        # XXX also update regular dict to SchemaDict??
+        if isinstance(value, dict) and key in self and isinstance(self[key],
+                                                                  SchemaDict):
+            self[key].update(value)
+        else:
+            super(SchemaDict, self).__setitem__(key, value)
+
+    def __missing__(self, key):
+        if self.has_default(key):
+            return self.schema[key].default
+        elif key in self.schema:
+            return self.schema[key]
+        else:
+            raise KeyError(key)
+
+    def copy(self):
+        newone = SchemaDict()
+        newone.__dict__.update(self.__dict__)
+        newone.update(self)
+        return newone
+
+    def set_schema(self, key, value):
+        assert isinstance(value, SchemaValue)
+        self.schema[key] = value
+
+    def set_strict(self, strict):
+        self.strict = strict
+
+    def has_default(self, key):
+        return key in self.schema and self.schema[key].has_default()
+
+    def is_default(self, key):
+        if not self.has_default(key):
+            return False
+        if hasattr(self[key], '__dict__'):
+            return True
+        else:
+            return key not in self or self[key] == self.schema[key].default
+
+    def find_default_keys(self):
+        return [
+            k for k in list(self.keys()) + list(self.schema.keys())
+            if self.is_default(k)
+        ]
+
+    def mandatory(self):
+        return any([k for k in self.schema.keys() if not self.has_default(k)])
+
+    def find_missing_keys(self):
+        missing = [
+            k for k in self.schema.keys()
+            if k not in self and not self.has_default(k)
+        ]
+        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
+        return missing + placeholders
+
+    def find_extra_keys(self):
+        return list(set(self.keys()) - set(self.schema.keys()))
+
+    def find_mismatch_keys(self):
+        mismatch_keys = []
+        for arg in self.schema.values():
+            if arg.type is not None:
+                try:
+                    check_type("{}.{}".format(self.name, arg.name),
+                               self[arg.name], arg.type)
+                except Exception:
+                    mismatch_keys.append(arg.name)
+        return mismatch_keys
+
+    def validate(self):
+        missing_keys = self.find_missing_keys()
+        if missing_keys:
+            raise ValueError("Missing param for class<{}>: {}".format(
+                self.name, ", ".join(missing_keys)))
+        extra_keys = self.find_extra_keys()
+        if extra_keys and self.strict:
+            raise ValueError("Extraneous param for class<{}>: {}".format(
+                self.name, ", ".join(extra_keys)))
+        mismatch_keys = self.find_mismatch_keys()
+        if mismatch_keys:
+            raise TypeError("Wrong param type for class<{}>: {}".format(
+                self.name, ", ".join(mismatch_keys)))
+
+
+class SharedConfig(object):
+    """
+    Representation class for `__shared__` annotations, which work as follows:
+
+    - if `key` is set for the module in config file, its value will take
+      precedence
+    - if `key` is not set for the module but present in the config file, its
+      value will be used
+    - otherwise, use the provided `default_value` as fallback
+
+    Args:
+        key: config[key] will be injected
+        default_value: fallback value
+    """
+
+    def __init__(self, key, default_value=None):
+        super(SharedConfig, self).__init__()
+        self.key = key
+        self.default_value = default_value
+
+
+def extract_schema(cls):
+    """
+    Extract schema from a given class
+
+    Args:
+        cls (type): Class from which to extract.
+
+    Returns:
+        schema (SchemaDict): Extracted schema.
+    """
+    ctor = cls.__init__
+    # python 2 compatibility
+    if hasattr(inspect, 'getfullargspec'):
+        argspec = inspect.getfullargspec(ctor)
+        annotations = argspec.annotations
+        has_kwargs = argspec.varkw is not None
+    else:
+        argspec = inspect.getfullargspec(ctor)
+        # python 2 type hinting workaround, see pep-3107
+        # however, since `typeguard` does not support python 2, type checking
+        # is still python 3 only for now
+        annotations = getattr(ctor, '__annotations__', {})
+        has_kwargs = argspec.varkw is not None
+
+    names = [arg for arg in argspec.args if arg != 'self']
+    defaults = argspec.defaults
+    num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0
+    num_required = len(names) - num_defaults
+
+    docs = cls.__doc__
+    if docs is None and getattr(cls, '__category__', None) == 'op':
+        docs = cls.__call__.__doc__
+    try:
+        docstring = doc_parse(docs)
+    except Exception:
+        docstring = None
+
+    if docstring is None:
+        comments = {}
+    else:
+        comments = {}
+        for p in docstring.params:
+            match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name)
+            if match_obj is not None:
+                comments[match_obj.group(1)] = p.description
+
+    schema = SchemaDict()
+    schema.name = cls.__name__
+    schema.doc = ""
+    if docs is not None:
+        start_pos = docs[0] == '\n' and 1 or 0
+        schema.doc = docs[start_pos:].split("\n")[0].strip()
+    # XXX handle paddle's weird doc convention
+    if '**' == schema.doc[:2] and '**' == schema.doc[-2:]:
+        schema.doc = schema.doc[2:-2].strip()
+    schema.category = hasattr(cls, '__category__') and getattr(
+        cls, '__category__') or 'module'
+    schema.strict = not has_kwargs
+    schema.pymodule = importlib.import_module(cls.__module__)
+    schema.inject = getattr(cls, '__inject__', [])
+    schema.shared = getattr(cls, '__shared__', [])
+    for idx, name in enumerate(names):
+        comment = name in comments and comments[name] or name
+        if name in schema.inject:
+            type_ = None
+        else:
+            type_ = name in annotations and annotations[name] or None
+        value_schema = SchemaValue(name, comment, type_)
+        if name in schema.shared:
+            assert idx >= num_required, "shared config must have default value"
+            default = defaults[idx - num_required]
+            value_schema.set_default(SharedConfig(name, default))
+        elif idx >= num_required:
+            default = defaults[idx - num_required]
+            value_schema.set_default(default)
+        schema.set_schema(name, value_schema)
+
+    return schema
--- a/rtdetr_paddle/ppdet/core/config/yaml_helpers.py
+++ b/rtdetr_paddle/ppdet/core/config/yaml_helpers.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+
+import yaml
+from .schema import SharedConfig
+
+__all__ = ['serializable', 'Callable']
+
+
+def represent_dictionary_order(self, dict_data):
+    return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items())
+
+
+def setup_orderdict():
+    from collections import OrderedDict
+    yaml.add_representer(OrderedDict, represent_dictionary_order)
+
+
+def _make_python_constructor(cls):
+    def python_constructor(loader, node):
+        if isinstance(node, yaml.SequenceNode):
+            args = loader.construct_sequence(node, deep=True)
+            return cls(*args)
+        else:
+            kwargs = loader.construct_mapping(node, deep=True)
+            try:
+                return cls(**kwargs)
+            except Exception as ex:
+                print("Error when construct {} instance from yaml config".
+                      format(cls.__name__))
+                raise ex
+
+    return python_constructor
+
+
+def _make_python_representer(cls):
+    # python 2 compatibility
+    if hasattr(inspect, 'getfullargspec'):
+        argspec = inspect.getfullargspec(cls)
+    else:
+        argspec = inspect.getfullargspec(cls.__init__)
+    argnames = [arg for arg in argspec.args if arg != 'self']
+
+    def python_representer(dumper, obj):
+        if argnames:
+            data = {name: getattr(obj, name) for name in argnames}
+        else:
+            data = obj.__dict__
+        if '_id' in data:
+            del data['_id']
+        return dumper.represent_mapping(u'!{}'.format(cls.__name__), data)
+
+    return python_representer
+
+
+def serializable(cls):
+    """
+    Add loader and dumper for given class, which must be
+    "trivially serializable"
+
+    Args:
+        cls: class to be serialized
+
+    Returns: cls
+    """
+    yaml.add_constructor(u'!{}'.format(cls.__name__),
+                         _make_python_constructor(cls))
+    yaml.add_representer(cls, _make_python_representer(cls))
+    return cls
+
+
+# When dumped to YAML, a SharedConfig placeholder is written out as its
+# `default_value`.
+yaml.add_representer(SharedConfig,
+                     lambda d, o: d.represent_data(o.default_value))
+
+
+@serializable
+class Callable(object):
+    """
+    Helper to be used in Yaml for creating arbitrary class objects
+
+    Args:
+        full_type (str): the full module path to target function
+    """
+
+    def __init__(self, full_type, args=[], kwargs={}):
+        super(Callable, self).__init__()
+        self.full_type = full_type
+        self.args = args
+        self.kwargs = kwargs
+
+    def __call__(self):
+        if '.' in self.full_type:
+            idx = self.full_type.rfind('.')
+            module = importlib.import_module(self.full_type[:idx])
+            func_name = self.full_type[idx + 1:]
+        else:
+            try:
+                module = importlib.import_module('builtins')
+            except Exception:
+                module = importlib.import_module('__builtin__')
+            func_name = self.full_type
+
+        func = getattr(module, func_name)
+        return func(*self.args, **self.kwargs)
--- a/rtdetr_paddle/ppdet/core/workspace.py
+++ b/rtdetr_paddle/ppdet/core/workspace.py
@@ -0,0 +1,292 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import importlib
+import os
+import sys
+
+import yaml
+import collections
+
+# Resolve the ABC namespace once: use `collections.abc` when the attribute
+# exists, otherwise fall back to the `collections` module itself.
+try:
+    collectionsAbc = collections.abc
+except AttributeError:
+    collectionsAbc = collections
+
+from .config.schema import SchemaDict, SharedConfig, extract_schema
+from .config.yaml_helpers import serializable
+
+__all__ = [
+    'global_config',
+    'load_config',
+    'merge_config',
+    'get_registered_modules',
+    'create',
+    'register',
+    'serializable',
+    'dump_value',
+]
+
+
+def dump_value(value):
+    """
+    Render `value` as a string.
+
+    Objects with a `__dict__` and dict/tuple/list values are dumped as
+    flow-style YAML, stripped of newlines and of the '...' marker, and
+    wrapped in single quotes. Every other value goes through `str`.
+    """
+    # XXX this is hackish, but collections.abc is not available in python 2
+    if not hasattr(value, '__dict__') and not isinstance(value,
+                                                         (dict, tuple, list)):
+        # primitive types
+        return str(value)
+
+    dumped = yaml.dump(value, default_flow_style=True)
+    flattened = dumped.replace('\n', '').replace('...', '')
+    return "'{}'".format(flattened)
+
+
+class AttrDict(dict):
+    """Dict whose top-level keys can also be read and written as
+    attributes; nested dicts are NOT converted."""
+
+    def __init__(self, **kwargs):
+        super(AttrDict, self).__init__(**kwargs)
+
+    def __getattr__(self, key):
+        # only reached when normal attribute lookup has already failed
+        if key not in self:
+            raise AttributeError("object has no attribute '{}'".format(key))
+        return self[key]
+
+    def __setattr__(self, key, value):
+        # attribute assignment is stored as a dict item
+        self.__setitem__(key, value)
+
+    def copy(self):
+        """Return a shallow copy as a new AttrDict."""
+        duplicate = AttrDict()
+        for name, item in self.items():
+            duplicate[name] = item
+        return duplicate
+
+
+# Module-level configuration store: `register` adds module schemas to it
+# and `merge_config` merges loaded values into it by default.
+global_config = AttrDict()
+
+# Config key listing the base YAML files a config file builds upon.
+BASE_KEY = '_BASE_'
+
+
+# parse and load _BASE_ recursively
+def _load_config_with_base(file_path):
+    """
+    Load a YAML config file and recursively resolve its `_BASE_` files.
+
+    Base files are loaded in the order listed and accumulated with
+    `merge_config`; the file's own values are merged last, so they take
+    priority over the values from the bases.
+
+    Args:
+        file_path (str): Path of the YAML config file.
+
+    Returns: the loaded config; when `_BASE_` is present, the result of
+        merging the file's values over the accumulated base configs
+    """
+    # NOTE(security): yaml.Loader can construct arbitrary Python objects,
+    # so only config files from trusted sources should be loaded here.
+    with open(file_path) as f:
+        file_cfg = yaml.load(f, Loader=yaml.Loader)
+
+    # NOTE: cfgs outside have higher priority than cfgs in _BASE_
+    if BASE_KEY in file_cfg:
+        all_base_cfg = AttrDict()
+        base_ymls = list(file_cfg[BASE_KEY])
+        for base_yml in base_ymls:
+            if base_yml.startswith("~"):
+                base_yml = os.path.expanduser(base_yml)
+            # relative base paths are resolved against the including file
+            if not os.path.isabs(base_yml):
+                base_yml = os.path.join(os.path.dirname(file_path), base_yml)
+
+            # the recursive call opens the file itself; no extra handle needed
+            base_cfg = _load_config_with_base(base_yml)
+            all_base_cfg = merge_config(base_cfg, all_base_cfg)
+
+        del file_cfg[BASE_KEY]
+        return merge_config(file_cfg, all_base_cfg)
+
+    return file_cfg
+
+
+def load_config(file_path):
+    """
+    Load a YAML config file (including its `_BASE_` files) and merge it
+    into the global config.
+
+    The file name without directory and extension is stored under the
+    'filename' key.
+
+    Args:
+        file_path (str): Path of the config file to be loaded.
+
+    Returns: global config
+    """
+    extension = os.path.splitext(file_path)[1]
+    assert extension in ['.yml', '.yaml'], "only support yaml files for now"
+
+    # load config from file and merge into global config
+    cfg = _load_config_with_base(file_path)
+    base_name = os.path.basename(file_path)
+    cfg['filename'] = os.path.splitext(base_name)[0]
+    merge_config(cfg)
+
+    return global_config
+
+
+def dict_merge(dct, merge_dct):
+    """ Recursively merge ``merge_dct`` into ``dct``, in place.
+
+    Unlike :meth:``dict.update()``, a key present in both arguments is
+    merged recursively when the existing value is a dict and the incoming
+    value is a mapping. In every other case the incoming value replaces
+    the existing one.
+
+    Args:
+        dct: dict onto which the merge is executed
+        merge_dct: dct merged into dct
+
+    Returns: dct
+    """
+    for key, incoming in merge_dct.items():
+        mergeable = (key in dct and isinstance(dct[key], dict) and
+                     isinstance(incoming, collectionsAbc.Mapping))
+        if not mergeable:
+            dct[key] = incoming
+            continue
+        dict_merge(dct[key], incoming)
+    return dct
+
+
+def merge_config(config, another_cfg=None):
+    """
+    Merge config into global config or another_cfg.
+
+    Args:
+        config (dict): Config to be merged.
+        another_cfg (dict): Optional merge target. When given, even if it
+            is empty, `config` is merged into it instead of into the
+            global config.
+
+    Returns: the merge target, i.e. `another_cfg` when given, otherwise
+        the global config
+    """
+    global global_config
+    # Test against None rather than truthiness: an empty target dict is
+    # falsy and must not be silently swapped for the global config.
+    dct = global_config if another_cfg is None else another_cfg
+    return dict_merge(dct, config)
+
+
+def get_registered_modules():
+    """Return a dict of the global config entries whose value is a
+    `SchemaDict`."""
+    modules = {}
+    for name, entry in global_config.items():
+        if isinstance(entry, SchemaDict):
+            modules[name] = entry
+    return modules
+
+
+def make_partial(cls):
+    """
+    Turn a class carrying an `__op__` attribute into a wrapper around
+    that op.
+
+    The op is re-imported from its module by name. `__category__` is set
+    to 'op' unless the class already has a truthy one. Unless
+    `__append_doc__` is falsy, `__call__` is installed so that calling an
+    instance forwards the positional arguments to the op and passes the
+    instance attributes as keyword arguments (call-time keyword arguments
+    win), and the docstrings are filled in from the op.
+
+    Args:
+        cls (type): Class whose `__op__` attribute refers to the op.
+
+    Returns: cls
+    """
+    wrapped = cls.__op__
+    op = getattr(importlib.import_module(wrapped.__module__), wrapped.__name__)
+
+    category = getattr(cls, '__category__', None)
+    cls.__category__ = category if category else 'op'
+
+    def partial_apply(self, *args, **kwargs):
+        merged = dict(self.__dict__)
+        merged.update(kwargs)
+        return op(*args, **merged)
+
+    # XXX should default to True?
+    if not getattr(cls, '__append_doc__', True):
+        return cls
+
+    if sys.version_info[0] > 2:
+        cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__)
+        cls.__init__.__doc__ = op.__doc__
+        cls.__call__ = partial_apply
+        cls.__call__.__doc__ = op.__doc__
+    else:
+        # XXX work around for python 2
+        partial_apply.__doc__ = op.__doc__
+        cls.__call__ = partial_apply
+    return cls
+
+
+def register(cls):
+    """
+    Register a module class in the global config under its class name.
+
+    A class with an `__op__` attribute is first passed through
+    `make_partial`; the schema extracted from the class is what gets
+    stored.
+
+    Args:
+        cls (type): Module class to be registered.
+
+    Returns: cls
+    """
+    name = cls.__name__
+    if name in global_config:
+        raise ValueError("Module class already registered: {}".format(name))
+
+    is_op_wrapper = hasattr(cls, '__op__')
+    if is_op_wrapper:
+        cls = make_partial(cls)
+
+    schema = extract_schema(cls)
+    global_config[cls.__name__] = schema
+    return cls
+
+
+def create(cls_or_name, **kwargs):
+    """
+    Create an instance of given module class.
+
+    The constructor arguments start as a copy of the module's entry in
+    the global config and are then adjusted in three steps: `shared`
+    keys are resolved against the global config, `from_config` (when the
+    class defines it) contributes additional arguments, and `inject`
+    keys are replaced by dependency objects where applicable.
+
+    Args:
+        cls_or_name (type or str): Class of which to create instance.
+        **kwargs: Forwarded only to `cls.from_config(config, **kwargs)`.
+
+    Returns: instance of type `cls_or_name`. If the global config entry
+        is not a `SchemaDict` but has a `__dict__`, that entry itself is
+        returned unchanged.
+
+    Raises:
+        ValueError: if the module is not registered, if an injected
+            dependency is missing from the global config, or if an
+            inject value has an unsupported type.
+    """
+    assert type(cls_or_name) in [type, str
+                                 ], "should be a class or name of a class"
+    # a string is used as the name itself, a class contributes its __name__
+    name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__
+    if name in global_config:
+        if isinstance(global_config[name], SchemaDict):
+            pass
+        elif hasattr(global_config[name], "__dict__"):
+            # support instance return directly
+            return global_config[name]
+        else:
+            raise ValueError("The module {} is not registered".format(name))
+    else:
+        raise ValueError("The module {} is not registered".format(name))
+
+    config = global_config[name]
+    cls = getattr(config.pymodule, name)
+    # start from every key/value of the module's config entry
+    cls_kwargs = {}
+    cls_kwargs.update(global_config[name])
+
+    # parse `shared` annotation of registered modules
+    if getattr(config, 'shared', None):
+        for k in config.shared:
+            target_key = config[k]
+            shared_conf = config.schema[k].default
+            assert isinstance(shared_conf, SharedConfig)
+            if target_key is not None and not isinstance(target_key,
+                                                         SharedConfig):
+                continue  # value is given for the module
+            elif shared_conf.key in global_config:
+                # `key` is present in config
+                cls_kwargs[k] = global_config[shared_conf.key]
+            else:
+                # nothing configured anywhere: fall back to the shared default
+                cls_kwargs[k] = shared_conf.default_value
+
+    # parse `inject` annotation of registered modules
+    # (`from_config` results are applied before the injections below)
+    if getattr(cls, 'from_config', None):
+        cls_kwargs.update(cls.from_config(config, **kwargs))
+
+    if getattr(config, 'inject', None):
+        for k in config.inject:
+            target_key = config[k]
+            # optional dependency
+            if target_key is None:
+                continue
+
+            # mapping-style spec: `name` selects the target module and the
+            # remaining keys override values of that target
+            if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):
+                if 'name' not in target_key.keys():
+                    continue
+                inject_name = str(target_key['name'])
+                if inject_name not in global_config:
+                    raise ValueError(
+                        "Missing injection name {} and check it's name in cfg file".
+                        format(k))
+                target = global_config[inject_name]
+                for i, v in target_key.items():
+                    if i == 'name':
+                        continue
+                    # the override is written into the global config entry
+                    target[i] = v
+                if isinstance(target, SchemaDict):
+                    cls_kwargs[k] = create(inject_name)
+            # string spec: the value is the name of a global config entry
+            elif isinstance(target_key, str):
+                if target_key not in global_config:
+                    raise ValueError("Missing injection config:", target_key)
+                target = global_config[target_key]
+                if isinstance(target, SchemaDict):
+                    cls_kwargs[k] = create(target_key)
+                elif hasattr(target, '__dict__'):  # serialized object
+                    cls_kwargs[k] = target
+            else:
+                raise ValueError("Unsupported injection type:", target_key)
+    # prevent modification of global config values of reference types
+    # (e.g., list, dict) from within the created module instances
+    #kwargs = copy.deepcopy(kwargs)
+    return cls(**cls_kwargs)
--- a/rtdetr_paddle/ppdet/data/init.py
+++ b/rtdetr_paddle/ppdet/data/init.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from . import source
+from . import transform
+from . import reader
+
+from .source import *
+from .transform import *
+from .reader import *
--- a/rtdetr_paddle/ppdet/data/reader.py
+++ b/rtdetr_paddle/ppdet/data/reader.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import traceback
+import six
+import sys
+if sys.version_info >= (3, 0):
+    pass
+else:
+    pass
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from copy import deepcopy
+
+from paddle.io import DataLoader, DistributedBatchSampler
+from .utils import default_collate_fn
+
+from ppdet.core.workspace import register
+from . import transform
+from .shm_utils import _get_shared_memory_size_in_M
+
+from ppdet.utils.logger import setup_logger
+# Logger shared by everything in this module, named 'reader'.
+logger = setup_logger('reader')
+
+# PID of the process that imported this module.
+MAIN_PID = os.getpid()
+
+
+class Compose(object):
+    """
+    Chain of per-sample transforms built from config entries.
+
+    Args:
+        transforms (list): list of dicts mapping an op name (looked up
+            in the `transform` module) to the keyword arguments of its
+            constructor
+        num_classes (int): assigned to every op that has a
+            `num_classes` attribute, default 80
+    """
+
+    def __init__(self, transforms, num_classes=80):
+        self.transforms = transforms
+        self.transforms_cls = []
+        for spec in self.transforms:
+            for op_name, op_kwargs in spec.items():
+                op = getattr(transform, op_name)(**op_kwargs)
+                if hasattr(op, 'num_classes'):
+                    op.num_classes = num_classes
+
+                self.transforms_cls.append(op)
+
+    def __call__(self, data):
+        """Run `data` through every op in order and return the result;
+        a failing op is logged and its exception re-raised."""
+        for op in self.transforms_cls:
+            try:
+                data = op(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                message = ("fail to map sample transform [{}] "
+                           "with error: {} and stack:\n{}").format(
+                               op, e, str(stack_info))
+                logger.warning(message)
+                raise e
+
+        return data
+
+
+class BatchCompose(Compose):
+    """
+    Chain of batch-level transforms followed by batch assembly.
+
+    Args:
+        transforms (list): transform specs, handled as in `Compose`
+        num_classes (int): forwarded to `Compose`, default 80
+        collate_batch (bool): if True the samples are combined with
+            `default_collate_fn`; otherwise values are gathered per key
+            and only keys not containing 'gt_', 'is_crowd' or
+            'difficult' are stacked
+    """
+
+    def __init__(self, transforms, num_classes=80, collate_batch=True):
+        super(BatchCompose, self).__init__(transforms, num_classes)
+        self.collate_batch = collate_batch
+
+    def __call__(self, data):
+        """Apply the batch transforms to `data` (a list of sample dicts)
+        and return the assembled batch."""
+        for op in self.transforms_cls:
+            try:
+                data = op(data)
+            except Exception as e:
+                stack_info = traceback.format_exc()
+                message = ("fail to map batch transform [{}] "
+                           "with error: {} and stack:\n{}").format(
+                               op, e, str(stack_info))
+                logger.warning(message)
+                raise e
+
+        # remove keys which is not needed by model
+        for unused_key in ('h', 'w', 'flipped'):
+            for sample in data:
+                if unused_key in sample:
+                    sample.pop(unused_key)
+
+        # batch data, if user-define batch function needed
+        # use user-defined here
+        if self.collate_batch:
+            return default_collate_fn(data)
+
+        unstacked_markers = ('gt_', 'is_crowd', 'difficult')
+        batch_data = {}
+        for key in data[0].keys():
+            values = [sample[key] for sample in data]
+            if not any(marker in key for marker in unstacked_markers):
+                values = np.stack(values, axis=0)
+            batch_data[key] = values
+        return batch_data
+
+
+class BaseDataLoader(object):
+    """
+    Base DataLoader implementation for detection models
+
+    Construction only prepares the transform pipelines; the underlying
+    `paddle.io.DataLoader` is created when the instance is called with
+    a dataset.
+
+    Args:
+        sample_transforms (list): a list of transforms to perform
+                                  on each sample
+        batch_transforms (list): a list of transforms to perform
+                                 on batch
+        batch_size (int): batch size for batch collating, default 1.
+        shuffle (bool): whether to shuffle samples
+        drop_last (bool): whether to drop the last incomplete batch,
+                          default False
+        num_classes (int): class number of dataset, default 80
+        collate_batch (bool): whether to collate batch in dataloader.
+            If set to True, the samples will collate into batch according
+            to the batch size. Otherwise, the ground-truth will not collate,
+            which is used when the number of ground-truth is different in
+            samples.
+        use_shared_memory (bool): whether to request shared memory for
+                data loading. It is switched off on 'win32' and 'darwin'
+                platforms, and also when the detected shared memory
+                size is below 1024M. Default False.
+        **kwargs: stored and later passed to `dataset.set_kwargs`
+    """
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=80,
+                 collate_batch=True,
+                 use_shared_memory=False,
+                 **kwargs):
+        # per-sample pipeline
+        self._sample_transforms = Compose(
+            sample_transforms, num_classes=num_classes)
+
+        # per-batch pipeline, also used as the collate function
+        self._batch_transforms = BatchCompose(batch_transforms, num_classes,
+                                              collate_batch)
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.use_shared_memory = use_shared_memory
+        self.kwargs = kwargs
+
+    def __call__(self,
+                 dataset,
+                 worker_num,
+                 batch_sampler=None,
+                 return_list=False):
+        """Bind `dataset`, build the DataLoader and return `self`.
+
+        Args:
+            dataset: dataset object to load from
+            worker_num (int): passed to the DataLoader as `num_workers`
+            batch_sampler: sampler to use; when None a
+                `DistributedBatchSampler` is created from the stored
+                batch settings
+            return_list (bool): passed through to the DataLoader
+        """
+        self.dataset = dataset
+        dataset.check_or_download_dataset()
+        dataset.parse_dataset()
+        # attach the per-sample pipeline and the extra settings
+        dataset.set_transform(self._sample_transforms)
+        dataset.set_kwargs(**self.kwargs)
+
+        if batch_sampler is not None:
+            self._batch_sampler = batch_sampler
+        else:
+            self._batch_sampler = DistributedBatchSampler(
+                dataset,
+                batch_size=self.batch_size,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+
+        # DataLoader do not start sub-process in Windows and Mac
+        # system, do not need to use shared memory
+        shm_enabled = self.use_shared_memory and \
+                      sys.platform not in ['win32', 'darwin']
+        if shm_enabled:
+            # shared memory smaller than 1G(1024M) is considered too small
+            shm_size = _get_shared_memory_size_in_M()
+            if shm_size is not None and shm_size < 1024.:
+                logger.warning("Shared memory size is less than 1G, "
+                               "disable shared_memory in DataLoader")
+                shm_enabled = False
+
+        self.dataloader = DataLoader(
+            dataset=dataset,
+            batch_sampler=self._batch_sampler,
+            collate_fn=self._batch_transforms,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=shm_enabled)
+        self.loader = iter(self.dataloader)
+
+        return self
+
+    def __len__(self):
+        # number of batches, as reported by the batch sampler
+        return len(self._batch_sampler)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        try:
+            return next(self.loader)
+        except StopIteration:
+            # rewind for the next pass, then let the StopIteration through
+            self.loader = iter(self.dataloader)
+            six.reraise(*sys.exc_info())
+
+    def next(self):
+        # python2 compatibility
+        return self.__next__()
+
+
+@register
+class TrainReader(BaseDataLoader):
+    # Training reader: shuffles and drops the last incomplete batch by
+    # default; `num_classes` is declared as a shared config value.
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=True,
+                 drop_last=True,
+                 num_classes=80,
+                 collate_batch=True,
+                 **kwargs):
+        super(TrainReader, self).__init__(
+            sample_transforms=sample_transforms,
+            batch_transforms=batch_transforms,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            num_classes=num_classes,
+            collate_batch=collate_batch,
+            **kwargs)
+
+
+@register
+class EvalReader(BaseDataLoader):
+    # Evaluation reader: no shuffling and no dropped batches by default;
+    # `num_classes` is declared as a shared config value.
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=80,
+                 **kwargs):
+        super(EvalReader, self).__init__(
+            sample_transforms=sample_transforms,
+            batch_transforms=batch_transforms,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            num_classes=num_classes,
+            **kwargs)
+
+
+@register
+class TestReader(BaseDataLoader):
+    # Inference reader: no shuffling and no dropped batches by default;
+    # `num_classes` is declared as a shared config value.
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=80,
+                 **kwargs):
+        super(TestReader, self).__init__(
+            sample_transforms=sample_transforms,
+            batch_transforms=batch_transforms,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            num_classes=num_classes,
+            **kwargs)
+
--- a/rtdetr_paddle/ppdet/data/shm_utils.py
+++ b/rtdetr_paddle/ppdet/data/shm_utils.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+# Recognised size suffixes, in ascending order; the position in the list
+# is used as the power of 1024 when converting to megabytes.
+SIZE_UNIT = ['K', 'M', 'G', 'T']
+# Shell command whose output is scanned for shared-memory mounts.
+SHM_QUERY_CMD = 'df -h'
+# Substring marking a line of the command output as a shared-memory mount.
+SHM_KEY = 'shm'
+# Mount point preferred when several lines match SHM_KEY.
+SHM_DEFAULT_MOUNT = '/dev/shm'
+
+# [ shared memory size check ]
+# In detection models, image/target data occupies a lot of memory, and
+# will occupy lots of shared memory in multi-process DataLoader, we use
+# following code to get shared memory size and perform a size check to
+# disable shared memory use if shared memory size is not enough.
+# Shared memory getting process as follows:
+# 1. use `df -h` get all mount info
+# 2. pick up spaces whose mount info contains 'shm'
+# 3. if 'shm' space number is only 1, return its size
+# 4. if there are multiple 'shm' spaces, prefer the one mounted at the
+#    default directory '/dev/shm' on Linux-like systems, otherwise return
+#    the biggest space size.
+
+
+def _parse_size_in_M(size_str):
+    """
+    Convert a human-readable size such as '64M', '1.5G' or '512KB' into
+    megabytes.
+
+    Args:
+        size_str (str): number followed by a unit from SIZE_UNIT and an
+            optional trailing 'B'. A value without a unit (for example
+            '0' or '512B') is read as a byte count.
+
+    Returns: the size in megabytes as a float
+
+    Raises:
+        AssertionError: if the unit suffix is not in SIZE_UNIT
+    """
+    # drop the optional trailing 'B' ('64MB' -> '64M', '512B' -> '512')
+    text = size_str[:-1] if size_str.endswith('B') else size_str
+    unit = text[-1:]
+    if unit.isdigit() or unit == '.':
+        # no unit suffix at all: plain bytes, two steps below megabytes
+        return float(text) / (1024**2)
+    assert unit in SIZE_UNIT, \
+            "unknown shm size unit {}".format(unit)
+    # 'M' sits at index 1, so the exponent is relative to megabytes
+    return float(text[:-1]) * \
+            (1024 ** (SIZE_UNIT.index(unit) - 1))
+
+
+def _get_shared_memory_size_in_M():
+    """
+    Determine the shared memory size from the output of SHM_QUERY_CMD.
+
+    Lines containing SHM_KEY are collected and the fourth
+    whitespace-separated column of the chosen line is parsed
+    (presumably the 'Avail' column of `df -h`; confirm against the
+    local `df` output). With several matching lines the one mounted at
+    SHM_DEFAULT_MOUNT is preferred, otherwise the largest value wins.
+
+    Returns: size in megabytes as a float, or None when the command
+        could not be run or no usable line was found
+    """
+    try:
+        # the context manager closes the pipe instead of leaking it
+        with os.popen(SHM_QUERY_CMD) as pipe:
+            df_infos = pipe.readlines()
+    except Exception:
+        return None
+
+    shm_infos = []
+    for df_info in df_infos:
+        info = df_info.strip()
+        if info.find(SHM_KEY) < 0:
+            continue
+        fields = info.split()
+        # skip truncated or wrapped lines that lack the size column
+        if len(fields) > 3:
+            shm_infos.append(fields)
+
+    if not shm_infos:
+        return None
+    if len(shm_infos) == 1:
+        return _parse_size_in_M(shm_infos[0][3])
+
+    default_mount_infos = [
+        si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT
+    ]
+    if default_mount_infos:
+        return _parse_size_in_M(default_mount_infos[0][3])
+    return max(_parse_size_in_M(si[3]) for si in shm_infos)
--- a/rtdetr_paddle/ppdet/data/source/init.py
+++ b/rtdetr_paddle/ppdet/data/source/init.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .coco import *
+from .voc import *
+from .category import *
+from .dataset import ImageFolder
--- a/rtdetr_paddle/ppdet/data/source/category.py
+++ b/rtdetr_paddle/ppdet/data/source/category.py
@@ -0,0 +1,926 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from ppdet.data.source.voc import pascalvoc_label
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['get_categories']
+
+
+def _load_txt_categories(anno_file):
+    """
+    Read category names from a text file holding one name per line.
+
+    A leading 'background' entry is dropped. An empty file yields two
+    empty maps.
+
+    Args:
+        anno_file (str): path of the category list file
+
+    Returns: (clsid2catid, catid2name), both keyed by 0-based class id
+    """
+    with open(anno_file) as f:
+        cats = [line.strip() for line in f]
+
+    # check for emptiness before peeking at the first entry
+    if cats and cats[0] == 'background':
+        cats = cats[1:]
+
+    clsid2catid = {i: i for i in range(len(cats))}
+    catid2name = {i: name for i, name in enumerate(cats)}
+    return clsid2catid, catid2name
+
+
+def get_categories(metric_type, anno_file=None, arch=None):
+    """
+    Get class id to category id map and category id
+    to category name map from annotation file.
+
+    When `anno_file` is missing or is not an existing file, a warning is
+    logged and the built-in default categories for the metric type are
+    returned instead.
+
+    Args:
+        metric_type (str): metric type, compared case-insensitively.
+            Handled values are 'coco', 'rbox', 'snipercoco', 'voc',
+            'oid', 'keypointtopdowncocoeval', 'keypointtopdownmpiieval',
+            'pose3deval', 'mot', 'motdet', 'reid', 'kitti',
+            'bdd100kmot' and 'mcmot'.
+        anno_file (str): annotation file path
+        arch (str): when equal to 'keypoint_arch' the keypoint
+            placeholder is returned without looking at the other
+            arguments
+
+    Returns: tuple (clsid2catid, catid2name)
+
+    Raises:
+        ValueError: if the metric type is unknown, or if a COCO-style
+            annotation file is neither json nor txt
+    """
+    if arch == 'keypoint_arch':
+        return (None, {'id': 'keypoint'})
+
+    # falsy paths are checked first because os.path.isfile rejects None
+    has_anno = bool(anno_file) and os.path.isfile(anno_file)
+    if not has_anno:
+        logger.warning(
+            "anno_file '{}' is None or not set or not exist, "
+            "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, "
+            "otherwise the default categories will be used by metric_type.".
+            format(anno_file))
+
+    metric = metric_type.lower()
+
+    if metric in ('coco', 'rbox', 'snipercoco'):
+        if has_anno:
+            if anno_file.endswith('json'):
+                # lazy import pycocotools here
+                from pycocotools.coco import COCO
+                coco = COCO(anno_file)
+                cats = coco.loadCats(coco.getCatIds())
+
+                clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
+                catid2name = {cat['id']: cat['name'] for cat in cats}
+                return clsid2catid, catid2name
+
+            if anno_file.endswith('txt'):
+                return _load_txt_categories(anno_file)
+
+            raise ValueError("anno_file {} should be json or txt.".format(
+                anno_file))
+
+        # anno file not exist, load default categories
+        if metric == 'rbox':
+            logger.warning(
+                "metric_type: {}, load default categories of DOTA.".format(
+                    metric_type))
+            return _dota_category()
+        logger.warning("metric_type: {}, load default categories of COCO.".
+                       format(metric_type))
+        return _coco17_category()
+
+    if metric == 'voc':
+        if has_anno:
+            return _load_txt_categories(anno_file)
+
+        # anno file not exist, load default categories of
+        # VOC all 20 categories
+        logger.warning("metric_type: {}, load default categories of VOC.".
+                       format(metric_type))
+        return _vocall_category()
+
+    if metric == 'oid':
+        if has_anno:
+            logger.warning("only default categories support for OID19")
+        return _oid19_category()
+
+    if metric in ('keypointtopdowncocoeval', 'keypointtopdownmpiieval'):
+        return (None, {'id': 'keypoint'})
+
+    if metric == 'pose3deval':
+        return (None, {'id': 'pose3d'})
+
+    if metric in ('mot', 'motdet', 'reid'):
+        if has_anno:
+            return _load_txt_categories(anno_file)
+
+        # anno file not exist, load default category 'pedestrian'.
+        logger.warning(
+            "metric_type: {}, load default categories of pedestrian MOT.".
+            format(metric_type))
+        return _mot_category(category='pedestrian')
+
+    if metric in ('kitti', 'bdd100kmot'):
+        return _mot_category(category='vehicle')
+
+    if metric == 'mcmot':
+        if has_anno:
+            return _load_txt_categories(anno_file)
+
+        # anno file not exist, load default categories of visdrone all 10 categories
+        logger.warning(
+            "metric_type: {}, load default categories of VisDrone.".format(
+                metric_type))
+        return _visdrone_category()
+
+    raise ValueError("unknown metric type {}".format(metric_type))
+
+
+def _mot_category(category='pedestrian'):
+    """
+    Build the class id to category id map and the category id to
+    category name map for a MOT dataset with the single class
+    `category`, which gets id 0.
+    """
+    ordered = sorted({category: 0}.items(), key=lambda pair: pair[1])
+    names = [name for name, _ in ordered]
+
+    class_ids = range(len(names))
+    clsid2catid = dict(zip(class_ids, class_ids))
+    catid2name = dict(enumerate(names))
+
+    return clsid2catid, catid2name
+
+
+def _coco17_category():
+    """
+    Build the COCO2017 lookup tables.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). clsid2catid maps the contiguous
+            class ids 0..79 to the sparse COCO category ids, and catid2name
+            maps each of those category ids to its name. The background
+            entry (category id 0) is not part of either map.
+    """
+    # (category id, name) listed in class-id order; the category ids are
+    # not contiguous (e.g. 12, 26, 29, 30 are absent).
+    id_name_pairs = [
+        (1, 'person'),
+        (2, 'bicycle'),
+        (3, 'car'),
+        (4, 'motorcycle'),
+        (5, 'airplane'),
+        (6, 'bus'),
+        (7, 'train'),
+        (8, 'truck'),
+        (9, 'boat'),
+        (10, 'traffic light'),
+        (11, 'fire hydrant'),
+        (13, 'stop sign'),
+        (14, 'parking meter'),
+        (15, 'bench'),
+        (16, 'bird'),
+        (17, 'cat'),
+        (18, 'dog'),
+        (19, 'horse'),
+        (20, 'sheep'),
+        (21, 'cow'),
+        (22, 'elephant'),
+        (23, 'bear'),
+        (24, 'zebra'),
+        (25, 'giraffe'),
+        (27, 'backpack'),
+        (28, 'umbrella'),
+        (31, 'handbag'),
+        (32, 'tie'),
+        (33, 'suitcase'),
+        (34, 'frisbee'),
+        (35, 'skis'),
+        (36, 'snowboard'),
+        (37, 'sports ball'),
+        (38, 'kite'),
+        (39, 'baseball bat'),
+        (40, 'baseball glove'),
+        (41, 'skateboard'),
+        (42, 'surfboard'),
+        (43, 'tennis racket'),
+        (44, 'bottle'),
+        (46, 'wine glass'),
+        (47, 'cup'),
+        (48, 'fork'),
+        (49, 'knife'),
+        (50, 'spoon'),
+        (51, 'bowl'),
+        (52, 'banana'),
+        (53, 'apple'),
+        (54, 'sandwich'),
+        (55, 'orange'),
+        (56, 'broccoli'),
+        (57, 'carrot'),
+        (58, 'hot dog'),
+        (59, 'pizza'),
+        (60, 'donut'),
+        (61, 'cake'),
+        (62, 'chair'),
+        (63, 'couch'),
+        (64, 'potted plant'),
+        (65, 'bed'),
+        (67, 'dining table'),
+        (70, 'toilet'),
+        (72, 'tv'),
+        (73, 'laptop'),
+        (74, 'mouse'),
+        (75, 'remote'),
+        (76, 'keyboard'),
+        (77, 'cell phone'),
+        (78, 'microwave'),
+        (79, 'oven'),
+        (80, 'toaster'),
+        (81, 'sink'),
+        (82, 'refrigerator'),
+        (84, 'book'),
+        (85, 'clock'),
+        (86, 'vase'),
+        (87, 'scissors'),
+        (88, 'teddy bear'),
+        (89, 'hair drier'),
+        (90, 'toothbrush'),
+    ]
+
+    catid2name = dict(id_name_pairs)
+    # The position of a pair in the table is its 0-based class id.
+    clsid2catid = {
+        cls_id: cat_id
+        for cls_id, (cat_id, _) in enumerate(id_name_pairs)
+    }
+
+    return clsid2catid, catid2name
+
+
+def _dota_category():
+    """
+    Build the DOTA lookup tables.
+
+    Returns:
+        tuple: (clsid2catid, catid2name). clsid2catid maps class ids
+            0..14 to category ids 1..15 and catid2name maps category ids
+            1..15 to names. The background entry (category id 0) is not
+            part of the returned name map.
+    """
+    names = (
+        'plane',
+        'baseball-diamond',
+        'bridge',
+        'ground-track-field',
+        'small-vehicle',
+        'large-vehicle',
+        'ship',
+        'tennis-court',
+        'basketball-court',
+        'storage-tank',
+        'soccer-ball-field',
+        'roundabout',
+        'harbor',
+        'swimming-pool',
+        'helicopter', )
+
+    # Category ids are 1-based; class ids are the same sequence shifted to 0.
+    catid2name = {cat_id: name for cat_id, name in enumerate(names, 1)}
+    clsid2catid = {cls_id: cls_id + 1 for cls_id in range(len(names))}
+    return clsid2catid, catid2name
+
+
+def _vocall_category():
+    """
+    Build the lookup tables for the mixup VOC dataset.
+
+    The label table comes from ``pascalvoc_label()``; names are ordered by
+    their label value and then numbered from 0.
+
+    Returns:
+        tuple: (clsid2catid, catid2name), where class ids and category
+            ids are identical.
+    """
+    name_to_id = pascalvoc_label()
+    ordered = sorted(name_to_id.items(), key=lambda item: item[1])
+
+    catid2name = {idx: pair[0] for idx, pair in enumerate(ordered)}
+    clsid2catid = {idx: idx for idx in catid2name}
+
+    return clsid2catid, catid2name
+
+
+def _oid19_category():
+    """
+    Get class id to category id map and category id
+    to category name map for the 500-category table below
+    (presumably Open Images 2019, judging by the function name).
+
+    Returns:
+        tuple: (clsid2catid, catid2name). clsid2catid maps class ids
+            0..499 to category ids 1..500; catid2name maps category ids
+            0..500 to names, where id 0 is "background".
+    """
+    # Class ids are 0-based while category ids start at 1, so no class id
+    # maps to the "background" entry that catid2name keeps at id 0.
+    clsid2catid = {k: k + 1 for k in range(500)}
+
+    catid2name = {
+        0: "background",
+        1: "Infant bed",
+        2: "Rose",
+        3: "Flag",
+        4: "Flashlight",
+        5: "Sea turtle",
+        6: "Camera",
+        7: "Animal",
+        8: "Glove",
+        9: "Crocodile",
+        10: "Cattle",
+        11: "House",
+        12: "Guacamole",
+        13: "Penguin",
+        14: "Vehicle registration plate",
+        15: "Bench",
+        16: "Ladybug",
+        17: "Human nose",
+        18: "Watermelon",
+        19: "Flute",
+        20: "Butterfly",
+        21: "Washing machine",
+        22: "Raccoon",
+        23: "Segway",
+        24: "Taco",
+        25: "Jellyfish",
+        26: "Cake",
+        27: "Pen",
+        28: "Cannon",
+        29: "Bread",
+        30: "Tree",
+        31: "Shellfish",
+        32: "Bed",
+        33: "Hamster",
+        34: "Hat",
+        35: "Toaster",
+        36: "Sombrero",
+        37: "Tiara",
+        38: "Bowl",
+        39: "Dragonfly",
+        40: "Moths and butterflies",
+        41: "Antelope",
+        42: "Vegetable",
+        43: "Torch",
+        44: "Building",
+        45: "Power plugs and sockets",
+        46: "Blender",
+        47: "Billiard table",
+        48: "Cutting board",
+        49: "Bronze sculpture",
+        50: "Turtle",
+        51: "Broccoli",
+        52: "Tiger",
+        53: "Mirror",
+        54: "Bear",
+        55: "Zucchini",
+        56: "Dress",
+        57: "Volleyball",
+        58: "Guitar",
+        59: "Reptile",
+        60: "Golf cart",
+        61: "Tart",
+        62: "Fedora",
+        63: "Carnivore",
+        64: "Car",
+        65: "Lighthouse",
+        66: "Coffeemaker",
+        67: "Food processor",
+        68: "Truck",
+        69: "Bookcase",
+        70: "Surfboard",
+        71: "Footwear",
+        72: "Bench",
+        73: "Necklace",
+        74: "Flower",
+        75: "Radish",
+        76: "Marine mammal",
+        77: "Frying pan",
+        78: "Tap",
+        79: "Peach",
+        80: "Knife",
+        81: "Handbag",
+        82: "Laptop",
+        83: "Tent",
+        84: "Ambulance",
+        85: "Christmas tree",
+        86: "Eagle",
+        87: "Limousine",
+        88: "Kitchen & dining room table",
+        89: "Polar bear",
+        90: "Tower",
+        91: "Football",
+        92: "Willow",
+        93: "Human head",
+        94: "Stop sign",
+        95: "Banana",
+        96: "Mixer",
+        97: "Binoculars",
+        98: "Dessert",
+        99: "Bee",
+        100: "Chair",
+        101: "Wood-burning stove",
+        102: "Flowerpot",
+        103: "Beaker",
+        104: "Oyster",
+        105: "Woodpecker",
+        106: "Harp",
+        107: "Bathtub",
+        108: "Wall clock",
+        109: "Sports uniform",
+        110: "Rhinoceros",
+        111: "Beehive",
+        112: "Cupboard",
+        113: "Chicken",
+        114: "Man",
+        115: "Blue jay",
+        116: "Cucumber",
+        117: "Balloon",
+        118: "Kite",
+        119: "Fireplace",
+        120: "Lantern",
+        121: "Missile",
+        122: "Book",
+        123: "Spoon",
+        124: "Grapefruit",
+        125: "Squirrel",
+        126: "Orange",
+        127: "Coat",
+        128: "Punching bag",
+        129: "Zebra",
+        130: "Billboard",
+        131: "Bicycle",
+        132: "Door handle",
+        133: "Mechanical fan",
+        134: "Ring binder",
+        135: "Table",
+        136: "Parrot",
+        137: "Sock",
+        138: "Vase",
+        139: "Weapon",
+        140: "Shotgun",
+        141: "Glasses",
+        142: "Seahorse",
+        143: "Belt",
+        144: "Watercraft",
+        145: "Window",
+        146: "Giraffe",
+        147: "Lion",
+        148: "Tire",
+        149: "Vehicle",
+        150: "Canoe",
+        151: "Tie",
+        152: "Shelf",
+        153: "Picture frame",
+        154: "Printer",
+        155: "Human leg",
+        156: "Boat",
+        157: "Slow cooker",
+        158: "Croissant",
+        159: "Candle",
+        160: "Pancake",
+        161: "Pillow",
+        162: "Coin",
+        163: "Stretcher",
+        164: "Sandal",
+        165: "Woman",
+        166: "Stairs",
+        167: "Harpsichord",
+        168: "Stool",
+        169: "Bus",
+        170: "Suitcase",
+        171: "Human mouth",
+        172: "Juice",
+        173: "Skull",
+        174: "Door",
+        175: "Violin",
+        176: "Chopsticks",
+        177: "Digital clock",
+        178: "Sunflower",
+        179: "Leopard",
+        180: "Bell pepper",
+        181: "Harbor seal",
+        182: "Snake",
+        183: "Sewing machine",
+        184: "Goose",
+        185: "Helicopter",
+        186: "Seat belt",
+        187: "Coffee cup",
+        188: "Microwave oven",
+        189: "Hot dog",
+        190: "Countertop",
+        191: "Serving tray",
+        192: "Dog bed",
+        193: "Beer",
+        194: "Sunglasses",
+        195: "Golf ball",
+        196: "Waffle",
+        197: "Palm tree",
+        198: "Trumpet",
+        199: "Ruler",
+        200: "Helmet",
+        201: "Ladder",
+        202: "Office building",
+        203: "Tablet computer",
+        204: "Toilet paper",
+        205: "Pomegranate",
+        206: "Skirt",
+        207: "Gas stove",
+        208: "Cookie",
+        209: "Cart",
+        210: "Raven",
+        211: "Egg",
+        212: "Burrito",
+        213: "Goat",
+        214: "Kitchen knife",
+        215: "Skateboard",
+        216: "Salt and pepper shakers",
+        217: "Lynx",
+        218: "Boot",
+        219: "Platter",
+        220: "Ski",
+        221: "Swimwear",
+        222: "Swimming pool",
+        223: "Drinking straw",
+        224: "Wrench",
+        225: "Drum",
+        226: "Ant",
+        227: "Human ear",
+        228: "Headphones",
+        229: "Fountain",
+        230: "Bird",
+        231: "Jeans",
+        232: "Television",
+        233: "Crab",
+        234: "Microphone",
+        235: "Home appliance",
+        236: "Snowplow",
+        237: "Beetle",
+        238: "Artichoke",
+        239: "Jet ski",
+        240: "Stationary bicycle",
+        241: "Human hair",
+        242: "Brown bear",
+        243: "Starfish",
+        244: "Fork",
+        245: "Lobster",
+        246: "Corded phone",
+        247: "Drink",
+        248: "Saucer",
+        249: "Carrot",
+        250: "Insect",
+        251: "Clock",
+        252: "Castle",
+        253: "Tennis racket",
+        254: "Ceiling fan",
+        255: "Asparagus",
+        256: "Jaguar",
+        257: "Musical instrument",
+        258: "Train",
+        259: "Cat",
+        260: "Rifle",
+        261: "Dumbbell",
+        262: "Mobile phone",
+        263: "Taxi",
+        264: "Shower",
+        265: "Pitcher",
+        266: "Lemon",
+        267: "Invertebrate",
+        268: "Turkey",
+        269: "High heels",
+        270: "Bust",
+        271: "Elephant",
+        272: "Scarf",
+        273: "Barrel",
+        274: "Trombone",
+        275: "Pumpkin",
+        276: "Box",
+        277: "Tomato",
+        278: "Frog",
+        279: "Bidet",
+        280: "Human face",
+        281: "Houseplant",
+        282: "Van",
+        283: "Shark",
+        284: "Ice cream",
+        285: "Swim cap",
+        286: "Falcon",
+        287: "Ostrich",
+        288: "Handgun",
+        289: "Whiteboard",
+        290: "Lizard",
+        291: "Pasta",
+        292: "Snowmobile",
+        293: "Light bulb",
+        294: "Window blind",
+        295: "Muffin",
+        296: "Pretzel",
+        297: "Computer monitor",
+        298: "Horn",
+        299: "Furniture",
+        300: "Sandwich",
+        301: "Fox",
+        302: "Convenience store",
+        303: "Fish",
+        304: "Fruit",
+        305: "Earrings",
+        306: "Curtain",
+        307: "Grape",
+        308: "Sofa bed",
+        309: "Horse",
+        310: "Luggage and bags",
+        311: "Desk",
+        312: "Crutch",
+        313: "Bicycle helmet",
+        314: "Tick",
+        315: "Airplane",
+        316: "Canary",
+        317: "Spatula",
+        318: "Watch",
+        319: "Lily",
+        320: "Kitchen appliance",
+        321: "Filing cabinet",
+        322: "Aircraft",
+        323: "Cake stand",
+        324: "Candy",
+        325: "Sink",
+        326: "Mouse",
+        327: "Wine",
+        328: "Wheelchair",
+        329: "Goldfish",
+        330: "Refrigerator",
+        331: "French fries",
+        332: "Drawer",
+        333: "Treadmill",
+        334: "Picnic basket",
+        335: "Dice",
+        336: "Cabbage",
+        337: "Football helmet",
+        338: "Pig",
+        339: "Person",
+        340: "Shorts",
+        341: "Gondola",
+        342: "Honeycomb",
+        343: "Doughnut",
+        344: "Chest of drawers",
+        345: "Land vehicle",
+        346: "Bat",
+        347: "Monkey",
+        348: "Dagger",
+        349: "Tableware",
+        350: "Human foot",
+        351: "Mug",
+        352: "Alarm clock",
+        353: "Pressure cooker",
+        354: "Human hand",
+        355: "Tortoise",
+        356: "Baseball glove",
+        357: "Sword",
+        358: "Pear",
+        359: "Miniskirt",
+        360: "Traffic sign",
+        361: "Girl",
+        362: "Roller skates",
+        363: "Dinosaur",
+        364: "Porch",
+        365: "Human beard",
+        366: "Submarine sandwich",
+        367: "Screwdriver",
+        368: "Strawberry",
+        369: "Wine glass",
+        370: "Seafood",
+        371: "Racket",
+        372: "Wheel",
+        373: "Sea lion",
+        374: "Toy",
+        375: "Tea",
+        376: "Tennis ball",
+        377: "Waste container",
+        378: "Mule",
+        379: "Cricket ball",
+        380: "Pineapple",
+        381: "Coconut",
+        382: "Doll",
+        383: "Coffee table",
+        384: "Snowman",
+        385: "Lavender",
+        386: "Shrimp",
+        387: "Maple",
+        388: "Cowboy hat",
+        389: "Goggles",
+        390: "Rugby ball",
+        391: "Caterpillar",
+        392: "Poster",
+        393: "Rocket",
+        394: "Organ",
+        395: "Saxophone",
+        396: "Traffic light",
+        397: "Cocktail",
+        398: "Plastic bag",
+        399: "Squash",
+        400: "Mushroom",
+        401: "Hamburger",
+        402: "Light switch",
+        403: "Parachute",
+        404: "Teddy bear",
+        405: "Winter melon",
+        406: "Deer",
+        407: "Musical keyboard",
+        408: "Plumbing fixture",
+        409: "Scoreboard",
+        410: "Baseball bat",
+        411: "Envelope",
+        412: "Adhesive tape",
+        413: "Briefcase",
+        414: "Paddle",
+        415: "Bow and arrow",
+        416: "Telephone",
+        417: "Sheep",
+        418: "Jacket",
+        419: "Boy",
+        420: "Pizza",
+        421: "Otter",
+        422: "Office supplies",
+        423: "Couch",
+        424: "Cello",
+        425: "Bull",
+        426: "Camel",
+        427: "Ball",
+        428: "Duck",
+        429: "Whale",
+        430: "Shirt",
+        431: "Tank",
+        432: "Motorcycle",
+        433: "Accordion",
+        434: "Owl",
+        435: "Porcupine",
+        436: "Sun hat",
+        437: "Nail",
+        438: "Scissors",
+        439: "Swan",
+        440: "Lamp",
+        441: "Crown",
+        442: "Piano",
+        443: "Sculpture",
+        444: "Cheetah",
+        445: "Oboe",
+        446: "Tin can",
+        447: "Mango",
+        448: "Tripod",
+        449: "Oven",
+        450: "Mouse",
+        451: "Barge",
+        452: "Coffee",
+        453: "Snowboard",
+        454: "Common fig",
+        455: "Salad",
+        456: "Marine invertebrates",
+        457: "Umbrella",
+        458: "Kangaroo",
+        459: "Human arm",
+        460: "Measuring cup",
+        461: "Snail",
+        462: "Loveseat",
+        463: "Suit",
+        464: "Teapot",
+        465: "Bottle",
+        466: "Alpaca",
+        467: "Kettle",
+        468: "Trousers",
+        469: "Popcorn",
+        470: "Centipede",
+        471: "Spider",
+        472: "Sparrow",
+        473: "Plate",
+        474: "Bagel",
+        475: "Personal care",
+        476: "Apple",
+        477: "Brassiere",
+        478: "Bathroom cabinet",
+        479: "studio couch",
+        480: "Computer keyboard",
+        481: "Table tennis racket",
+        482: "Sushi",
+        483: "Cabinetry",
+        484: "Street light",
+        485: "Towel",
+        486: "Nightstand",
+        487: "Rabbit",
+        488: "Dolphin",
+        489: "Dog",
+        490: "Jug",
+        491: "Wok",
+        492: "Fire hydrant",
+        493: "Human eye",
+        494: "Skyscraper",
+        495: "Backpack",
+        496: "Potato",
+        497: "Paper towel",
+        498: "Lifejacket",
+        499: "Bicycle wheel",
+        500: "Toilet",
+    }
+
+    return clsid2catid, catid2name
+
+
+def _visdrone_category():
+    """
+    Build the lookup tables for the 10 VisDrone categories.
+
+    Returns:
+        tuple: (clsid2catid, catid2name), where class ids and category
+            ids are both 0..9 and map onto each other unchanged.
+    """
+    names = ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck',
+             'tricycle', 'awning-tricycle', 'bus', 'motor')
+
+    catid2name = dict(enumerate(names))
+    clsid2catid = {cls_id: cls_id for cls_id in range(10)}
+    return clsid2catid, catid2name
--- a/rtdetr_paddle/ppdet/data/source/coco.py
+++ b/rtdetr_paddle/ppdet/data/source/coco.py
@@ -0,0 +1,587 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import copy
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+import numpy as np
+from ppdet.core.workspace import register, serializable
+from .dataset import DetDataset
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet']
+
+
+@register
+@serializable
+class COCODataSet(DetDataset):
+    """
+    Load dataset with COCO format.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): coco annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        load_crowd (bool): whether to load crowded ground-truth.
+            False as default
+        allow_empty (bool): whether to load empty entry. False as default
+        empty_ratio (float): the ratio of empty record number to total
+            record's, if empty_ratio is out of [0. ,1.), do not sample the
+            records and use all the empty entries. 1. as default
+        repeat (int): repeat times for dataset, use in benchmark.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1):
+        super(COCODataSet, self).__init__(
+            dataset_dir,
+            image_dir,
+            anno_path,
+            data_fields,
+            sample_num,
+            repeat=repeat)
+        # Set to True by parse_dataset when only image info should be loaded.
+        self.load_image_only = False
+        self.load_semantic = False
+        self.load_crowd = load_crowd
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+
+    def _sample_empty(self, records, num):
+        """
+        Randomly keep a subset of the empty records.
+
+        Args:
+            records (list): records of images without any valid box.
+            num (int): number of non-empty records.
+
+        Returns:
+            list: all of ``records`` when empty_ratio is out of [0., 1.),
+                otherwise a random sample sized so that empty records make
+                up about ``empty_ratio`` of the total (capped at
+                ``len(records)``).
+        """
+        # if empty_ratio is out of [0. ,1.), do not sample the records
+        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
+            return records
+        import random
+        sample_num = min(
+            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
+        records = random.sample(records, sample_num)
+        return records
+
+    @staticmethod
+    def _has_empty_polygon(inst):
+        """Return True if a non-crowd instance has a segmentation with no points."""
+        seg = inst.get('segmentation')
+        # Crowd instances get a placeholder polygon later, so they are kept.
+        if not seg or inst.get('iscrowd') == 1:
+            return False
+        return np.array(seg, dtype=object).size == 0
+
+    def parse_dataset(self):
+        """
+        Parse the COCO annotation file and store the records in
+        ``self.roidbs``.
+
+        Also sets ``self.catid2clsid`` (category id -> contiguous class id)
+        and ``self.cname2cid`` (category name -> class id). Images whose
+        file is missing or whose annotated size is negative are skipped.
+        Images without a valid box are skipped unless ``allow_empty`` is
+        set, in which case they are collected separately and sampled by
+        ``_sample_empty``.
+
+        Raises:
+            AssertionError: if the annotation path does not end with
+                '.json' or no record could be loaded.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
+        self.cname2cid = dict({
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        })
+
+        if 'annotations' not in coco.dataset:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contains ground truth '
+                           'and load image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                for inst in instances:
+                    # check gt bbox
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst.keys():
+                        continue
+                    else:
+                        if not any(np.array(inst['bbox'])):
+                            continue
+
+                    # COCO boxes are [x, y, w, h]; convert to corner format.
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                # Drop instances with an empty polygon before the gt_*
+                # containers are sized, so that each of them has exactly
+                # one row per kept instance and rows stay aligned.
+                num_dropped = 0
+                if not self.allow_empty:
+                    kept = [
+                        box for box in bboxes
+                        if not self._has_empty_polygon(box)
+                    ]
+                    num_dropped = len(bboxes) - len(kept)
+                    bboxes = kept
+
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+                gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
+
+                # A dropped instance still counts as "this image carries
+                # segmentation", so an image left without any usable
+                # polygon is skipped below.
+                has_segmentation = num_dropped > 0
+                has_track_id = False
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    # check RLE format
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        # Crowd regions get an all-zero placeholder polygon.
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        gt_poly[i] = box['segmentation']
+                        has_segmentation = True
+
+                    if 'track_id' in box:
+                        gt_track_id[i][0] = box['track_id']
+                        has_track_id = True
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_poly': gt_poly,
+                }
+                if has_track_id:
+                    gt_rec.update({'gt_track_id': gt_track_id})
+
+                # Only export the ground-truth fields the caller asked for.
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+
+@register
+@serializable
+class SlicedCOCODataSet(COCODataSet):
+    """
+    COCO-format dataset whose images are cut into overlapping tiles.
+
+    Every image listed in the annotation file is sliced with
+    ``sahi.slicing.slice_image`` and each tile becomes one record; no
+    ground-truth boxes are attached to the tiles.
+
+    Args:
+        dataset_dir, image_dir, anno_path, data_fields, sample_num,
+        load_crowd, allow_empty, empty_ratio, repeat: forwarded unchanged
+            to COCODataSet.
+        sliced_size (list): [height, width] of each tile.
+        overlap_ratio (list): [height ratio, width ratio] of the overlap
+            between neighbouring tiles.
+    """
+
+    def __init__(
+            self,
+            dataset_dir=None,
+            image_dir=None,
+            anno_path=None,
+            data_fields=['image'],
+            sample_num=-1,
+            load_crowd=False,
+            allow_empty=False,
+            empty_ratio=1.,
+            repeat=1,
+            sliced_size=[640, 640],
+            overlap_ratio=[0.25, 0.25], ):
+        super(SlicedCOCODataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num,
+            load_crowd=load_crowd,
+            allow_empty=allow_empty,
+            empty_ratio=empty_ratio,
+            repeat=repeat, )
+        self.overlap_ratio = overlap_ratio
+        self.sliced_size = sliced_size
+
+    def parse_dataset(self):
+        """
+        Slice every image of the annotation file and store one record per
+        tile in ``self.roidbs``.
+
+        Also sets ``self.catid2clsid`` and ``self.cname2cid``. Images whose
+        file is missing or whose annotated size is negative are skipped.
+
+        Raises:
+            AssertionError: if the annotation path does not end with
+                '.json' or no image could be loaded.
+            Exception: re-raised import error when sahi is not installed.
+        """
+        ann_file = os.path.join(self.dataset_dir, self.anno_path)
+        img_root = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert ann_file.endswith('.json'), \
+            'invalid coco annotation file: ' + ann_file
+        from pycocotools.coco import COCO
+        coco = COCO(ann_file)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+
+        self.catid2clsid = {
+            catid: idx
+            for idx, catid in enumerate(coco.getCatIds())
+        }
+        self.cname2cid = {
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        }
+
+        if 'annotations' not in coco.dataset:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contains ground truth '
+                           'and load image information only.'.format(ann_file))
+
+        # sahi is only needed by this class, so it is imported lazily.
+        try:
+            from sahi.slicing import slice_image
+        except Exception as err:
+            logger.error(
+                'sahi not found, plaese install sahi. '
+                'for example: `pip install sahi`, see https://github.com/obss/sahi.'
+            )
+            raise err
+
+        tile_records = []
+        # NOTE(review): nothing is ever appended to this list, so the
+        # allow_empty branch at the end never fires.
+        empty_records = []
+        num_images = 0
+        num_tiles = 0
+        # NOTE(review): this offset is never advanced, so 'im_id' restarts
+        # at 0 for every source image - confirm whether a running offset
+        # was intended.
+        sub_img_ids = 0
+
+        for img_id in img_ids:
+            img_info = coco.loadImgs([img_id])[0]
+            file_name = img_info['file_name']
+            width = float(img_info['width'])
+            height = float(img_info['height'])
+
+            if img_root:
+                im_path = os.path.join(img_root, file_name)
+            else:
+                im_path = file_name
+
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if width < 0 or height < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   width, height, img_id))
+                continue
+
+            sliced = slice_image(
+                image=im_path,
+                slice_height=self.sliced_size[0],
+                slice_width=self.sliced_size[1],
+                overlap_height_ratio=self.overlap_ratio[0],
+                overlap_width_ratio=self.overlap_ratio[1])
+
+            tiles_in_image = len(sliced)
+            last_index = tiles_in_image - 1
+            for tile_idx in range(tiles_in_image):
+                tile = sliced.images[tile_idx]
+                if 'image' in self.data_fields:
+                    record = {
+                        'image': tile,
+                        'im_id': np.array([sub_img_ids + tile_idx]),
+                        'h': tile.shape[0],
+                        'w': tile.shape[1],
+                        'ori_im_id': np.array([img_id]),
+                        'st_pix': np.array(
+                            sliced.starting_pixels[tile_idx],
+                            dtype=np.float32),
+                        # Marks the final tile of the current source image.
+                        'is_last': 1 if tile_idx == last_index else 0,
+                    }
+                else:
+                    record = {}
+                tile_records.append(record)
+
+            num_tiles += tiles_in_image
+            num_images += 1
+            if self.sample_num > 0 and num_images >= self.sample_num:
+                break
+
+        assert num_images > 0, 'not found any coco record in %s' % (ann_file)
+        logger.info('{} samples and slice to {} sub_samples in file {}'.format(
+            num_images, num_tiles, ann_file))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records,
+                                               len(tile_records))
+            tile_records += empty_records
+        self.roidbs = tile_records
+
+
+@register
+@serializable
+class SemiCOCODataSet(COCODataSet):
+    """COCO-format dataset used for the labeled or the unlabeled split of
+    semi-supervised training.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): coco annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        load_crowd (bool): whether to also load annotations marked iscrowd.
+        allow_empty (bool): whether to keep images without any valid box.
+        empty_ratio (float): forwarded to the parent class (presumably
+            consumed by its `_sample_empty`; confirm in COCODataSet).
+        repeat (int): repeat times for dataset.
+        supervised (bool): True for the labeled split; when False the
+            ground truth is not loaded and only image records are built.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1,
+                 supervised=True):
+        super(SemiCOCODataSet, self).__init__(
+            dataset_dir, image_dir, anno_path, data_fields, sample_num,
+            load_crowd, allow_empty, empty_ratio, repeat)
+        self.supervised = supervised
+        # Number of unlabeled records to draw in parse_dataset();
+        # the default -1 means "use all of them".
+        self.length = -1
+
+    def parse_dataset(self):
+        """Parse the COCO annotation file and fill `self.roidbs`.
+
+        Images missing on disk or annotated with a negative width/height
+        are skipped. Ground truth is loaded unless the file has no
+        'annotations' entry or `self.supervised` is falsy. For the
+        unlabeled split, if `self.length > 0` the records are re-drawn at
+        random (with replacement) so that exactly `self.length` remain.
+
+        Raises:
+            AssertionError: if `anno_path` is not a '.json' file or no
+                record could be loaded.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        # COCO category id -> contiguous class index, category name -> index.
+        self.catid2clsid = {catid: i for i, catid in enumerate(cat_ids)}
+        self.cname2cid = {
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        }
+
+        # `not self.supervised` (instead of `== False`) keeps this check
+        # consistent with the `if self.supervised` branch at the end.
+        if 'annotations' not in coco.dataset or not self.supervised:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contains ground truth '
+                           'and load image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                for inst in instances:
+                    # check gt bbox
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst or not any(np.array(inst['bbox'])):
+                        continue
+
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+                # Rows to keep. Boxes with an empty segmentation are only
+                # marked here and dropped after the loop: removing them
+                # while enumerating would skip the following box, and
+                # np.delete() returns a copy rather than editing in place.
+                keep = [True] * num_bbox
+
+                has_segmentation = False
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    # check RLE format
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        # dtype=object: the polygons of one instance may
+                        # have different lengths (a ragged nested list).
+                        seg_size = np.array(
+                            box['segmentation'], dtype=object).size
+                        if seg_size == 0 and not self.allow_empty:
+                            keep[i] = False
+                        else:
+                            gt_poly[i] = box['segmentation']
+                        has_segmentation = True
+
+                if not all(keep):
+                    kept = [i for i, flag in enumerate(keep) if flag]
+                    gt_bbox = gt_bbox[kept]
+                    gt_class = gt_class[kept]
+                    is_crowd = is_crowd[kept]
+                    gt_poly = [gt_poly[i] for i in kept]
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_poly': gt_poly,
+                }
+
+                # Only export the ground-truth fields that were requested.
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+        if self.supervised:
+            logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')
+        else:
+            if self.length > 0:  # unsup length will be decide by sup length
+                all_roidbs = self.roidbs.copy()
+                # Independent draws, so a record may be selected repeatedly.
+                selected_idxs = [
+                    np.random.choice(len(all_roidbs))
+                    for _ in range(self.length)
+                ]
+                self.roidbs = [all_roidbs[i] for i in selected_idxs]
+            logger.info(
+                f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')
+
+    def __getitem__(self, idx):
+        """Return the transformed sample at `idx`.
+
+        While a mixup / cutmix / mosaic stage is active (its epoch setting
+        is 0, or the current epoch is below it) extra randomly chosen
+        records are bundled with the requested one (1, 1 and 4 extra
+        records respectively) before the transform is applied.
+        """
+        n = len(self.roidbs)
+        if self.repeat > 1:
+            idx %= n
+        # data batch
+        roidb = copy.deepcopy(self.roidbs[idx])
+        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
+            roidb = [roidb, ] + [
+                copy.deepcopy(self.roidbs[np.random.randint(n)])
+                for _ in range(4)
+            ]
+        # Stamp every record with the running iteration counter.
+        if isinstance(roidb, Sequence):
+            for r in roidb:
+                r['curr_iter'] = self._curr_iter
+        else:
+            roidb['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(roidb)
--- a/rtdetr_paddle/ppdet/data/source/dataset.py
+++ b/rtdetr_paddle/ppdet/data/source/dataset.py
@@ -0,0 +1,307 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import copy
+import numpy as np
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+from paddle.io import Dataset
+from ppdet.core.workspace import register, serializable
+from ppdet.utils.download import get_dataset_path
+from ppdet.data import source
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@serializable
+class DetDataset(Dataset):
+    """
+    Load detection dataset.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        use_default_label (bool): whether to load default label list.
+        repeat (int): repeat times for dataset, use in benchmark.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 use_default_label=None,
+                 repeat=1,
+                 **kwargs):
+        super(DetDataset, self).__init__()
+        self.dataset_dir = dataset_dir if dataset_dir is not None else ''
+        self.anno_path = anno_path
+        self.image_dir = image_dir if image_dir is not None else ''
+        self.data_fields = data_fields
+        self.sample_num = sample_num
+        self.use_default_label = use_default_label
+        self.repeat = repeat
+        self._epoch = 0
+        self._curr_iter = 0
+        # Same defaults as set_kwargs(): -1 disables the stage. Setting
+        # them here keeps __getitem__ from failing with AttributeError when
+        # set_kwargs() has not been called yet.
+        self.mixup_epoch = -1
+        self.cutmix_epoch = -1
+        self.mosaic_epoch = -1
+        self.pre_img_epoch = -1
+
+    def __len__(self, ):
+        """Number of records multiplied by `repeat`."""
+        return len(self.roidbs) * self.repeat
+
+    def __call__(self, *args, **kwargs):
+        """Return the dataset itself; all arguments are ignored."""
+        return self
+
+    def __getitem__(self, idx):
+        """Return the transformed sample at `idx`.
+
+        A stage (mixup, cutmix, mosaic, previous-image) is active when its
+        epoch setting is 0 or the current epoch is below it; only the first
+        active stage applies. An active stage bundles extra records with
+        the requested one before `self.transform` is called.
+        """
+        n = len(self.roidbs)
+        if self.repeat > 1:
+            idx %= n
+        # data batch
+        roidb = copy.deepcopy(self.roidbs[idx])
+        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
+            roidb = [roidb, ] + [
+                copy.deepcopy(self.roidbs[np.random.randint(n)])
+                for _ in range(4)
+            ]
+        elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:
+            # Add previous image as input, only used in CenterTrack
+            idx_pre_img = idx - 1
+            if idx_pre_img < 0:
+                # The first record has no predecessor; use the next one.
+                idx_pre_img = idx + 1
+            roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]
+        # Stamp every record with the running iteration counter.
+        if isinstance(roidb, Sequence):
+            for r in roidb:
+                r['curr_iter'] = self._curr_iter
+        else:
+            roidb['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(roidb)
+
+    def check_or_download_dataset(self):
+        """Resolve `dataset_dir` through `get_dataset_path`."""
+        self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path,
+                                            self.image_dir)
+
+    def set_kwargs(self, **kwargs):
+        """Set the epoch thresholds read by `__getitem__`; each missing
+        key falls back to -1."""
+        self.mixup_epoch = kwargs.get('mixup_epoch', -1)
+        self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
+        self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
+        self.pre_img_epoch = kwargs.get('pre_img_epoch', -1)
+
+    def set_transform(self, transform):
+        """Set the callable applied to every sample in `__getitem__`."""
+        self.transform = transform
+
+    def set_epoch(self, epoch_id):
+        """Record the current epoch, compared against the epoch thresholds."""
+        self._epoch = epoch_id
+
+    def parse_dataset(self, ):
+        """Build `self.roidbs`; must be implemented by subclasses."""
+        raise NotImplementedError(
+            "Need to implement parse_dataset method of Dataset")
+
+    def get_anno(self):
+        """Return `anno_path` joined to `dataset_dir`, or None when no
+        annotation path is set."""
+        if self.anno_path is None:
+            return
+        return os.path.join(self.dataset_dir, self.anno_path)
+
+
+def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):
+    """Tell whether path `f`, lower-cased, ends with one of `extensions`."""
+    lowered = f.lower()
+    return lowered.endswith(extensions)
+
+
+def _make_dataset(dir):
+    """Recursively collect the image files found under directory `dir`.
+
+    `~` in `dir` is expanded and symbolic links are followed. Directories
+    and file names are visited in sorted order, so the result is
+    deterministic for a given tree.
+
+    Returns:
+        list: paths of the files accepted by `_is_valid_file`.
+
+    Raises:
+        NotADirectoryError: if `dir` is not an existing directory.
+    """
+    dir = os.path.expanduser(dir)
+    if not os.path.isdir(dir):
+        # Raising a plain str is itself a TypeError, which hid this message.
+        raise NotADirectoryError('{} should be a dir'.format(dir))
+    images = []
+    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
+        for fname in sorted(fnames):
+            path = os.path.join(root, fname)
+            if _is_valid_file(path):
+                images.append(path)
+    return images
+
+
+@register
+@serializable
+class ImageFolder(DetDataset):
+    """Dataset built from image files and/or directories, without
+    ground-truth annotations. Records carry only 'im_id' and 'im_file'
+    (or the sliced image data after `set_slice_images`).
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 sample_num=-1,
+                 use_default_label=None,
+                 **kwargs):
+        super(ImageFolder, self).__init__(
+            dataset_dir,
+            image_dir,
+            anno_path,
+            sample_num=sample_num,
+            use_default_label=use_default_label)
+        # image id -> image path, filled by _load_images().
+        self._imid2path = {}
+        self.roidbs = None
+        self.sample_num = sample_num
+
+    def check_or_download_dataset(self):
+        """No-op: nothing is checked or downloaded for an image folder."""
+        return
+
+    def get_anno(self):
+        """Return the annotation path (joined to `dataset_dir` when that
+        is set), or None when no annotation path is set."""
+        if self.anno_path is None:
+            return
+        if self.dataset_dir:
+            return os.path.join(self.dataset_dir, self.anno_path)
+        else:
+            return self.anno_path
+
+    def parse_dataset(self, ):
+        """Load the image records unless they are already present."""
+        if not self.roidbs:
+            self.roidbs = self._load_images()
+
+    def _parse(self):
+        """Expand `self.image_dir` into a flat list of image paths.
+
+        `self.image_dir` may be one path or a sequence of paths. Each
+        entry that is a directory is walked recursively; each entry that
+        is a file with an accepted extension is taken as is; anything
+        else is silently dropped.
+        """
+        image_dir = self.image_dir
+        # A str is itself a Sequence: without the explicit check a single
+        # path would be iterated character by character.
+        if isinstance(image_dir, str) or not isinstance(image_dir, Sequence):
+            image_dir = [image_dir]
+        images = []
+        for im_dir in image_dir:
+            # NOTE: the directory test runs on the path as given, before it
+            # is joined to dataset_dir.
+            if os.path.isdir(im_dir):
+                im_dir = os.path.join(self.dataset_dir, im_dir)
+                images.extend(_make_dataset(im_dir))
+            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
+                images.append(im_dir)
+        return images
+
+    def _load_images(self):
+        """Build one record per image, numbering them from 0, and stop
+        once `sample_num` records exist (when `sample_num > 0`).
+
+        Raises:
+            AssertionError: if an image path is not a file, or no image
+                was found at all.
+        """
+        images = self._parse()
+        ct = 0
+        records = []
+        for image in images:
+            assert image != '' and os.path.isfile(image), \
+                    "Image {} not found".format(image)
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+            rec = {'im_id': np.array([ct]), 'im_file': image}
+            self._imid2path[ct] = image
+            ct += 1
+            records.append(rec)
+        assert len(records) > 0, "No image file found"
+        return records
+
+    def get_imid2path(self):
+        """Return the image id -> image path mapping."""
+        return self._imid2path
+
+    def set_images(self, images):
+        """Replace the image source and reload the records."""
+        self.image_dir = images
+        self.roidbs = self._load_images()
+
+    def set_slice_images(self,
+                         images,
+                         slice_size=[640, 640],
+                         overlap_ratio=[0.25, 0.25]):
+        """Replace the image source and store one record per image slice.
+
+        Args:
+            images: one path or a sequence of paths, as for `set_images`.
+            slice_size (list): [height, width] of a slice.
+            overlap_ratio (list): [height, width] overlap ratio between
+                neighbouring slices.
+
+        Raises:
+            Exception: whatever importing `sahi` raised, re-raised after
+                an error is logged.
+        """
+        self.image_dir = images
+        ori_records = self._load_images()
+        try:
+            import sahi
+            from sahi.slicing import slice_image
+        except Exception:
+            logger.error(
+                'sahi not found, plaese install sahi. '
+                'for example: `pip install sahi`, see https://github.com/obss/sahi.'
+            )
+            raise
+
+        # NOTE(review): sub_img_ids is never advanced, so slice ids restart
+        # at 0 for every source image and only 'ori_im_id' tells the
+        # sources apart - confirm this is what downstream code expects.
+        sub_img_ids = 0
+        ct = 0
+        ct_sub = 0
+        records = []
+        for ori_rec in ori_records:
+            im_path = ori_rec['im_file']
+            slice_image_result = sahi.slicing.slice_image(
+                image=im_path,
+                slice_height=slice_size[0],
+                slice_width=slice_size[1],
+                overlap_height_ratio=overlap_ratio[0],
+                overlap_width_ratio=overlap_ratio[1])
+
+            sub_img_num = len(slice_image_result)
+            for _ind in range(sub_img_num):
+                im = slice_image_result.images[_ind]
+                rec = {
+                    'image': im,
+                    'im_id': np.array([sub_img_ids + _ind]),
+                    'h': im.shape[0],
+                    'w': im.shape[1],
+                    'ori_im_id': np.array([ori_rec['im_id'][0]]),
+                    'st_pix': np.array(
+                        slice_image_result.starting_pixels[_ind],
+                        dtype=np.float32),
+                    # Marks the final slice of each source image.
+                    'is_last': 1 if _ind == sub_img_num - 1 else 0,
+                } if 'image' in self.data_fields else {}
+                records.append(rec)
+            ct_sub += sub_img_num
+            ct += 1
+        logger.info('{} samples and slice to {} sub_samples.'.format(ct,
+                                                                     ct_sub))
+        self.roidbs = records
+
+    def get_label_list(self):
+        # Only VOC dataset needs label list in ImageFolder
+        return self.anno_path
+
+
+@register
+class CommonDataset(object):
+    """Wrapper that builds a dataset from keyword configuration.
+
+    The 'name' entry selects an attribute of `ppdet.data.source`; that
+    attribute is called with the remaining entries. Calling the wrapper
+    returns the dataset that was built.
+    """
+
+    def __init__(self, **dataset_args):
+        super(CommonDataset, self).__init__()
+        # Work on a copy so popping 'name' cannot affect the caller's data.
+        config = copy.deepcopy(dataset_args)
+        dataset_cls = getattr(source, config.pop("name"))
+        self.dataset = dataset_cls(**config)
+
+    def __call__(self):
+        return self.dataset
+
+
+@register
+class TrainDataset(CommonDataset):
+    """Registered CommonDataset subclass; adds no behavior of its own."""
+
+
+@register
+class EvalMOTDataset(CommonDataset):
+    """Registered CommonDataset subclass; adds no behavior of its own."""
+
+
+@register
+class TestMOTDataset(CommonDataset):
+    """Registered CommonDataset subclass; adds no behavior of its own."""
+
+
+@register
+class EvalDataset(CommonDataset):
+    """Registered CommonDataset subclass; adds no behavior of its own."""
+
+
+@register
+class TestDataset(CommonDataset):
+    """Registered CommonDataset subclass; adds no behavior of its own."""
--- a/rtdetr_paddle/ppdet/data/source/voc.py
+++ b/rtdetr_paddle/ppdet/data/source/voc.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+import xml.etree.ElementTree as ET
+
+from ppdet.core.workspace import register, serializable
+
+from .dataset import DetDataset
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register
+@serializable
+class VOCDataSet(DetDataset):
+    """
+    Load dataset with PascalVOC format.
+
+    Notes:
+    Each line of the `anno_path` file must hold an image file path and an
+    xml file path, separated by whitespace and relative to `image_dir`.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): voc annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        label_list (str): file with one category name per line; the line
+            number (from 0) is the class index. When empty, the default
+            Pascal VOC mapping is used.
+        allow_empty (bool): whether to load empty entry. False as default
+        empty_ratio (float): the ratio of empty record number to total 
+            record's, if empty_ratio is out of [0. ,1.), do not sample the 
+            records and use all the empty entries. 1. as default
+        repeat (int): repeat times for dataset, use in benchmark.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 label_list=None,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1):
+        super(VOCDataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num,
+            repeat=repeat)
+        self.label_list = label_list
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+
+    def _sample_empty(self, records, num):
+        """Randomly keep a subset of the empty `records`.
+
+        `num` is the number of non-empty records. At most
+        `num * empty_ratio / (1 - empty_ratio)` empty records are kept.
+        """
+        # if empty_ratio is out of [0. ,1.), do not sample the records
+        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
+            return records
+        import random
+        sample_num = min(
+            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
+        records = random.sample(records, sample_num)
+        return records
+
+    def parse_dataset(self, ):
+        """Read the annotation list and fill `self.roidbs` and
+        `self.cname2cid`.
+
+        Entries whose image or xml file is missing, or whose annotated
+        size is negative, are skipped with a warning, as are lines that
+        do not hold two paths. Boxes are clipped to the image and dropped
+        when they become degenerate.
+
+        Raises:
+            ValueError: if `label_list` is set but the file does not exist.
+            AssertionError: if no record could be loaded.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        # mapping category name to class id
+        # first_class:0, second_class:1, ...
+        records = []
+        empty_records = []
+        ct = 0
+        cname2cid = {}
+        if self.label_list:
+            label_path = os.path.join(self.dataset_dir, self.label_list)
+            if not os.path.exists(label_path):
+                raise ValueError("label_list {} does not exists".format(
+                    label_path))
+            with open(label_path, 'r') as fr:
+                for label_id, line in enumerate(fr):
+                    cname2cid[line.strip()] = label_id
+        else:
+            cname2cid = pascalvoc_label()
+
+        with open(anno_path, 'r') as fr:
+            for line in fr:
+                fields = line.strip().split()[:2]
+                if len(fields) < 2:
+                    # A blank line (e.g. at the end of the file) or a line
+                    # with one path cannot be unpacked; skip it instead of
+                    # failing with a ValueError.
+                    if fields:
+                        logger.warning(
+                            'Illegal annotation line: {}, and it will be '
+                            'ignored'.format(line.strip()))
+                    continue
+                img_file, xml_file = [
+                    os.path.join(image_dir, x) for x in fields
+                ]
+                if not os.path.exists(img_file):
+                    logger.warning(
+                        'Illegal image file: {}, and it will be ignored'.format(
+                            img_file))
+                    continue
+                if not os.path.isfile(xml_file):
+                    logger.warning(
+                        'Illegal xml file: {}, and it will be ignored'.format(
+                            xml_file))
+                    continue
+                tree = ET.parse(xml_file)
+                # Use the <id> element when present, else the running count.
+                id_node = tree.find('id')
+                if id_node is None:
+                    im_id = np.array([ct])
+                else:
+                    im_id = np.array([int(id_node.text)])
+
+                objs = tree.findall('object')
+                size_node = tree.find('size')
+                im_w = float(size_node.find('width').text)
+                im_h = float(size_node.find('height').text)
+                if im_w < 0 or im_h < 0:
+                    logger.warning(
+                        'Illegal width: {} or height: {} in annotation, '
+                        'and {} will be ignored'.format(im_w, im_h, xml_file))
+                    continue
+
+                # Arrays are sized for every object; `i` counts the valid
+                # boxes and the arrays are trimmed to it after the loop.
+                num_bbox, i = len(objs), 0
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_score = np.zeros((num_bbox, 1), dtype=np.float32)
+                difficult = np.zeros((num_bbox, 1), dtype=np.int32)
+                for obj in objs:
+                    cname = obj.find('name').text
+
+                    # user dataset may not contain difficult field
+                    _difficult = obj.find('difficult')
+                    _difficult = int(
+                        _difficult.text) if _difficult is not None else 0
+
+                    bndbox = obj.find('bndbox')
+                    x1 = float(bndbox.find('xmin').text)
+                    y1 = float(bndbox.find('ymin').text)
+                    x2 = float(bndbox.find('xmax').text)
+                    y2 = float(bndbox.find('ymax').text)
+                    # Clip the box to the image.
+                    x1 = max(0, x1)
+                    y1 = max(0, y1)
+                    x2 = min(im_w - 1, x2)
+                    y2 = min(im_h - 1, y2)
+                    if x2 > x1 and y2 > y1:
+                        gt_bbox[i, :] = [x1, y1, x2, y2]
+                        gt_class[i, 0] = cname2cid[cname]
+                        gt_score[i, 0] = 1.
+                        difficult[i, 0] = _difficult
+                        i += 1
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: xml_file: {}'
+                            ', x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                xml_file, x1, y1, x2, y2))
+                gt_bbox = gt_bbox[:i, :]
+                gt_class = gt_class[:i, :]
+                gt_score = gt_score[:i, :]
+                difficult = difficult[:i, :]
+
+                voc_rec = {
+                    'im_file': img_file,
+                    'im_id': im_id,
+                    'h': im_h,
+                    'w': im_w
+                } if 'image' in self.data_fields else {}
+
+                gt_rec = {
+                    'gt_class': gt_class,
+                    'gt_score': gt_score,
+                    'gt_bbox': gt_bbox,
+                    'difficult': difficult
+                }
+                # Only export the ground-truth fields that were requested.
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        voc_rec[k] = v
+
+                # "Empty" means the xml lists no object at all; an image
+                # whose boxes were all rejected still counts as non-empty.
+                if len(objs) == 0:
+                    empty_records.append(voc_rec)
+                else:
+                    records.append(voc_rec)
+
+                ct += 1
+                if self.sample_num > 0 and ct >= self.sample_num:
+                    break
+        assert ct > 0, 'not found any voc record in %s' % (self.anno_path)
+        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs, self.cname2cid = records, cname2cid
+
+    def get_label_list(self):
+        """Return `label_list` joined to `dataset_dir`."""
+        return os.path.join(self.dataset_dir, self.label_list)
+
+
+def pascalvoc_label():
+    """Return a new dict mapping the 20 default Pascal VOC category names
+    to class indices 0..19."""
+    class_names = (
+        'aeroplane',
+        'bicycle',
+        'bird',
+        'boat',
+        'bottle',
+        'bus',
+        'car',
+        'cat',
+        'chair',
+        'cow',
+        'diningtable',
+        'dog',
+        'horse',
+        'motorbike',
+        'person',
+        'pottedplant',
+        'sheep',
+        'sofa',
+        'train',
+        'tvmonitor',
+    )
+    return {cname: cid for cid, cname in enumerate(class_names)}
--- a/rtdetr_paddle/ppdet/data/transform/init.py
+++ b/rtdetr_paddle/ppdet/data/transform/init.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import operators
+from . import batch_operators
+
+
+from .operators import *
+from .batch_operators import *
+
+
+# `registered_ops` is not defined in this module; it has to arrive through
+# one of the star imports above (presumably `.operators` - confirm).
+__all__ = []
+__all__ += registered_ops
+
--- a/rtdetr_paddle/ppdet/data/transform/batch_operators.py
+++ b/rtdetr_paddle/ppdet/data/transform/batch_operators.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import typing
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+import cv2
+import numpy as np
+from .operators import register_op, BaseOperator, Resize
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+# Names exported by `from .batch_operators import *`; each one is a class
+# defined in this module and decorated with @register_op.
+__all__ = [
+    'PadBatch',
+    'BatchRandomResize',
+    'PadGT',
+]
+
+
+@register_op
+class PadBatch(BaseOperator):
+    """
+    Pad a batch of samples so they can be divisible by a stride.
+    The layout of each image should be 'CHW'.
+    Args:
+        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
+            height and width is divisible by `pad_to_stride`.
+    """
+
+    def __init__(self, pad_to_stride=0):
+        super(PadBatch, self).__init__()
+        self.pad_to_stride = pad_to_stride
+
+    def __call__(self, samples, context=None):
+        """
+        Args:
+            samples (list): a batch of sample, each is dict.
+        """
+        coarsest_stride = self.pad_to_stride
+
+        # multi scale input is nested list
+        if isinstance(samples,
+                      typing.Sequence) and len(samples) > 0 and isinstance(
+                          samples[0], typing.Sequence):
+            inner_samples = samples[0]
+        else:
+            inner_samples = samples
+
+        max_shape = np.array(
+            [data['image'].shape for data in inner_samples]).max(axis=0)
+        if coarsest_stride > 0:
+            max_shape[1] = int(
+                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
+            max_shape[2] = int(
+                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
+
+        for data in inner_samples:
+            im = data['image']
+            im_c, im_h, im_w = im.shape[:]
+            padding_im = np.zeros(
+                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
+            padding_im[:, :im_h, :im_w] = im
+            data['image'] = padding_im
+            if 'semantic' in data and data['semantic'] is not None:
+                semantic = data['semantic']
+                padding_sem = np.zeros(
+                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
+                padding_sem[:, :im_h, :im_w] = semantic
+                data['semantic'] = padding_sem
+            if 'gt_segm' in data and data['gt_segm'] is not None:
+                gt_segm = data['gt_segm']
+                padding_segm = np.zeros(
+                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
+                    dtype=np.uint8)
+                padding_segm[:, :im_h, :im_w] = gt_segm
+                data['gt_segm'] = padding_segm
+
+        return samples
+
+
+@register_op
+class BatchRandomResize(BaseOperator):
+    """
+    Resize image to target size randomly. random target_size and interpolation method
+    Args:
+        target_size (int, list, tuple): image target size, if random size is True, must be list or tuple
+        keep_ratio (bool): whether keep_raio or not, default true
+        interp (int): the interpolation method
+        random_size (bool): whether random select target size of image
+        random_interp (bool): whether random select interpolation method
+    """
+
+    def __init__(self,
+                 target_size,
+                 keep_ratio,
+                 interp=cv2.INTER_NEAREST,
+                 random_size=True,
+                 random_interp=False):
+        super(BatchRandomResize, self).__init__()
+        self.keep_ratio = keep_ratio
+        self.interps = [
+            cv2.INTER_NEAREST,
+            cv2.INTER_LINEAR,
+            cv2.INTER_AREA,
+            cv2.INTER_CUBIC,
+            cv2.INTER_LANCZOS4,
+        ]
+        self.interp = interp
+        assert isinstance(target_size, (
+            int, Sequence)), "target_size must be int, list or tuple"
+        if random_size and not isinstance(target_size, list):
+            raise TypeError(
+                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
+                format(type(target_size)))
+        self.target_size = target_size
+        self.random_size = random_size
+        self.random_interp = random_interp
+
+    def __call__(self, samples, context=None):
+        if self.random_size:
+            index = np.random.choice(len(self.target_size))
+            target_size = self.target_size[index]
+        else:
+            target_size = self.target_size
+
+        if self.random_interp:
+            interp = np.random.choice(self.interps)
+        else:
+            interp = self.interp
+
+        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
+        return resizer(samples, context=context)
+
+
+@register_op
+class PadGT(BaseOperator):
+    """
+    Pad 0 to `gt_class`, `gt_bbox`, `gt_score`...
+    The num_max_boxes is the largest for batch.
+    Args:
+        return_gt_mask (bool): If true, return `pad_gt_mask`,
+                                1 means bbox, 0 means no bbox.
+    """
+
+    def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0):
+        super(PadGT, self).__init__()
+        self.return_gt_mask = return_gt_mask
+        self.pad_img = pad_img
+        self.minimum_gtnum = minimum_gtnum
+
+    def _impad(self,
+               img: np.ndarray,
+               *,
+               shape=None,
+               padding=None,
+               pad_val=0,
+               padding_mode='constant') -> np.ndarray:
+        """Pad the given image to a certain shape or pad on all sides with
+        specified padding mode and padding value.
+
+        Args:
+            img (ndarray): Image to be padded.
+            shape (tuple[int]): Expected padding shape (h, w). Default: None.
+            padding (int or tuple[int]): Padding on each border. If a single int is
+                provided this is used to pad all borders. If tuple of length 2 is
+                provided this is the padding on left/right and top/bottom
+                respectively. If a tuple of length 4 is provided this is the
+                padding for the left, top, right and bottom borders respectively.
+                Default: None. Note that `shape` and `padding` can not be both
+                set.
+            pad_val (Number | Sequence[Number]): Values to be filled in padding
+                areas when padding_mode is 'constant'. Default: 0.
+            padding_mode (str): Type of padding. Should be: constant, edge,
+                reflect or symmetric. Default: constant.
+                - constant: pads with a constant value, this value is specified
+                with pad_val.
+                - edge: pads with the last value at the edge of the image.
+                - reflect: pads with reflection of image without repeating the last
+                value on the edge. For example, padding [1, 2, 3, 4] with 2
+                elements on both sides in reflect mode will result in
+                [3, 2, 1, 2, 3, 4, 3, 2].
+                - symmetric: pads with reflection of image repeating the last value
+                on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+                both sides in symmetric mode will result in
+                [2, 1, 1, 2, 3, 4, 4, 3]
+
+        Returns:
+            ndarray: The padded image.
+        """
+
+        assert (shape is not None) ^ (padding is not None)
+        if shape is not None:
+            width = max(shape[1] - img.shape[1], 0)
+            height = max(shape[0] - img.shape[0], 0)
+            padding = (0, 0, int(width), int(height))
+
+        # check pad_val
+        import numbers
+        if isinstance(pad_val, tuple):
+            assert len(pad_val) == img.shape[-1]
+        elif not isinstance(pad_val, numbers.Number):
+            raise TypeError('pad_val must be a int or a tuple. '
+                            f'But received {type(pad_val)}')
+
+        # check padding
+        if isinstance(padding, tuple) and len(padding) in [2, 4]:
+            if len(padding) == 2:
+                padding = (padding[0], padding[1], padding[0], padding[1])
+        elif isinstance(padding, numbers.Number):
+            padding = (padding, padding, padding, padding)
+        else:
+            raise ValueError('Padding must be a int or a 2, or 4 element tuple.'
+                             f'But received {padding}')
+
+        # check padding mode
+        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+        border_type = {
+            'constant': cv2.BORDER_CONSTANT,
+            'edge': cv2.BORDER_REPLICATE,
+            'reflect': cv2.BORDER_REFLECT_101,
+            'symmetric': cv2.BORDER_REFLECT
+        }
+        img = cv2.copyMakeBorder(
+            img,
+            padding[1],
+            padding[3],
+            padding[0],
+            padding[2],
+            border_type[padding_mode],
+            value=pad_val)
+
+        return img
+
+    def checkmaxshape(self, samples):
+        maxh, maxw = 0, 0
+        for sample in samples:
+            h, w = sample['im_shape']
+            if h > maxh:
+                maxh = h
+            if w > maxw:
+                maxw = w
+        return (maxh, maxw)
+
+    def __call__(self, samples, context=None):
+        num_max_boxes = max([len(s['gt_bbox']) for s in samples])
+        num_max_boxes = max(self.minimum_gtnum, num_max_boxes)
+        if self.pad_img:
+            maxshape = self.checkmaxshape(samples)
+        for sample in samples:
+            if self.pad_img:
+                img = sample['image']
+                padimg = self._impad(img, shape=maxshape)
+                sample['image'] = padimg
+            if self.return_gt_mask:
+                sample['pad_gt_mask'] = np.zeros(
+                    (num_max_boxes, 1), dtype=np.float32)
+            if num_max_boxes == 0:
+                continue
+
+            num_gt = len(sample['gt_bbox'])
+            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
+            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
+            if num_gt > 0:
+                pad_gt_class[:num_gt] = sample['gt_class']
+                pad_gt_bbox[:num_gt] = sample['gt_bbox']
+            sample['gt_class'] = pad_gt_class
+            sample['gt_bbox'] = pad_gt_bbox
+            # pad_gt_mask
+            if 'pad_gt_mask' in sample:
+                sample['pad_gt_mask'][:num_gt] = 1
+            # gt_score
+            if 'gt_score' in sample:
+                pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_score[:num_gt] = sample['gt_score']
+                sample['gt_score'] = pad_gt_score
+            if 'is_crowd' in sample:
+                pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32)
+                if num_gt > 0:
+                    pad_is_crowd[:num_gt] = sample['is_crowd']
+                sample['is_crowd'] = pad_is_crowd
+            if 'difficult' in sample:
+                pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32)
+                if num_gt > 0:
+                    pad_diff[:num_gt] = sample['difficult']
+                sample['difficult'] = pad_diff
+            if 'gt_joints' in sample:
+                num_joints = sample['gt_joints'].shape[1]
+                pad_gt_joints = np.zeros(
+                    (num_max_boxes, num_joints, 3), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_joints[:num_gt] = sample['gt_joints']
+                sample['gt_joints'] = pad_gt_joints
+            if 'gt_areas' in sample:
+                pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
+                if num_gt > 0:
+                    pad_gt_areas[:num_gt, 0] = sample['gt_areas']
+                sample['gt_areas'] = pad_gt_areas
+        return samples
+
+
+
--- a/rtdetr_paddle/ppdet/data/transform/op_helper.py
+++ b/rtdetr_paddle/ppdet/data/transform/op_helper.py
@@ -0,0 +1,494 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# this file contains helper methods for BBOX processing
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import random
+import math
+import cv2
+
+
+def meet_emit_constraint(src_bbox, sample_bbox):
+    center_x = (src_bbox[2] + src_bbox[0]) / 2
+    center_y = (src_bbox[3] + src_bbox[1]) / 2
+    if center_x >= sample_bbox[0] and \
+            center_x <= sample_bbox[2] and \
+            center_y >= sample_bbox[1] and \
+            center_y <= sample_bbox[3]:
+        return True
+    return False
+
+
+def clip_bbox(src_bbox):
+    src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0)
+    src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0)
+    src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0)
+    src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0)
+    return src_bbox
+
+
+def bbox_area(src_bbox):
+    if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]:
+        return 0.
+    else:
+        width = src_bbox[2] - src_bbox[0]
+        height = src_bbox[3] - src_bbox[1]
+        return width * height
+
+
+def is_overlap(object_bbox, sample_bbox):
+    if object_bbox[0] >= sample_bbox[2] or \
+       object_bbox[2] <= sample_bbox[0] or \
+       object_bbox[1] >= sample_bbox[3] or \
+       object_bbox[3] <= sample_bbox[1]:
+        return False
+    else:
+        return True
+
+
+def filter_and_process(sample_bbox, bboxes, labels, scores=None,
+                       keypoints=None):
+    new_bboxes = []
+    new_labels = []
+    new_scores = []
+    new_keypoints = []
+    new_kp_ignore = []
+    for i in range(len(bboxes)):
+        new_bbox = [0, 0, 0, 0]
+        obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]]
+        if not meet_emit_constraint(obj_bbox, sample_bbox):
+            continue
+        if not is_overlap(obj_bbox, sample_bbox):
+            continue
+        sample_width = sample_bbox[2] - sample_bbox[0]
+        sample_height = sample_bbox[3] - sample_bbox[1]
+        new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width
+        new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height
+        new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width
+        new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height
+        new_bbox = clip_bbox(new_bbox)
+        if bbox_area(new_bbox) > 0:
+            new_bboxes.append(new_bbox)
+            new_labels.append([labels[i][0]])
+            if scores is not None:
+                new_scores.append([scores[i][0]])
+            if keypoints is not None:
+                sample_keypoint = keypoints[0][i]
+                for j in range(len(sample_keypoint)):
+                    kp_len = sample_height if j % 2 else sample_width
+                    sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0]
+                    sample_keypoint[j] = (
+                        sample_keypoint[j] - sample_coord) / kp_len
+                    sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0)
+                new_keypoints.append(sample_keypoint)
+                new_kp_ignore.append(keypoints[1][i])
+
+    bboxes = np.array(new_bboxes)
+    labels = np.array(new_labels)
+    scores = np.array(new_scores)
+    if keypoints is not None:
+        keypoints = np.array(new_keypoints)
+        new_kp_ignore = np.array(new_kp_ignore)
+        return bboxes, labels, scores, (keypoints, new_kp_ignore)
+    return bboxes, labels, scores
+
+
+def bbox_area_sampling(bboxes, labels, scores, target_size, min_size):
+    new_bboxes = []
+    new_labels = []
+    new_scores = []
+    for i, bbox in enumerate(bboxes):
+        w = float((bbox[2] - bbox[0]) * target_size)
+        h = float((bbox[3] - bbox[1]) * target_size)
+        if w * h < float(min_size * min_size):
+            continue
+        else:
+            new_bboxes.append(bbox)
+            new_labels.append(labels[i])
+            if scores is not None and scores.size != 0:
+                new_scores.append(scores[i])
+    bboxes = np.array(new_bboxes)
+    labels = np.array(new_labels)
+    scores = np.array(new_scores)
+    return bboxes, labels, scores
+
+
+def generate_sample_bbox(sampler):
+    scale = np.random.uniform(sampler[2], sampler[3])
+    aspect_ratio = np.random.uniform(sampler[4], sampler[5])
+    aspect_ratio = max(aspect_ratio, (scale**2.0))
+    aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))
+    bbox_width = scale * (aspect_ratio**0.5)
+    bbox_height = scale / (aspect_ratio**0.5)
+    xmin_bound = 1 - bbox_width
+    ymin_bound = 1 - bbox_height
+    xmin = np.random.uniform(0, xmin_bound)
+    ymin = np.random.uniform(0, ymin_bound)
+    xmax = xmin + bbox_width
+    ymax = ymin + bbox_height
+    sampled_bbox = [xmin, ymin, xmax, ymax]
+    return sampled_bbox
+
+
+def generate_sample_bbox_square(sampler, image_width, image_height):
+    scale = np.random.uniform(sampler[2], sampler[3])
+    aspect_ratio = np.random.uniform(sampler[4], sampler[5])
+    aspect_ratio = max(aspect_ratio, (scale**2.0))
+    aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))
+    bbox_width = scale * (aspect_ratio**0.5)
+    bbox_height = scale / (aspect_ratio**0.5)
+    if image_height < image_width:
+        bbox_width = bbox_height * image_height / image_width
+    else:
+        bbox_height = bbox_width * image_width / image_height
+    xmin_bound = 1 - bbox_width
+    ymin_bound = 1 - bbox_height
+    xmin = np.random.uniform(0, xmin_bound)
+    ymin = np.random.uniform(0, ymin_bound)
+    xmax = xmin + bbox_width
+    ymax = ymin + bbox_height
+    sampled_bbox = [xmin, ymin, xmax, ymax]
+    return sampled_bbox
+
+
+def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
+                         resize_width):
+    """Sample a crop window around one randomly chosen ground-truth box.
+
+    Args:
+        bbox_labels: ground-truth boxes; entries [0..3] of the chosen row are
+            read as normalized xmin, ymin, xmax, ymax and scaled by the image
+            width/height.
+        image_width, image_height: image size used to convert between
+            normalized and pixel coordinates.
+        scale_array: sequence of scales; the box area is compared with the
+            squares of its entries to pick a scale range.
+        resize_width: multiplier applied when turning the chosen scale into
+            the crop side length.
+
+    Returns:
+        [xmin, ymin, xmax, ymax] of the crop in normalized coordinates (not
+        clipped to [0, 1]), or the integer 0 when `bbox_labels` is empty.
+
+    Note:
+        Random numbers come from both `np.random` and the stdlib `random`
+        module, so both must be seeded for reproducible output.
+    """
+    num_gt = len(bbox_labels)
+    # np.random.randint range: [low, high)
+    rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0
+
+    if num_gt != 0:
+        norm_xmin = bbox_labels[rand_idx][0]
+        norm_ymin = bbox_labels[rand_idx][1]
+        norm_xmax = bbox_labels[rand_idx][2]
+        norm_ymax = bbox_labels[rand_idx][3]
+
+        # Chosen box in pixel units.
+        xmin = norm_xmin * image_width
+        ymin = norm_ymin * image_height
+        wid = image_width * (norm_xmax - norm_xmin)
+        hei = image_height * (norm_ymax - norm_ymin)
+        range_size = 0
+
+        # Find the pair of neighbouring scales whose squares bracket the box
+        # area (strict inequalities on both sides).
+        area = wid * hei
+        for scale_ind in range(0, len(scale_array) - 1):
+            if area > scale_array[scale_ind] ** 2 and area < \
+                    scale_array[scale_ind + 1] ** 2:
+                range_size = scale_ind + 1
+                break
+
+        # Boxes larger than the second-to-last scale override the result above.
+        if area > scale_array[len(scale_array) - 2]**2:
+            range_size = len(scale_array) - 2
+
+        scale_choose = 0.0
+        if range_size == 0:
+            rand_idx_size = 0
+        else:
+            # np.random.randint range: [low, high)
+            rng_rand_size = np.random.randint(0, range_size + 1)
+            rand_idx_size = rng_rand_size % (range_size + 1)
+
+        # Draw the target scale from [s / 2, 2 * s]; for the top index the
+        # upper bound is additionally capped by 2 * sqrt(box area).
+        if rand_idx_size == range_size:
+            min_resize_val = scale_array[rand_idx_size] / 2.0
+            max_resize_val = min(2.0 * scale_array[rand_idx_size],
+                                 2 * math.sqrt(wid * hei))
+            scale_choose = random.uniform(min_resize_val, max_resize_val)
+        else:
+            min_resize_val = scale_array[rand_idx_size] / 2.0
+            max_resize_val = 2.0 * scale_array[rand_idx_size]
+            scale_choose = random.uniform(min_resize_val, max_resize_val)
+
+        # Side length (pixels) of the crop; the same value is used for both axes.
+        sample_bbox_size = wid * resize_width / scale_choose
+
+        w_off_orig = 0.0
+        h_off_orig = 0.0
+        if sample_bbox_size < max(image_height, image_width):
+            # Crop smaller than the larger image side: draw each offset
+            # between the box origin and (box far edge - crop size).
+            if wid <= sample_bbox_size:
+                w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size,
+                                               xmin)
+            else:
+                w_off_orig = np.random.uniform(xmin,
+                                               xmin + wid - sample_bbox_size)
+
+            if hei <= sample_bbox_size:
+                h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size,
+                                               ymin)
+            else:
+                h_off_orig = np.random.uniform(ymin,
+                                               ymin + hei - sample_bbox_size)
+
+        else:
+            # Crop at least as large as the larger image side: draw each
+            # offset between (image size - crop size) and 0.
+            w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0)
+            h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0)
+
+        w_off_orig = math.floor(w_off_orig)
+        h_off_orig = math.floor(h_off_orig)
+
+        # Figure out top left coordinates.
+        w_off = float(w_off_orig / image_width)
+        h_off = float(h_off_orig / image_height)
+
+        sampled_bbox = [
+            w_off, h_off, w_off + float(sample_bbox_size / image_width),
+            h_off + float(sample_bbox_size / image_height)
+        ]
+        return sampled_bbox
+    else:
+        # No ground truth to anchor on: callers receive 0 instead of a box.
+        return 0
+
+
+def jaccard_overlap(sample_bbox, object_bbox):
+    if sample_bbox[0] >= object_bbox[2] or \
+        sample_bbox[2] <= object_bbox[0] or \
+        sample_bbox[1] >= object_bbox[3] or \
+        sample_bbox[3] <= object_bbox[1]:
+        return 0
+    intersect_xmin = max(sample_bbox[0], object_bbox[0])
+    intersect_ymin = max(sample_bbox[1], object_bbox[1])
+    intersect_xmax = min(sample_bbox[2], object_bbox[2])
+    intersect_ymax = min(sample_bbox[3], object_bbox[3])
+    intersect_size = (intersect_xmax - intersect_xmin) * (
+        intersect_ymax - intersect_ymin)
+    sample_bbox_size = bbox_area(sample_bbox)
+    object_bbox_size = bbox_area(object_bbox)
+    overlap = intersect_size / (
+        sample_bbox_size + object_bbox_size - intersect_size)
+    return overlap
+
+
+def intersect_bbox(bbox1, bbox2):
+    if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \
+        bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]:
+        intersection_box = [0.0, 0.0, 0.0, 0.0]
+    else:
+        intersection_box = [
+            max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]),
+            min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
+        ]
+    return intersection_box
+
+
+def bbox_coverage(bbox1, bbox2):
+    inter_box = intersect_bbox(bbox1, bbox2)
+    intersect_size = bbox_area(inter_box)
+
+    if intersect_size > 0:
+        bbox1_size = bbox_area(bbox1)
+        return intersect_size / bbox1_size
+    else:
+        return 0.
+
+
+def satisfy_sample_constraint(sampler,
+                              sample_bbox,
+                              gt_bboxes,
+                              satisfy_all=False):
+    if sampler[6] == 0 and sampler[7] == 0:
+        return True
+    satisfied = []
+    for i in range(len(gt_bboxes)):
+        object_bbox = [
+            gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
+        ]
+        overlap = jaccard_overlap(sample_bbox, object_bbox)
+        if sampler[6] != 0 and \
+                overlap < sampler[6]:
+            satisfied.append(False)
+            continue
+        if sampler[7] != 0 and \
+                overlap > sampler[7]:
+            satisfied.append(False)
+            continue
+        satisfied.append(True)
+        if not satisfy_all:
+            return True
+
+    if satisfy_all:
+        return np.all(satisfied)
+    else:
+        return False
+
+
+def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):
+    if sampler[6] == 0 and sampler[7] == 0:
+        has_jaccard_overlap = False
+    else:
+        has_jaccard_overlap = True
+    if sampler[8] == 0 and sampler[9] == 0:
+        has_object_coverage = False
+    else:
+        has_object_coverage = True
+
+    if not has_jaccard_overlap and not has_object_coverage:
+        return True
+    found = False
+    for i in range(len(gt_bboxes)):
+        object_bbox = [
+            gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
+        ]
+        if has_jaccard_overlap:
+            overlap = jaccard_overlap(sample_bbox, object_bbox)
+            if sampler[6] != 0 and \
+                    overlap < sampler[6]:
+                continue
+            if sampler[7] != 0 and \
+                    overlap > sampler[7]:
+                continue
+            found = True
+        if has_object_coverage:
+            object_coverage = bbox_coverage(object_bbox, sample_bbox)
+            if sampler[8] != 0 and \
+                    object_coverage < sampler[8]:
+                continue
+            if sampler[9] != 0 and \
+                    object_coverage > sampler[9]:
+                continue
+            found = True
+        if found:
+            return True
+    return found
+
+
+def crop_image_sampling(img, sample_bbox, image_width, image_height,
+                        target_size):
+    # no clipping here
+    xmin = int(sample_bbox[0] * image_width)
+    xmax = int(sample_bbox[2] * image_width)
+    ymin = int(sample_bbox[1] * image_height)
+    ymax = int(sample_bbox[3] * image_height)
+
+    w_off = xmin
+    h_off = ymin
+    width = xmax - xmin
+    height = ymax - ymin
+    cross_xmin = max(0.0, float(w_off))
+    cross_ymin = max(0.0, float(h_off))
+    cross_xmax = min(float(w_off + width - 1.0), float(image_width))
+    cross_ymax = min(float(h_off + height - 1.0), float(image_height))
+    cross_width = cross_xmax - cross_xmin
+    cross_height = cross_ymax - cross_ymin
+
+    roi_xmin = 0 if w_off >= 0 else abs(w_off)
+    roi_ymin = 0 if h_off >= 0 else abs(h_off)
+    roi_width = cross_width
+    roi_height = cross_height
+
+    roi_y1 = int(roi_ymin)
+    roi_y2 = int(roi_ymin + roi_height)
+    roi_x1 = int(roi_xmin)
+    roi_x2 = int(roi_xmin + roi_width)
+
+    cross_y1 = int(cross_ymin)
+    cross_y2 = int(cross_ymin + cross_height)
+    cross_x1 = int(cross_xmin)
+    cross_x2 = int(cross_xmin + cross_width)
+
+    sample_img = np.zeros((height, width, 3))
+    sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \
+        img[cross_y1: cross_y2, cross_x1: cross_x2]
+
+    sample_img = cv2.resize(
+        sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA)
+
+    return sample_img
+
+
+def is_poly(segm):
+    assert isinstance(segm, (list, dict)), \
+        "Invalid segm type: {}".format(type(segm))
+    return isinstance(segm, list)
+
+
+def gaussian_radius(bbox_size, min_overlap):
+    height, width = bbox_size
+
+    a1 = 1
+    b1 = (height + width)
+    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+    sq1 = np.sqrt(b1**2 - 4 * a1 * c1)
+    radius1 = (b1 + sq1) / (2 * a1)
+
+    a2 = 4
+    b2 = 2 * (height + width)
+    c2 = (1 - min_overlap) * width * height
+    sq2 = np.sqrt(b2**2 - 4 * a2 * c2)
+    radius2 = (b2 + sq2) / 2
+
+    a3 = 4 * min_overlap
+    b3 = -2 * min_overlap * (height + width)
+    c3 = (min_overlap - 1) * width * height
+    sq3 = np.sqrt(b3**2 - 4 * a3 * c3)
+    radius3 = (b3 + sq3) / 2
+    return min(radius1, radius2, radius3)
+
+
+def draw_gaussian(heatmap, center, radius, k=1, delte=6):
+    diameter = 2 * radius + 1
+    sigma = diameter / delte
+    gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)
+
+    x, y = center
+
+    height, width = heatmap.shape[0:2]
+
+    left, right = min(x, radius), min(width - x, radius + 1)
+    top, bottom = min(y, radius), min(height - y, radius + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
+                               radius + right]
+    np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
+
+
+def gaussian2D(shape, sigma_x=1, sigma_y=1):
+    m, n = [(ss - 1.) / 2. for ss in shape]
+    y, x = np.ogrid[-m:m + 1, -n:n + 1]
+
+    h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y *
+                                                            sigma_y)))
+    h[h < np.finfo(h.dtype).eps * h.max()] = 0
+    return h
+
+
+def draw_umich_gaussian(heatmap, center, radius, k=1):
+    """
+    draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126
+    """
+    diameter = 2 * radius + 1
+    gaussian = gaussian2D(
+        (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)
+
+    x, y = int(center[0]), int(center[1])
+
+    height, width = heatmap.shape[0:2]
+
+    left, right = min(x, radius), min(width - x, radius + 1)
+    top, bottom = min(y, radius), min(height - y, radius + 1)
+
+    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
+                               radius + right]
+    if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
+        np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
+    return heatmap
+
+
+def get_border(border, size):
+    i = 1
+    while size - border // i <= border // i:
+        i *= 2
+    return border // i
--- a/rtdetr_paddle/ppdet/data/transform/operators.py
+++ b/rtdetr_paddle/ppdet/data/transform/operators.py
--- a/rtdetr_paddle/ppdet/data/utils.py
+++ b/rtdetr_paddle/ppdet/data/utils.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numbers
+import numpy as np
+
+try:
+    from collections.abc import Sequence, Mapping
+except:
+    from collections import Sequence, Mapping
+
+
+def default_collate_fn(batch):
+    """
+    Recursively merge a list of samples into one batched structure.
+
+    The type of the first sample decides how the whole batch is handled:
+
+    * ``np.ndarray``: samples are stacked along a new leading axis.
+    * number: samples are packed into one ``np.ndarray``.
+    * ``str`` / ``bytes``: the batch is returned unchanged.
+    * mapping: a dict with the keys of the first sample, each value being
+      the collated result of that key over all samples.
+    * other sequence: a list with one collated entry per field position.
+
+    For example
+        [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
+         {'image': np.array(shape=[3, 224, 224]), 'label': 3}]
+    becomes
+        {'image': np.array(shape=[2, 3, 224, 224]), 'label': np.array([1, 3])}
+
+    Args:
+        batch (list of sample data): non-empty list of samples.
+
+    Returns:
+        The batched data, mirroring the nesting of a single sample.
+
+    Raises:
+        RuntimeError: sequence samples do not all have the same length.
+        TypeError: the first sample is of an unsupported type.
+    """
+    first = batch[0]
+
+    if isinstance(first, np.ndarray):
+        return np.stack(batch, axis=0)
+
+    if isinstance(first, numbers.Number):
+        return np.array(batch)
+
+    # Strings are sequences too, so they must be handled before the generic
+    # Sequence branch below.
+    if isinstance(first, (str, bytes)):
+        return batch
+
+    if isinstance(first, Mapping):
+        collated = {}
+        for key in first:
+            collated[key] = default_collate_fn([item[key] for item in batch])
+        return collated
+
+    if isinstance(first, Sequence):
+        expected_fields = len(first)
+        if any(len(item) != expected_fields for item in batch):
+            raise RuntimeError(
+                "fileds number not same among samples in a batch")
+        # Transpose: one tuple per field position, each collated on its own.
+        return [default_collate_fn(column) for column in zip(*batch)]
+
+    raise TypeError("batch data con only contains: tensor, numpy.ndarray, "
+                    "dict, list, number, but got {}".format(type(first)))
--- a/rtdetr_paddle/ppdet/engine/init.py
+++ b/rtdetr_paddle/ppdet/engine/init.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from . import trainer
+from .trainer import *
+
+from . import callbacks
+from .callbacks import *
+
+from . import env
+from .env import *
+
+__all__ = trainer.__all__ \
+        + callbacks.__all__ \
+        + env.__all__
--- a/rtdetr_paddle/ppdet/engine/callbacks.py
+++ b/rtdetr_paddle/ppdet/engine/callbacks.py
@@ -0,0 +1,557 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import datetime
+import six
+import copy
+import json
+
+import paddle
+import paddle.distributed as dist
+
+from ppdet.utils.checkpoint import save_model
+from ppdet.metrics import get_infer_results
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = [
+    'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer',
+    'VisualDLWriter', 'SniperProposalsGenerator'
+]
+
+
+class Callback(object):
+    """
+    Base class for engine callbacks.
+
+    Every hook receives a ``status`` object and does nothing by default;
+    subclasses override only the hooks they care about.
+    """
+
+    def __init__(self, model):
+        # Owner object; subclasses in this module read attributes such as
+        # `cfg`, `model` and `_metrics` from it.
+        self.model = model
+
+    def on_step_begin(self, status):
+        """No-op default for the `on_step_begin` hook."""
+
+    def on_step_end(self, status):
+        """No-op default for the `on_step_end` hook."""
+
+    def on_epoch_begin(self, status):
+        """No-op default for the `on_epoch_begin` hook."""
+
+    def on_epoch_end(self, status):
+        """No-op default for the `on_epoch_end` hook."""
+
+    def on_train_begin(self, status):
+        """No-op default for the `on_train_begin` hook."""
+
+    def on_train_end(self, status):
+        """No-op default for the `on_train_end` hook."""
+
+
+class ComposeCallback(object):
+    """
+    Fan each hook call out to a list of callbacks, in the order given.
+
+    ``None`` entries in the input are dropped; every remaining entry must be
+    a ``Callback`` instance.
+    """
+
+    def __init__(self, callbacks):
+        active = [cb for cb in list(callbacks) if cb is not None]
+        for cb in active:
+            assert isinstance(
+                cb, Callback), "callback should be subclass of Callback"
+        self._callbacks = active
+
+    def _dispatch(self, hook_name, status):
+        # Invoke the named hook on every registered callback.
+        for cb in self._callbacks:
+            getattr(cb, hook_name)(status)
+
+    def on_step_begin(self, status):
+        self._dispatch('on_step_begin', status)
+
+    def on_step_end(self, status):
+        self._dispatch('on_step_end', status)
+
+    def on_epoch_begin(self, status):
+        self._dispatch('on_epoch_begin', status)
+
+    def on_epoch_end(self, status):
+        self._dispatch('on_epoch_end', status)
+
+    def on_train_begin(self, status):
+        self._dispatch('on_train_begin', status)
+
+    def on_train_end(self, status):
+        self._dispatch('on_train_end', status)
+
+
+class LogPrinter(Callback):
+    """
+    Write progress lines to the module logger.
+
+    Output is produced only when running as a single process or on rank 0.
+    """
+
+    def __init__(self, model):
+        super(LogPrinter, self).__init__(model)
+
+    def _log_train_step(self, status, mode):
+        # Emit one training progress line every `cfg.log_iter` steps.
+        cfg = self.model.cfg
+        epoch_id = status['epoch_id']
+        step_id = status['step_id']
+        steps_per_epoch = status['steps_per_epoch']
+        training_status = status['training_status']
+        batch_time = status['batch_time']
+        data_time = status['data_time']
+
+        total_epochs = cfg.epoch
+        batch_size = cfg['{}Reader'.format(mode.capitalize())]['batch_size']
+
+        logs = training_status.log()
+        if step_id % cfg.log_iter != 0:
+            return
+
+        # Pad the step counter to the width of the per-epoch step count.
+        step_width = str(len(str(steps_per_epoch)))
+        # Remaining steps times the global average step time.
+        remaining_steps = (total_epochs - epoch_id
+                           ) * steps_per_epoch - step_id
+        remaining_sec = remaining_steps * batch_time.global_avg
+        eta_str = str(datetime.timedelta(seconds=int(remaining_sec)))
+        ips = float(batch_size) / batch_time.avg
+        pieces = [
+            'Epoch: [{}]',
+            '[{' + ':' + step_width + 'd' + '}/{}]',
+            'learning_rate: {lr:.6f}',
+            '{meters}',
+            'eta: {eta}',
+            'batch_cost: {btime}',
+            'data_cost: {dtime}',
+            'ips: {ips:.4f} images/s',
+        ]
+        message = ' '.join(pieces).format(
+            epoch_id,
+            step_id,
+            steps_per_epoch,
+            lr=status['learning_rate'],
+            meters=logs,
+            eta=eta_str,
+            btime=str(batch_time),
+            dtime=str(data_time),
+            ips=ips)
+        logger.info(message)
+
+    def on_step_end(self, status):
+        """
+        Log training progress (mode 'train') or the evaluation step counter
+        every 100 steps (mode 'eval').
+        """
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        mode = status['mode']
+        if mode == 'train':
+            self._log_train_step(status, mode)
+        if mode == 'eval':
+            step_id = status['step_id']
+            if step_id % 100 == 0:
+                logger.info("Eval iter: {}".format(step_id))
+
+    def on_epoch_end(self, status):
+        """After an evaluation pass, log the sample count and average FPS."""
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if status['mode'] != 'eval':
+            return
+        sample_num = status['sample_num']
+        cost_time = status['cost_time']
+        logger.info('Total sample number: {}, average FPS: {}'.format(
+            sample_num, sample_num / cost_time))
+
+
+class Checkpointer(Callback):
+    """
+    Save model weights at the end of training epochs and, when
+    ``status['save_best_model']`` is set, after evaluations whose primary
+    metric matches or beats the best value seen so far.
+
+    Files are written only when running as a single process or on rank 0.
+    """
+
+    def __init__(self, model):
+        super(Checkpointer, self).__init__(model)
+        # Very low sentinel so the `>=` comparison in on_epoch_end accepts the
+        # first evaluation result (assuming metrics stay above -1000).
+        self.best_ap = -1000.
+        cfg = self.model.cfg
+        self.save_dir = os.path.join(cfg.save_dir, cfg.filename)
+        # Use the nested `student_model` when the network exposes one.
+        network = self.model.model
+        if hasattr(network, 'student_model'):
+            self.weight = network.student_model
+        else:
+            self.weight = network
+
+    def on_epoch_end(self, status):
+        """
+        Decide whether a checkpoint is due for this epoch and write it.
+
+        In 'train' mode a snapshot is taken every `cfg.snapshot_epoch` epochs
+        and on the last epoch (saved as "model_final"). In 'eval' mode the
+        weights are saved as "best_model" when the metric does not get worse.
+        """
+        mode = status['mode']
+        epoch_id = status['epoch_id']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+
+        weight = None
+        save_name = None
+        if mode == 'train':
+            end_epoch = self.model.cfg.epoch
+            on_snapshot = (epoch_id + 1) % self.model.cfg.snapshot_epoch == 0
+            is_final = epoch_id == end_epoch - 1
+            if on_snapshot or is_final:
+                save_name = "model_final" if is_final else str(epoch_id)
+                weight = self.weight.state_dict()
+        elif mode == 'eval':
+            if 'save_best_model' in status and status['save_best_model']:
+                for metric in self.model._metrics:
+                    map_res = metric.get_results()
+                    # First available result type wins; 'mask' is the
+                    # fallback when none of the others is present.
+                    key = 'mask'
+                    for candidate in ('pose3d', 'bbox', 'keypoint'):
+                        if candidate in map_res:
+                            key = candidate
+                            break
+                    eval_func = "mpjpe" if key == 'pose3d' else "ap"
+                    if key not in map_res:
+                        logger.warning("Evaluation results empty, this may be due to " \
+                                    "training iterations being too few or not " \
+                                    "loading the correct weights.")
+                        return
+                    score = map_res[key][0]
+                    if score >= self.best_ap:
+                        self.best_ap = score
+                        save_name = 'best_model'
+                        weight = self.weight.state_dict()
+                    logger.info("Best test {} {} is {:0.3f}.".format(
+                        key, eval_func, abs(self.best_ap)))
+
+        if not weight:
+            return
+
+        if not self.model.use_ema:
+            save_model(weight, self.model.optimizer, self.save_dir, save_name,
+                       epoch_id + 1)
+            return
+
+        if status.get('exchange_save_model', False):
+            # save model(student model) and ema_model(teacher model)
+            # in DenseTeacher SSOD, the teacher model will be higher,
+            # so exchange when saving pdparams
+            student_weights = status['weight']  # model
+            save_model(
+                weight,  # ema_model, saved in the primary slot
+                self.model.optimizer,
+                self.save_dir,
+                save_name,
+                epoch_id + 1,
+                ema_model=student_weights)
+        else:
+            # save model and ema_model
+            save_model(
+                status['weight'],
+                self.model.optimizer,
+                self.save_dir,
+                save_name,
+                epoch_id + 1,
+                ema_model=weight)
+
+
+class WiferFaceEval(Callback):
+    """
+    Evaluation-only callback: at the first epoch begin it feeds the network
+    to every metric via ``metric.update`` and then exits the interpreter.
+    """
+
+    def __init__(self, model):
+        super(WiferFaceEval, self).__init__(model)
+
+    def on_epoch_begin(self, status):
+        """Update all metrics with the network, then call ``sys.exit()``."""
+        assert self.model.mode == 'eval', \
+            "WiferFaceEval can only be set during evaluation"
+        metrics = self.model._metrics
+        for metric in metrics:
+            metric.update(self.model.model)
+        # Nothing else is run after the metrics have been updated.
+        sys.exit()
+
+
+class VisualDLWriter(Callback):
+    """
+    Use VisualDL to log scalars (training stats, eval metrics) and images.
+
+    Records are written only when running as a single process or on rank 0.
+    """
+
+    def __init__(self, model):
+        super(VisualDLWriter, self).__init__(model)
+
+        assert six.PY3, "VisualDL requires Python >= 3.5"
+        # Imported lazily so visualdl is only needed when this callback is used.
+        try:
+            from visualdl import LogWriter
+        except Exception as err:
+            logger.error('visualdl not found, plaese install visualdl. '
+                         'for example: `pip install visualdl`.')
+            raise err
+        log_dir = model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')
+        self.vdl_writer = LogWriter(log_dir)
+        # Independent step counters, one per kind of record.
+        self.vdl_loss_step = 0
+        self.vdl_mAP_step = 0
+        self.vdl_image_step = 0
+        self.vdl_image_frame = 0
+
+    def on_step_end(self, status):
+        """
+        In 'train' mode log every entry of the training status as a scalar;
+        in 'test' mode log the original and the result image.
+        """
+        mode = status['mode']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+
+        if mode == 'train':
+            scalars = status['training_status'].get()
+            for tag, value in scalars.items():
+                self.vdl_writer.add_scalar(tag, value, self.vdl_loss_step)
+            self.vdl_loss_step += 1
+        elif mode == 'test':
+            original = status['original_image']
+            rendered = status['result_image']
+            frame = self.vdl_image_frame
+            step = self.vdl_image_step
+            self.vdl_writer.add_image("original/frame_{}".format(frame),
+                                      original, step)
+            self.vdl_writer.add_image("result/frame_{}".format(frame),
+                                      rendered, step)
+            self.vdl_image_step += 1
+            # each frame can display ten pictures at most.
+            if self.vdl_image_step % 10 == 0:
+                self.vdl_image_step = 0
+                self.vdl_image_frame += 1
+
+    def on_epoch_end(self, status):
+        """After an evaluation pass, log the first value of every metric."""
+        mode = status['mode']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if mode != 'eval':
+            return
+        for metric in self.model._metrics:
+            for key, map_value in metric.get_results().items():
+                self.vdl_writer.add_scalar("{}-mAP".format(key), map_value[0],
+                                           self.vdl_mAP_step)
+        self.vdl_mAP_step += 1
+
+
+class WandbCallback(Callback):
+    """
+    Log training/evaluation metrics and checkpoint files to Weights & Biases.
+
+    ``wandb.init`` arguments are taken from ``cfg['wandb']`` (if present) and
+    from every top-level config key that starts with ``wandb_`` (the prefix
+    is removed). All logging happens only when running as a single process
+    or on rank 0.
+    """
+
+    def __init__(self, model):
+        super(WandbCallback, self).__init__(model)
+
+        # Imported lazily so wandb is only needed when this callback is used.
+        try:
+            import wandb
+            self.wandb = wandb
+        except Exception as e:
+            logger.error('wandb not found, please install wandb. '
+                         'Use: `pip install wandb`.')
+            raise e
+
+        self.wandb_params = model.cfg.get('wandb', None)
+        self.save_dir = os.path.join(self.model.cfg.save_dir,
+                                     self.model.cfg.filename)
+        if self.wandb_params is None:
+            self.wandb_params = {}
+        prefix = "wandb_"
+        for k, v in model.cfg.items():
+            if k.startswith(prefix):
+                # Slice the prefix off. str.lstrip("wandb_") would strip any
+                # leading run of the characters w/a/n/d/b/_ instead, turning
+                # e.g. "wandb_name" into "me".
+                self.wandb_params[k[len(prefix):]] = v
+
+        self._run = None
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            # Accessing the property creates (or attaches to) the run.
+            _ = self.run
+            self.run.config.update(self.model.cfg)
+            self.run.define_metric("epoch")
+            self.run.define_metric("eval/*", step_metric="epoch")
+
+        self.best_ap = -1000.
+        # Per-step throughput values, averaged at the end of each train epoch.
+        self.fps = []
+
+    @property
+    def run(self):
+        """The wandb run; reuses an ongoing run or starts one on first use."""
+        if self._run is None:
+            if self.wandb.run is not None:
+                logger.info(
+                    "There is an ongoing wandb run which will be used "
+                    "for logging. Please use `wandb.finish()` to end that "
+                    "if the behaviour is not intended")
+                self._run = self.wandb.run
+            else:
+                self._run = self.wandb.init(**self.wandb_params)
+        return self._run
+
+    def save_model(self,
+                   optimizer,
+                   save_dir,
+                   save_name,
+                   last_epoch,
+                   ema_model=None,
+                   ap=None,
+                   fps=None,
+                   tags=None):
+        """
+        Upload already-saved checkpoint files as wandb artifacts.
+
+        Args:
+            optimizer: unused, kept for signature compatibility.
+            save_dir (str): directory holding the checkpoint files.
+            save_name (str): checkpoint file name without extension.
+            last_epoch (int): stored in the artifact metadata.
+            ema_model: truthy when an EMA checkpoint (``.pdema``) was saved
+                next to the ``.pdparams`` file and should be uploaded too.
+            ap: optional metric value stored in the metadata when truthy.
+            fps: optional throughput stored in the metadata when truthy.
+            tags (list of str): aliases attached to the artifacts.
+        """
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            model_path = os.path.join(save_dir, save_name)
+            metadata = {"last_epoch": last_epoch}
+            if ap:
+                metadata["ap"] = ap
+            if fps:
+                metadata["fps"] = fps
+
+            model_artifact = self.wandb.Artifact(
+                name="model-{}".format(self.run.id),
+                type="model",
+                metadata=metadata)
+            model_artifact.add_file(model_path + ".pdparams", name="model")
+
+            # The callers in this class pass the `use_ema` flag here, so the
+            # EMA file is uploaded when the value is truthy. The previous
+            # `is None` test made this branch unreachable for those callers
+            # and selected it when no EMA model was given at all.
+            if ema_model:
+                ema_artifact = self.wandb.Artifact(
+                    name="ema_model-{}".format(self.run.id),
+                    type="model",
+                    metadata=metadata)
+                ema_artifact.add_file(model_path + ".pdema", name="model_ema")
+                self.run.log_artifact(ema_artifact, aliases=tags)
+
+            self.run.log_artifact(model_artifact, aliases=tags)
+
+    def on_step_end(self, status):
+        """In 'train' mode, log the training stats and timing of this step."""
+        mode = status['mode']
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            if mode == 'train':
+                training_status = status['training_status'].get()
+                # Build a fresh dict instead of converting values in place so
+                # the object returned by get() is left untouched.
+                metrics = {
+                    "train/" + k: float(v)
+                    for k, v in training_status.items()
+                }
+
+                # calculate ips, data_cost, batch_cost
+                batch_time = status['batch_time']
+                data_time = status['data_time']
+                batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
+                ))]['batch_size']
+
+                ips = float(batch_size) / float(batch_time.avg)
+                metrics["train/ips"] = ips
+                metrics["train/data_cost"] = float(data_time.avg)
+                metrics["train/batch_cost"] = float(batch_time.avg)
+
+                self.fps.append(ips)
+                self.run.log(metrics)
+
+    def on_epoch_end(self, status):
+        """
+        In 'train' mode upload the snapshot due for this epoch; in 'eval'
+        mode log the metrics and, when ``status['save_best_model']`` is set,
+        upload the best model whenever the metric does not get worse.
+        """
+        mode = status['mode']
+        epoch_id = status['epoch_id']
+        save_name = None
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            if mode == 'train':
+                # No step may have been recorded (e.g. an empty epoch); avoid
+                # dividing by zero and simply omit fps from the metadata.
+                fps = sum(self.fps) / len(self.fps) if self.fps else None
+                self.fps = []
+
+                end_epoch = self.model.cfg.epoch
+                if (
+                        epoch_id + 1
+                ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
+                    save_name = str(
+                        epoch_id) if epoch_id != end_epoch - 1 else "model_final"
+                    tags = ["latest", "epoch_{}".format(epoch_id)]
+                    self.save_model(
+                        self.model.optimizer,
+                        self.save_dir,
+                        save_name,
+                        epoch_id + 1,
+                        self.model.use_ema,
+                        fps=fps,
+                        tags=tags)
+            if mode == 'eval':
+                sample_num = status['sample_num']
+                cost_time = status['cost_time']
+
+                fps = sample_num / cost_time
+
+                merged_dict = {}
+                for metric in self.model._metrics:
+                    for key, map_value in metric.get_results().items():
+                        merged_dict["eval/{}-mAP".format(key)] = map_value[0]
+                merged_dict["epoch"] = status["epoch_id"]
+                merged_dict["eval/fps"] = fps
+
+                self.run.log(merged_dict)
+
+                if 'save_best_model' in status and status['save_best_model']:
+                    for metric in self.model._metrics:
+                        map_res = metric.get_results()
+                        if 'pose3d' in map_res:
+                            key = 'pose3d'
+                        elif 'bbox' in map_res:
+                            key = 'bbox'
+                        elif 'keypoint' in map_res:
+                            key = 'keypoint'
+                        else:
+                            key = 'mask'
+                        if key not in map_res:
+                            logger.warning("Evaluation results empty, this may be due to " \
+                                        "training iterations being too few or not " \
+                                        "loading the correct weights.")
+                            return
+                        if map_res[key][0] >= self.best_ap:
+                            self.best_ap = map_res[key][0]
+                            save_name = 'best_model'
+                            tags = ["best", "epoch_{}".format(epoch_id)]
+
+                            self.save_model(
+                                self.model.optimizer,
+                                self.save_dir,
+                                save_name,
+                                last_epoch=epoch_id + 1,
+                                ema_model=self.model.use_ema,
+                                ap=abs(self.best_ap),
+                                fps=fps,
+                                tags=tags)
+
+    def on_train_end(self, status):
+        """Finish the wandb run on the process that owns it."""
+        # Other ranks never created a run; touching `self.run` there would
+        # lazily start a new one just to finish it.
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:
+            self.run.finish()
+
+
+class SniperProposalsGenerator(Callback):
+    """
+    After training, run the network over a chip-cropped copy of the dataset
+    and dump the resulting bbox detections as JSON to ``cfg.proposals_path``.
+    """
+
+    def __init__(self, model):
+        super(SniperProposalsGenerator, self).__init__(model)
+        ori_dataset = self.model.dataset
+        self.dataset = self._create_new_dataset(ori_dataset)
+        self.loader = self.model.loader
+        self.cfg = self.model.cfg
+        self.infer_model = self.model.model
+
+    def _create_new_dataset(self, ori_dataset):
+        """Return a deep copy of the dataset whose roidbs are inference crops."""
+        dataset = copy.deepcopy(ori_dataset)
+        # init anno_cropper
+        dataset.init_anno_cropper()
+        # generate infer roidbs
+        ori_roidbs = dataset.get_ori_roidbs()
+        roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)
+        # set new roidbs
+        dataset.set_roidbs(roidbs)
+
+        return dataset
+
+    def _eval_with_loader(self, loader):
+        """
+        Run the network in eval mode without gradients over ``loader``.
+
+        Returns:
+            list: one output mapping per batch, extended with the batch's
+                'im_shape', 'scale_factor' and 'im_id', and with every value
+                that has a ``numpy`` method converted through it.
+        """
+        results = []
+        with paddle.no_grad():
+            self.infer_model.eval()
+            for data in loader:
+                outs = self.infer_model(data)
+                for key in ['im_shape', 'scale_factor', 'im_id']:
+                    outs[key] = data[key]
+                for key, value in outs.items():
+                    if hasattr(value, 'numpy'):
+                        outs[key] = value.numpy()
+
+                results.append(outs)
+
+        return results
+
+    def on_train_end(self, status):
+        """Generate proposals with the trained network and save them."""
+        self.loader.dataset = self.dataset
+        results = self._eval_with_loader(self.loader)
+        results = self.dataset.anno_cropper.aggregate_chips_detections(results)
+        # sniper
+        proposals = []
+        clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}
+        for outs in results:
+            batch_res = get_infer_results(outs, clsid2catid)
+            start = 0
+            for i, _ in enumerate(outs['im_id']):
+                bbox_num = outs['bbox_num']
+                end = start + bbox_num[i]
+                bbox_res = batch_res['bbox'][start:end] \
+                    if 'bbox' in batch_res else None
+                if bbox_res:
+                    proposals += bbox_res
+                # Move the window to the next image's boxes; without this
+                # every image re-read the slice starting at index 0.
+                start = end
+        logger.info("save proposals in {}".format(self.cfg.proposals_path))
+        with open(self.cfg.proposals_path, 'w') as f:
+            json.dump(proposals, f)
--- a/rtdetr_paddle/ppdet/engine/env.py
+++ b/rtdetr_paddle/ppdet/engine/env.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import random
+import numpy as np
+
+import paddle
+from paddle.distributed import fleet
+
+__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env']
+
+
+def init_fleet_env(find_unused_parameters=False):
+    """
+    Initialize paddle fleet in collective mode.
+
+    Args:
+        find_unused_parameters (bool): value assigned to the distributed
+            strategy's ``find_unused_parameters`` field.
+    """
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.find_unused_parameters = find_unused_parameters
+    fleet.init(strategy=dist_strategy, is_collective=True)
+
+
+def init_parallel_env():
+    """
+    Initialize paddle's parallel environment.
+
+    When both PADDLE_TRAINER_ID and PADDLE_TRAINERS_NUM are set in the
+    environment, Python's and NumPy's random generators are first seeded
+    with ``99 + trainer_id`` so each trainer gets its own seed.
+    """
+    environ = os.environ
+    if 'PADDLE_TRAINER_ID' in environ and 'PADDLE_TRAINERS_NUM' in environ:
+        local_seed = 99 + int(environ['PADDLE_TRAINER_ID'])
+        random.seed(local_seed)
+        np.random.seed(local_seed)
+
+    paddle.distributed.init_parallel_env()
+
+
+def set_random_seed(seed):
+    """Seed paddle, Python's ``random`` and NumPy with the same value."""
+    for seeder in (paddle.seed, random.seed, np.random.seed):
+        seeder(seed)
--- a/rtdetr_paddle/ppdet/engine/export_utils.py
+++ b/rtdetr_paddle/ppdet/engine/export_utils.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import yaml
+from collections import OrderedDict
+
+import paddle
+from ppdet.data.source.category import get_categories
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+# Architecture keyword -> `min_subgraph_size` value written into the exported
+# inference config by `_dump_infer_config`. Keys are matched as substrings of
+# the configured architecture name, in insertion order, and the first match
+# wins. (Presumably "TRT" refers to TensorRT -- confirm against the deploy
+# code that consumes `min_subgraph_size`.)
+TRT_MIN_SUBGRAPH = {
+    'YOLO': 3,
+    'PPYOLOE': 3,
+    'SSD': 60,
+    'RCNN': 40,
+    'RetinaNet': 40,
+    'S2ANet': 80,
+    'EfficientDet': 40,
+    'Face': 3,
+    'TTFNet': 60,
+    'FCOS': 16,
+    'SOLOv2': 60,
+    'HigherHRNet': 3,
+    'HRNet': 3,
+    'DeepSORT': 3,
+    'ByteTrack': 10,
+    'CenterTrack': 5,
+    'JDE': 10,
+    'FairMOT': 5,
+    'GFL': 16,
+    'PicoDet': 3,
+    'CenterNet': 5,
+    'TOOD': 5,
+    'YOLOX': 8,
+    'YOLOF': 40,
+    'METRO_Body': 3,
+    'DETR': 3,
+}
+
+# Architectures for which `_dump_infer_config` selects 'keypoint_arch' labels.
+KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']
+# Architectures for which `_dump_infer_config` also exports a tracker config.
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
+
+# Input specs handed to `paddle.jit.to_static` by `apply_to_static`, keyed by
+# the value of config['filename']. Each entry is a one-element list holding a
+# dict of input name -> `paddle.static.InputSpec`; -1 marks a dimension that
+# is left unspecified. A filename with no entry here results in
+# `input_spec=None`.
+TO_STATIC_SPEC = {
+    'yolov3_darknet53_270e_coco': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, 50], dtype='float32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, 50, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'target0': paddle.static.InputSpec(
+            name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target1': paddle.static.InputSpec(
+            name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target2': paddle.static.InputSpec(
+            name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+    }],
+    'tinypose_128x96': [{
+        'center': paddle.static.InputSpec(
+            name='center', shape=[-1, 2], dtype='float32'),
+        'scale': paddle.static.InputSpec(
+            name='scale', shape=[-1, 2], dtype='float32'),
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, 128, 96], dtype='float32'),
+        'score': paddle.static.InputSpec(
+            name='score', shape=[-1], dtype='float32'),
+        'rotate': paddle.static.InputSpec(
+            name='rotate', shape=[-1], dtype='float32'),
+        'target': paddle.static.InputSpec(
+            name='target', shape=[-1, 17, 32, 24], dtype='float32'),
+        'target_weight': paddle.static.InputSpec(
+            name='target_weight', shape=[-1, 17, 1], dtype='float32'),
+    }],
+    'fcos_r50_fpn_1x_coco': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'reg_target0': paddle.static.InputSpec(
+            name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'),
+        'labels0': paddle.static.InputSpec(
+            name='labels0', shape=[-1, 160, 160, 1], dtype='int32'),
+        'centerness0': paddle.static.InputSpec(
+            name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'),
+        'reg_target1': paddle.static.InputSpec(
+            name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'),
+        'labels1': paddle.static.InputSpec(
+            name='labels1', shape=[-1, 80, 80, 1], dtype='int32'),
+        'centerness1': paddle.static.InputSpec(
+            name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'),
+        'reg_target2': paddle.static.InputSpec(
+            name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'),
+        'labels2': paddle.static.InputSpec(
+            name='labels2', shape=[-1, 40, 40, 1], dtype='int32'),
+        'centerness2': paddle.static.InputSpec(
+            name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'),
+        'reg_target3': paddle.static.InputSpec(
+            name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'),
+        'labels3': paddle.static.InputSpec(
+            name='labels3', shape=[-1, 20, 20, 1], dtype='int32'),
+        'centerness3': paddle.static.InputSpec(
+            name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'),
+        'reg_target4': paddle.static.InputSpec(
+            name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'),
+        'labels4': paddle.static.InputSpec(
+            name='labels4', shape=[-1, 10, 10, 1], dtype='int32'),
+        'centerness4': paddle.static.InputSpec(
+            name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'),
+    }],
+    'picodet_s_320_coco_lcnet': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, -1, 1], dtype='float32'),
+        'gt_class': paddle.static.InputSpec(
+            name='gt_class', shape=[-1, -1, 1], dtype='int32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'pad_gt_mask': paddle.static.InputSpec(
+            name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),
+    }],
+    'ppyoloe_crn_s_300e_coco': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, -1, 1], dtype='float32'),
+        'gt_class': paddle.static.InputSpec(
+            name='gt_class', shape=[-1, -1, 1], dtype='int32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'pad_gt_mask': paddle.static.InputSpec(
+            name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),
+    }],
+}
+
+
+def apply_to_static(config, model):
+    """Wrap ``model`` with ``paddle.jit.to_static``.
+
+    The input spec is looked up in ``TO_STATIC_SPEC`` under the config's
+    ``'filename'`` entry; when there is no such entry the spec is ``None``.
+
+    Args:
+        config: Mapping-like config exposing ``get``.
+        model: Model object to convert.
+
+    Returns:
+        The object returned by ``paddle.jit.to_static``.
+    """
+    input_spec = TO_STATIC_SPEC.get(config.get('filename', None), None)
+    static_model = paddle.jit.to_static(model, input_spec=input_spec)
+    logger.info("Successfully to apply @to_static with specs: {}".format(
+        input_spec))
+    return static_model
+
+
+def _prune_input_spec(input_spec, program, targets):
+    """Drop the input specs whose variables do not survive program pruning.
+
+    A clone of ``program`` is pruned down to ``targets`` and every name in
+    ``input_spec[0]`` is looked up in the pruned program's global block;
+    only names that can still be found are kept.
+
+    The work is done in static mode. The previous mode is restored in a
+    ``finally`` clause so that a failure while cloning or pruning does not
+    leave the process stuck in static mode.
+
+    Args:
+        input_spec: Sequence whose first element is a dict mapping input
+            names to their specs.
+        program: Static program to clone and prune; it is not modified.
+        targets: Targets forwarded to ``program._prune``.
+
+    Returns:
+        list: A one-element list holding the dict of surviving specs.
+    """
+    device = paddle.get_device()
+    paddle.enable_static()
+    kept_specs = {}
+    try:
+        paddle.set_device(device)
+        pruned_program = program.clone()._prune(targets=targets)
+        global_block = pruned_program.global_block()
+        for name, spec in input_spec[0].items():
+            try:
+                global_block.var(name)
+            except Exception:
+                # A failed lookup is taken to mean the variable was pruned
+                # away, so the corresponding input is dropped on purpose.
+                continue
+            kept_specs[name] = spec
+    finally:
+        paddle.disable_static(place=device)
+    return [kept_specs]
+
+
+def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
+    """Translate a reader config into deploy preprocess ops and labels.
+
+    Unlike a naive translation, the transform dicts inside ``reader_cfg``
+    are never modified: the ``Resize`` overrides are applied to the
+    per-transform copy that goes into the result.
+
+    Args:
+        reader_cfg: Reader config. ``'sample_transforms'`` is required;
+            ``'fuse_normalize'`` and ``'batch_transforms'`` are optional.
+        dataset_cfg: Dataset object providing ``get_anno()``.
+        metric: Metric name forwarded to ``get_categories``.
+        arch: Label architecture name forwarded to ``get_categories``.
+        image_shape: Sequence whose element 1 is compared against -1 and
+            whose ``[1:]`` slice becomes the ``Resize`` target size
+            (presumably ``[C, H, W]`` -- confirm against the caller).
+
+    Returns:
+        tuple: ``(preprocess_list, label_list)`` -- a list of op dicts,
+        each with a ``'type'`` key, and the category names as strings.
+    """
+    anno_file = dataset_cfg.get_anno()
+    _, catid2name = get_categories(metric, anno_file, arch)
+    label_list = [str(cat) for cat in catid2name.values()]
+
+    preprocess_list = []
+    fuse_normalize = reader_cfg.get('fuse_normalize', False)
+    # The first sample transform is skipped (presumably the image decode
+    # step, which the deploy side performs itself -- confirm).
+    for st in reader_cfg['sample_transforms'][1:]:
+        for key, value in st.items():
+            if fuse_normalize and key == 'NormalizeImage':
+                continue
+            p = {'type': key}
+            p.update(value)
+            if key == 'Resize':
+                # Edit the copy, not the caller's config dict.
+                if int(image_shape[1]) != -1:
+                    p['target_size'] = image_shape[1:]
+                p.setdefault('interp', 1)  # cv2.INTER_LINEAR
+            preprocess_list.append(p)
+
+    batch_transforms = reader_cfg.get('batch_transforms', None)
+    if batch_transforms:
+        for bt in batch_transforms:
+            for key, value in bt.items():
+                # for deploy/infer, use PadStride(stride) instead of
+                # PadBatch(pad_to_stride)
+                if key == 'PadBatch':
+                    preprocess_list.append({
+                        'type': 'PadStride',
+                        'stride': value['pad_to_stride']
+                    })
+                    break
+
+    return preprocess_list, label_list
+
+
+def _parse_tracker(tracker_cfg):
+    """Return the tracker config's items as a new plain ``dict``.
+
+    Args:
+        tracker_cfg: Mapping-like object exposing ``items()``.
+
+    Returns:
+        dict: Shallow copy of the key/value pairs of ``tracker_cfg``.
+    """
+    return {name: setting for name, setting in tracker_cfg.items()}
+
+
+def _dump_infer_config(config, path, image_shape, model):
+    """Build the inference (deploy) config and write it to ``path`` as YAML.
+
+    The dumped mapping always holds ``mode``, ``draw_threshold``,
+    ``metric``, ``use_dynamic_shape``, ``arch``, ``min_subgraph_size``,
+    ``Preprocess`` and ``label_list``. Depending on the architecture it may
+    also hold ``export_onnx``/``export_eb`` (RCNN exported to ONNX),
+    ``tracker`` (MOT architectures), ``mask`` (a truthy ``mask_head``) and
+    ``NMS``/``fpn_stride`` (PicoDet).
+
+    Args:
+        config: Full model config, indexable by key.
+        path: Destination file path of the YAML file.
+        image_shape: Export input shape. Element 2 equal to -1 marks a
+            dynamic shape and ``image_shape[1:]`` is passed on to
+            ``_parse_reader`` (presumably ``[batch, C, H, W]`` -- confirm
+            against the caller).
+        model: Unused; kept for interface compatibility.
+
+    Note:
+        An architecture with no match in ``TRT_MIN_SUBGRAPH`` terminates
+        the process through ``os._exit(0)``.
+    """
+    arch_state = False
+    from ppdet.core.config.yaml_helpers import setup_orderdict
+    setup_orderdict()
+    use_dynamic_shape = True if image_shape[2] == -1 else False
+    infer_cfg = OrderedDict({
+        'mode': 'paddle',
+        'draw_threshold': 0.5,
+        'metric': config['metric'],
+        'use_dynamic_shape': use_dynamic_shape
+    })
+    export_onnx = config.get('export_onnx', False)
+    export_eb = config.get('export_eb', False)
+
+    infer_arch = config['architecture']
+    if 'RCNN' in infer_arch and export_onnx:
+        logger.warning(
+            "Exporting RCNN model to ONNX only support batch_size = 1")
+        infer_cfg['export_onnx'] = True
+        infer_cfg['export_eb'] = export_eb
+
+    if infer_arch in MOT_ARCH:
+        if infer_arch == 'DeepSORT':
+            tracker_cfg = config['DeepSORTTracker']
+        elif infer_arch == 'CenterTrack':
+            tracker_cfg = config['CenterTracker']
+        else:
+            tracker_cfg = config['JDETracker']
+        infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
+
+    # Substring match in table order; the first matching keyword wins.
+    for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():
+        if arch in infer_arch:
+            infer_cfg['arch'] = arch
+            infer_cfg['min_subgraph_size'] = min_subgraph_size
+            arch_state = True
+            break
+
+    if infer_arch == 'PPYOLOEWithAuxHead':
+        infer_arch = 'PPYOLOE'
+
+    # 'YOLO' is the first key of the table and a substring of each of these
+    # names, so the loop above stops on it; overwrite with the exact entry.
+    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
+        infer_cfg['arch'] = infer_arch
+        infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
+        arch_state = True
+
+    if not arch_state:
+        logger.error(
+            'Architecture: {} is not supported for exporting model now.\n'.
+            format(infer_arch) +
+            'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py')
+        # NOTE(review): this exits with status 0 although it is an error
+        # path, and skips normal interpreter cleanup. Left unchanged because
+        # callers may rely on it.
+        os._exit(0)
+    arch_cfg = config[config['architecture']]
+    if 'mask_head' in arch_cfg and arch_cfg['mask_head']:
+        infer_cfg['mask'] = True
+    label_arch = 'detection_arch'
+    if infer_arch in KEYPOINT_ARCH:
+        label_arch = 'keypoint_arch'
+
+    if infer_arch in MOT_ARCH:
+        if config['metric'] in ['COCO', 'VOC']:
+            # MOT model run as Detector
+            reader_cfg = config['TestReader']
+            dataset_cfg = config['TestDataset']
+        else:
+            # 'metric' in ['MOT', 'MCMOT', 'KITTI']
+            label_arch = 'mot_arch'
+            reader_cfg = config['TestMOTReader']
+            dataset_cfg = config['TestMOTDataset']
+    else:
+        reader_cfg = config['TestReader']
+        dataset_cfg = config['TestDataset']
+
+    infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader(
+        reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:])
+
+    if infer_arch == 'PicoDet':
+        if hasattr(config, 'export') and config['export'].get(
+                'post_process',
+                False) and not config['export'].get('benchmark', False):
+            infer_cfg['arch'] = 'GFL'
+        head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead'
+        infer_cfg['NMS'] = config[head_name]['nms']
+        # In order to speed up the prediction, the threshold of nms
+        # is adjusted here, which can be changed in infer_cfg.yml.
+        # Presumably infer_cfg['NMS'] refers to the same object as
+        # config[head_name]['nms'], so these overrides also reach the
+        # dumped file -- confirm.
+        config[head_name]['nms']["score_threshold"] = 0.3
+        config[head_name]['nms']["nms_threshold"] = 0.5
+        infer_cfg['fpn_stride'] = config[head_name]['fpn_stride']
+
+    # Close the file deterministically instead of leaving it to the GC.
+    with open(path, 'w') as f:
+        yaml.dump(infer_cfg, f)
+    logger.info("Export inference config file to {}".format(path))
--- a/rtdetr_paddle/ppdet/engine/trainer.py
+++ b/rtdetr_paddle/ppdet/engine/trainer.py
@@ -0,0 +1,966 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import copy
+import time
+from tqdm import tqdm
+
+import numpy as np
+import typing
+from PIL import Image, ImageOps, ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+import paddle
+import paddle.nn as nn
+import paddle.distributed as dist
+from paddle.distributed import fleet
+from paddle.static import InputSpec
+from ppdet.optimizer import ModelEMA
+
+from ppdet.core.workspace import create
+from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
+from ppdet.utils.visualizer import visualize_results, save_result
+from ppdet.metrics import Metric, COCOMetric, VOCMetric, get_infer_results
+from ppdet.data.source.category import get_categories
+import ppdet.utils.stats as stats
+from ppdet.utils.fuse_utils import fuse_conv_bn
+from ppdet.utils import profiler
+from ppdet.modeling.post_process import multiclass_nms
+
+from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, VisualDLWriter, WandbCallback
+from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
+
+from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = ['Trainer']
+
+class Trainer(object):
+    def __init__(self, cfg, mode='train'):
+        """Build dataset, loader, model, optimizer, EMA, callbacks, metrics.
+
+        Args:
+            cfg: Global config. A copy made with ``cfg.copy()`` is stored as
+                ``self.cfg``; some values below are still read from the
+                original ``cfg`` object.
+            mode (str): One of 'train', 'eval' or 'test' (case-insensitive).
+
+        What gets built depends on ``mode``:
+            * 'train': TrainDataset, TrainReader loader, learning rate and
+              optimizer.
+            * 'eval': EvalDataset and an EvalReader loader driven by a
+              ``paddle.io.BatchSampler``.
+            * 'test': TestDataset only; no loader is created here.
+        """
+        self.cfg = cfg.copy()
+        assert mode.lower() in ['train', 'eval', 'test'], \
+                "mode should be 'train', 'eval' or 'test'"
+        self.mode = mode.lower()
+        self.optimizer = None
+        self.is_loaded_weights = False
+        self.use_amp = self.cfg.get('amp', False)
+        self.amp_level = self.cfg.get('amp_level', 'O1')
+        self.custom_white_list = self.cfg.get('custom_white_list', None)
+        self.custom_black_list = self.cfg.get('custom_black_list', None)
+
+        # build data loader
+        # The created dataset is also written back into self.cfg under
+        # '<Mode>Dataset'.
+        capital_mode = self.mode.capitalize()
+        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
+            '{}Dataset'.format(capital_mode))()
+
+        if self.mode == 'train':
+            self.loader = create('{}Reader'.format(capital_mode))(
+                self.dataset, cfg.worker_num)
+
+        # build model
+        # A model object already present in the config is reused as-is and
+        # flagged as loaded, which makes load_weights() a no-op.
+        if 'model' not in self.cfg:
+            self.model = create(cfg.architecture)
+        else:
+            self.model = self.cfg.model
+            self.is_loaded_weights = True
+
+        # EvalDataset build with BatchSampler to evaluate in single device
+        # TODO: multi-device evaluate
+        if self.mode == 'eval':
+            self._eval_batch_sampler = paddle.io.BatchSampler(
+                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
+            reader_name = '{}Reader'.format(self.mode.capitalize())
+            # If metric is VOC, need to be set collate_batch=False.
+            if cfg.metric == 'VOC':
+                self.cfg[reader_name]['collate_batch'] = False
+            self.loader = create(reader_name)(self.dataset, cfg.worker_num,
+                                                self._eval_batch_sampler)
+            
+        # TestDataset build after user set images, skip loader creation here
+
+        # get Params
+        print_params = self.cfg.get('print_params', False)
+        if print_params:
+            params = sum([
+                p.numel() for n, p in self.model.named_parameters()
+                if all([x not in n for x in ['_mean', '_variance', 'aux_']])
+            ])  # exclude BatchNorm running status
+            logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[
+                0]))
+
+        # build optimizer in train mode
+        if self.mode == 'train':
+            steps_per_epoch = len(self.loader)
+            if steps_per_epoch < 1:
+                logger.warning(
+                    "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
+                )
+            self.lr = create('LearningRate')(steps_per_epoch)
+            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+        # Only AMP level 'O2' decorates model and optimizer here; outside
+        # train mode self.optimizer is still None at this point.
+        if self.use_amp and self.amp_level == 'O2':
+            self.model, self.optimizer = paddle.amp.decorate(
+                models=self.model,
+                optimizers=self.optimizer,
+                level=self.amp_level)
+        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
+        if self.use_ema:
+            ema_decay = self.cfg.get('ema_decay', 0.9998)
+            ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
+            cycle_epoch = self.cfg.get('cycle_epoch', -1)
+            ema_black_list = self.cfg.get('ema_black_list', None)
+            ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False)
+            self.ema = ModelEMA(
+                self.model,
+                decay=ema_decay,
+                ema_decay_type=ema_decay_type,
+                cycle_epoch=cycle_epoch,
+                ema_black_list=ema_black_list,
+                ema_filter_no_grad=ema_filter_no_grad)
+
+        self._nranks = dist.get_world_size()
+        self._local_rank = dist.get_rank()
+
+        # Mutable state dict that is passed to every callback hook.
+        self.status = {}
+
+        self.start_epoch = 0
+        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
+
+        # initial default callbacks
+        self._init_callbacks()
+
+        # initial default metrics
+        self._init_metrics()
+        self._reset_metrics()
+
+    def _init_callbacks(self):
+        """Install the default callbacks for the current mode.
+
+        * 'train': LogPrinter and Checkpointer, plus VisualDLWriter when
+          ``use_vdl`` is set and WandbCallback when ``use_wandb`` is set or
+          a ``wandb`` key exists in the config.
+        * 'eval': LogPrinter only.
+        * 'test': VisualDLWriter when ``use_vdl`` is set.
+
+        In every other case the callback list is empty and
+        ``self._compose_callback`` is ``None``.
+        """
+        mode = self.mode
+        if mode == 'train':
+            self._callbacks = [LogPrinter(self), Checkpointer(self)]
+            if self.cfg.get('use_vdl', False):
+                self._callbacks.append(VisualDLWriter(self))
+            if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg:
+                self._callbacks.append(WandbCallback(self))
+        elif mode == 'eval':
+            self._callbacks = [LogPrinter(self)]
+        elif mode == 'test' and self.cfg.get('use_vdl', False):
+            self._callbacks = [VisualDLWriter(self)]
+        else:
+            # Nothing to compose.
+            self._callbacks = []
+            self._compose_callback = None
+            return
+
+        self._compose_callback = ComposeCallback(self._callbacks)
+
+    def _init_metrics(self, validate=False):
+        """Create ``self._metrics`` for the configured metric type.
+
+        No metrics are created in 'test' mode, nor in 'train' mode unless
+        ``validate`` is true. Otherwise a single COCOMetric or VOCMetric is
+        built from the config; any other metric type logs a warning and
+        leaves the list empty.
+
+        Args:
+            validate (bool): Whether evaluation runs during training.
+        """
+        if self.mode == 'test' or (self.mode == 'train' and not validate):
+            self._metrics = []
+            return
+
+        cfg = self.cfg
+        metric_type = cfg.metric
+        classwise = cfg.get('classwise', False)
+
+        if metric_type == 'COCO':
+            # TODO: bias should be unified
+            bias = 1 if cfg.get('bias', False) else 0
+            output_eval = cfg.get('output_eval', None)
+            save_prediction_only = cfg.get('save_prediction_only', False)
+
+            # Hand the clsid -> catid mapping to the metric so it does not
+            # have to load the annotation file again.
+            clsid2catid = None
+            if self.mode == 'eval':
+                clsid2catid = {
+                    clsid: catid
+                    for catid, clsid in self.dataset.catid2clsid.items()
+                }
+
+            # When validating during training, the annotation file comes
+            # from the EvalDataset rather than self.dataset (the training
+            # dataset).
+            if self.mode == 'train' and validate:
+                eval_dataset = cfg['EvalDataset']
+                eval_dataset.check_or_download_dataset()
+                anno_file = eval_dataset.get_anno()
+            else:
+                anno_file = self.dataset.get_anno()
+
+            iou_type = cfg.get('IouType', 'bbox')
+            coco_metric = COCOMetric(
+                anno_file=anno_file,
+                clsid2catid=clsid2catid,
+                classwise=classwise,
+                output_eval=output_eval,
+                bias=bias,
+                IouType=iou_type,
+                save_prediction_only=save_prediction_only)
+            self._metrics = [coco_metric]
+        elif metric_type == 'VOC':
+            output_eval = cfg.get('output_eval', None)
+            save_prediction_only = cfg.get('save_prediction_only', False)
+            voc_metric = VOCMetric(
+                label_list=self.dataset.get_label_list(),
+                class_num=cfg.num_classes,
+                map_type=cfg.map_type,
+                classwise=classwise,
+                output_eval=output_eval,
+                save_prediction_only=save_prediction_only)
+            self._metrics = [voc_metric]
+        else:
+            logger.warning("Metric not support for metric type {}".format(
+                metric_type))
+            self._metrics = []
+
+    def _reset_metrics(self):
+        """Call ``reset()`` on every metric in ``self._metrics``, in order."""
+        for registered in self._metrics:
+            registered.reset()
+
+    def register_callbacks(self, callbacks):
+        """Append extra callbacks and rebuild the composed callback.
+
+        Args:
+            callbacks: Iterable of ``Callback`` instances; ``None`` entries
+                are ignored.
+
+        Raises:
+            AssertionError: If an entry is not a ``Callback`` instance.
+        """
+        callbacks = [c for c in list(callbacks) if c is not None]
+        for c in callbacks:
+            # The message used to talk about metrics (copied from
+            # register_metrics); it now names the type that is checked.
+            assert isinstance(c, Callback), \
+                    "callbacks should be instances of subclass of Callback"
+        self._callbacks.extend(callbacks)
+        self._compose_callback = ComposeCallback(self._callbacks)
+
+    def register_metrics(self, metrics):
+        """Append extra metrics to ``self._metrics``.
+
+        Args:
+            metrics: Iterable of ``Metric`` instances; ``None`` entries are
+                ignored.
+
+        Raises:
+            AssertionError: If an entry is not a ``Metric`` instance.
+        """
+        accepted = [entry for entry in metrics if entry is not None]
+        assert all(isinstance(entry, Metric) for entry in accepted), \
+                "metrics shoule be instances of subclass of Metric"
+        self._metrics.extend(accepted)
+
+    def load_weights(self, weights, ARSL_eval=False):
+        """Load pretrained weights into the model and reset the start epoch.
+
+        Does nothing when ``self.is_loaded_weights`` is already true.
+
+        Args:
+            weights: Weights location forwarded to ``load_pretrain_weight``.
+            ARSL_eval (bool): Forwarded unchanged to
+                ``load_pretrain_weight``.
+        """
+        if not self.is_loaded_weights:
+            self.start_epoch = 0
+            load_pretrain_weight(self.model, weights, ARSL_eval)
+            logger.debug("Load weights {} to start training".format(weights))
+
+    def resume_weights(self, weights):
+        """Restore a checkpoint and continue from the epoch it reports.
+
+        The value returned by ``load_weight`` becomes ``self.start_epoch``.
+        The EMA object is passed along only when EMA is enabled.
+
+        Args:
+            weights: Checkpoint location forwarded to ``load_weight``.
+        """
+        ema = self.ema if self.use_ema else None
+        self.start_epoch = load_weight(self.model, weights, self.optimizer,
+                                       ema)
+        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
+
+    def train(self, validate=False):
+        assert self.mode == 'train', "Model not in 'train' mode"
+        Init_mark = False
+        if validate:
+            self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
+                "EvalDataset")()
+
+        model = self.model
+        if self.cfg.get('to_static', False):
+            model = apply_to_static(self.cfg, model)
+        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
+                   (self.cfg.use_gpu or self.cfg.use_mlu) and self._nranks > 1)
+        if sync_bn:
+            model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+        # enable auto mixed precision (AMP) mode
+        if self.use_amp:
+            scaler = paddle.amp.GradScaler(
+                enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu,
+                init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
+        # get distributed model
+        if self.cfg.get('fleet', False):
+            model = fleet.distributed_model(model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            model = paddle.DataParallel(
+                model, find_unused_parameters=find_unused_parameters)
+
+        self.status.update({
+            'epoch_id': self.start_epoch,
+            'step_id': 0,
+            'steps_per_epoch': len(self.loader)
+        })
+
+        self.status['batch_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['data_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['training_status'] = stats.TrainingStats(self.cfg.log_iter)
+
+        profiler_options = self.cfg.get('profiler_options', None)
+
+        self._compose_callback.on_train_begin(self.status)
+
+        use_fused_allreduce_gradients = self.cfg[
+            'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False
+
+        for epoch_id in range(self.start_epoch, self.cfg.epoch):
+            self.status['mode'] = 'train'
+            self.status['epoch_id'] = epoch_id
+            self._compose_callback.on_epoch_begin(self.status)
+            self.loader.dataset.set_epoch(epoch_id)
+            model.train()
+            iter_tic = time.time()
+            for step_id, data in enumerate(self.loader):
+                self.status['data_time'].update(time.time() - iter_tic)
+                self.status['step_id'] = step_id
+                profiler.add_profiler_step(profiler_options)
+                self._compose_callback.on_step_begin(self.status)
+                data['epoch_id'] = epoch_id
+                if self.cfg.get('to_static',
+                                False) and 'image_file' in data.keys():
+                    data.pop('image_file')
+
+                if self.use_amp:
+                    if isinstance(
+                            model, paddle.
+                            DataParallel) and use_fused_allreduce_gradients:
+                        with model.no_sync():
+                            with paddle.amp.auto_cast(
+                                    enable=self.cfg.use_gpu or
+                                    self.cfg.use_npu or self.cfg.use_mlu,
+                                    custom_white_list=self.custom_white_list,
+                                    custom_black_list=self.custom_black_list,
+                                    level=self.amp_level):
+                                # model forward
+                                outputs = model(data)
+                                loss = outputs['loss']
+                            # model backward
+                            scaled_loss = scaler.scale(loss)
+                            scaled_loss.backward()
+                        fused_allreduce_gradients(
+                            list(model.parameters()), None)
+                    else:
+                        with paddle.amp.auto_cast(
+                                enable=self.cfg.use_gpu or self.cfg.use_npu or
+                                self.cfg.use_mlu,
+                                custom_white_list=self.custom_white_list,
+                                custom_black_list=self.custom_black_list,
+                                level=self.amp_level):
+                            # model forward
+                            outputs = model(data)
+                            loss = outputs['loss']
+                        # model backward
+                        scaled_loss = scaler.scale(loss)
+                        scaled_loss.backward()
+                    # in dygraph mode, optimizer.minimize is equal to optimizer.step
+                    scaler.minimize(self.optimizer, scaled_loss)
+                else:
+                    if isinstance(
+                            model, paddle.
+                            DataParallel) and use_fused_allreduce_gradients:
+                        with model.no_sync():
+                            # model forward
+                            outputs = model(data)
+                            loss = outputs['loss']
+                            # model backward
+                            loss.backward()
+                        fused_allreduce_gradients(
+                            list(model.parameters()), None)
+                    else:
+                        # model forward
+                        outputs = model(data)
+                        loss = outputs['loss']
+                        # model backward
+                        loss.backward()
+                    self.optimizer.step()
+                curr_lr = self.optimizer.get_lr()
+                self.lr.step()
+                self.optimizer.clear_grad()
+                self.status['learning_rate'] = curr_lr
+
+                if self._nranks < 2 or self._local_rank == 0:
+                    self.status['training_status'].update(outputs)
+
+                self.status['batch_time'].update(time.time() - iter_tic)
+                self._compose_callback.on_step_end(self.status)
+                if self.use_ema:
+                    self.ema.update()
+                iter_tic = time.time()
+
+            is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \
+                       and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
+            if is_snapshot and self.use_ema:
+                # apply ema weight on model
+                weight = copy.deepcopy(self.model.state_dict())
+                self.model.set_dict(self.ema.apply())
+                self.status['weight'] = weight
+
+            self._compose_callback.on_epoch_end(self.status)
+
+            if validate and is_snapshot:
+                if not hasattr(self, '_eval_loader'):
+                    # build evaluation dataset and loader
+                    self._eval_dataset = self.cfg.EvalDataset
+                    self._eval_batch_sampler = \
+                        paddle.io.BatchSampler(
+                            self._eval_dataset,
+                            batch_size=self.cfg.EvalReader['batch_size'])
+                    # If metric is VOC, need to be set collate_batch=False.
+                    if self.cfg.metric == 'VOC':
+                        self.cfg['EvalReader']['collate_batch'] = False
+                    else:
+                        self._eval_loader = create('EvalReader')(
+                            self._eval_dataset,
+                            self.cfg.worker_num,
+                            batch_sampler=self._eval_batch_sampler)
+                # if validation in training is enabled, metrics should be re-init
+                # Init_mark makes sure this code will only execute once
+                if validate and Init_mark == False:
+                    Init_mark = True
+                    self._init_metrics(validate=validate)
+                    self._reset_metrics()
+
+                with paddle.no_grad():
+                    self.status['save_best_model'] = True
+                    self._eval_with_loader(self._eval_loader)
+
+            if is_snapshot and self.use_ema:
+                # reset original weight
+                self.model.set_dict(weight)
+                self.status.pop('weight')
+
+        self._compose_callback.on_train_end(self.status)
+
+    def _eval_with_loader(self, loader):
+        """Run one evaluation pass over ``loader`` and log the metrics.
+
+        Each batch is pushed through the model (inside the AMP autocast
+        context when ``self.use_amp`` is set) and handed to every metric in
+        ``self._metrics``. ``sample_num`` and ``cost_time`` are written to
+        ``self.status``; the metrics are then accumulated, logged and reset
+        so the method can be called again.
+
+        Args:
+            loader: iterable of batches. A batch is indexed either directly
+                by ``'im_id'`` or, when it is a sequence, through its first
+                element.
+        """
+        start_time = time.time()
+        seen = 0
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+
+        self.model.eval()
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+
+            # forward pass, optionally inside the mixed-precision context
+            if self.use_amp:
+                amp_enabled = (self.cfg.use_gpu or self.cfg.use_npu or
+                               self.cfg.use_mlu)
+                with paddle.amp.auto_cast(
+                        enable=amp_enabled,
+                        custom_white_list=self.custom_white_list,
+                        custom_black_list=self.custom_black_list,
+                        level=self.amp_level):
+                    outs = self.model(data)
+            else:
+                outs = self.model(data)
+
+            # feed the batch and its predictions to every metric
+            for metric in self._metrics:
+                metric.update(data, outs)
+
+            # multi-scale inputs arrive as a sequence sharing the same im_id,
+            # so the first element is enough to count the samples
+            head = data[0] if isinstance(data, typing.Sequence) else data
+            seen += head['im_id'].numpy().shape[0]
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = seen
+        self.status['cost_time'] = time.time() - start_time
+
+        # accumulate the metrics and log them out
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # metrics may be evaluated several times, so clear their state
+        self._reset_metrics()
+
+    def evaluate(self):
+        """Wrap the model for distributed execution and evaluate it.
+
+        With ``fleet`` enabled in the config both the model and the
+        optimizer are wrapped by fleet; otherwise, when more than one rank
+        is in use, the model is wrapped in ``paddle.DataParallel``. The
+        evaluation itself runs over ``self.loader`` with gradients disabled.
+        """
+        use_fleet = self.cfg.get('fleet', False)
+        if use_fleet:
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            # fall back to False when the option is absent from the config
+            if 'find_unused_parameters' in self.cfg:
+                find_unused = self.cfg['find_unused_parameters']
+            else:
+                find_unused = False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused)
+
+        with paddle.no_grad():
+            self._eval_with_loader(self.loader)
+
+    def _eval_with_loader_slice(self,
+                                loader,
+                                slice_size=[640, 640],
+                                overlap_ratio=[0.25, 0.25],
+                                combine_method='nms',
+                                match_threshold=0.6,
+                                match_metric='iou'):
+        """Evaluate on sliced images, fusing slice predictions per image.
+
+        Predictions of every slice are shifted by the slice offset
+        (``data['st_pix']``) and collected until a batch flagged with
+        ``data['is_last'] > 0`` is reached; the collected boxes are then
+        fused and passed to the metrics under the original image id.
+
+        Args:
+            loader: iterable of slice batches.
+            slice_size: not read inside this method.
+            overlap_ratio: not read inside this method.
+            combine_method (str): 'nms' fuses the boxes with
+                ``multiclass_nms``; 'concat' simply concatenates them.
+            match_threshold: forwarded to ``multiclass_nms``.
+            match_metric: forwarded to ``multiclass_nms``.
+
+        Raises:
+            ValueError: if ``combine_method`` is neither 'nms' nor 'concat'.
+        """
+        start_time = time.time()
+        seen = 0
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+        self.model.eval()
+        pending_boxes = []  # boxes of the slices of the image in progress
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+
+            # forward pass, optionally inside the mixed-precision context
+            if self.use_amp:
+                amp_enabled = (self.cfg.use_gpu or self.cfg.use_npu or
+                               self.cfg.use_mlu)
+                with paddle.amp.auto_cast(
+                        enable=amp_enabled,
+                        custom_white_list=self.custom_white_list,
+                        custom_black_list=self.custom_black_list,
+                        level=self.amp_level):
+                    outs = self.model(data)
+            else:
+                outs = self.model(data)
+
+            # move the slice-local boxes by the slice offset
+            offset = data['st_pix']
+            boxes = outs['bbox']
+            boxes[:, 2:4] = boxes[:, 2:4] + offset
+            boxes[:, 4:6] = boxes[:, 4:6] + offset
+            pending_boxes.append(boxes)
+
+            if data['is_last'] > 0:
+                # last slice of the image: fuse everything collected so far
+                if combine_method == 'nms':
+                    kept = multiclass_nms(
+                        np.concatenate(pending_boxes), self.cfg.num_classes,
+                        match_threshold, match_metric)
+                    fused = np.concatenate(kept)
+                elif combine_method == 'concat':
+                    fused = np.concatenate(pending_boxes)
+                else:
+                    raise ValueError(
+                        "Now only support 'nms' or 'concat' to fuse detection results."
+                    )
+                merged_results = {
+                    'bbox': fused,
+                    'im_id': np.array([[0]]),
+                    'bbox_num': np.array([len(fused)]),
+                }
+
+                pending_boxes = []
+                # metrics must see the id of the un-sliced image
+                data['im_id'] = data['ori_im_id']
+                for metric in self._metrics:
+                    metric.update(data, merged_results)
+
+                # multi-scale inputs: all inputs have same im_id
+                head = data[0] if isinstance(data, typing.Sequence) else data
+                seen += head['im_id'].numpy().shape[0]
+
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = seen
+        self.status['cost_time'] = time.time() - start_time
+
+        # accumulate the metrics and log them out
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # metrics may be evaluated several times, so clear their state
+        self._reset_metrics()
+
+    def evaluate_slice(self,
+                       slice_size=[640, 640],
+                       overlap_ratio=[0.25, 0.25],
+                       combine_method='nms',
+                       match_threshold=0.6,
+                       match_metric='iou'):
+        """Evaluate ``self.loader`` in sliced mode with gradients disabled.
+
+        All arguments are forwarded unchanged to
+        ``_eval_with_loader_slice``.
+        """
+        with paddle.no_grad():
+            self._eval_with_loader_slice(
+                self.loader,
+                slice_size=slice_size,
+                overlap_ratio=overlap_ratio,
+                combine_method=combine_method,
+                match_threshold=match_threshold,
+                match_metric=match_metric)
+
+    def slice_predict(self,
+                      images,
+                      slice_size=[640, 640],
+                      overlap_ratio=[0.25, 0.25],
+                      combine_method='nms',
+                      match_threshold=0.6,
+                      match_metric='iou',
+                      draw_threshold=0.5,
+                      output_dir='output',
+                      save_results=False,
+                      visualize=True):
+        """Run sliced inference on ``images`` and optionally save the results.
+
+        The images are registered on ``self.dataset`` as slices, every slice
+        is run through the model, and the predictions of all slices of one
+        image are fused once the batch flagged ``data['is_last'] > 0`` has
+        been processed.
+
+        Args:
+            images: passed to ``self.dataset.set_slice_images``.
+            slice_size: passed to ``self.dataset.set_slice_images``.
+            overlap_ratio: passed to ``self.dataset.set_slice_images``.
+            combine_method (str): 'nms' fuses boxes with ``multiclass_nms``;
+                'concat' concatenates them.
+            match_threshold: forwarded to ``multiclass_nms``.
+            match_metric: forwarded to ``multiclass_nms``.
+            draw_threshold: forwarded to ``visualize_results``.
+            output_dir (str): directory for rendered images and, when
+                ``save_results`` is set, the metrics' output.
+            save_results (bool): build test-mode metrics and feed every
+                fused result to them.
+            visualize (bool): draw the results and save one image per input.
+
+        Returns:
+            None.
+
+        Raises:
+            ValueError: if ``combine_method`` is neither 'nms' nor 'concat'.
+        """
+        # exist_ok avoids the check-then-create race when several processes
+        # target the same output directory
+        os.makedirs(output_dir, exist_ok=True)
+
+        self.dataset.set_slice_images(images, slice_size, overlap_ratio)
+        loader = create('TestReader')(self.dataset, 0)
+        imid2path = self.dataset.get_imid2path()
+
+        def setup_metrics_for_loader():
+            # Build metrics configured for prediction dumping without
+            # leaving any trace on self.mode / self.cfg / self._metrics.
+            # mem
+            metrics = copy.deepcopy(self._metrics)
+            mode = self.mode
+            save_prediction_only = self.cfg[
+                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
+            output_eval = self.cfg[
+                'output_eval'] if 'output_eval' in self.cfg else None
+
+            # modify
+            self.mode = '_test'
+            self.cfg['save_prediction_only'] = True
+            self.cfg['output_eval'] = output_dir
+            self.cfg['imid2path'] = imid2path
+            self._init_metrics()
+
+            # restore
+            self.mode = mode
+            self.cfg.pop('save_prediction_only')
+            if save_prediction_only is not None:
+                self.cfg['save_prediction_only'] = save_prediction_only
+
+            self.cfg.pop('output_eval')
+            if output_eval is not None:
+                self.cfg['output_eval'] = output_eval
+
+            self.cfg.pop('imid2path')
+
+            # hand back the freshly built metrics, put the old ones back
+            _metrics = copy.deepcopy(self._metrics)
+            self._metrics = metrics
+
+            return _metrics
+
+        if save_results:
+            metrics = setup_metrics_for_loader()
+        else:
+            metrics = []
+
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+
+        # Run Infer
+        self.status['mode'] = 'test'
+        self.model.eval()
+
+        results = []  # all images
+        merged_bboxs = []  # single image
+        for step_id, data in enumerate(tqdm(loader)):
+            self.status['step_id'] = step_id
+            # forward
+            with paddle.no_grad():
+                outs = self.model(data)
+
+            outs['bbox'] = outs['bbox'].numpy()  # only in test mode
+            # move the slice-local boxes by the slice offset
+            shift_amount = data['st_pix']
+            outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy()
+            outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy()
+            merged_bboxs.append(outs['bbox'])
+
+            if data['is_last'] > 0:
+                # merge matching predictions
+                merged_results = {'bbox': []}
+                if combine_method == 'nms':
+                    final_boxes = multiclass_nms(
+                        np.concatenate(merged_bboxs), self.cfg.num_classes,
+                        match_threshold, match_metric)
+                    merged_results['bbox'] = np.concatenate(final_boxes)
+                elif combine_method == 'concat':
+                    merged_results['bbox'] = np.concatenate(merged_bboxs)
+                else:
+                    raise ValueError(
+                        "Now only support 'nms' or 'concat' to fuse detection results."
+                    )
+                merged_results['im_id'] = np.array([[0]])
+                merged_results['bbox_num'] = np.array(
+                    [len(merged_results['bbox'])])
+
+                merged_bboxs = []
+                # downstream consumers must see the id of the un-sliced image
+                data['im_id'] = data['ori_im_id']
+
+                for _m in metrics:
+                    _m.update(data, merged_results)
+
+                # copy the image meta data next to the fused predictions
+                for key in ['im_shape', 'scale_factor', 'im_id']:
+                    if isinstance(data, typing.Sequence):
+                        merged_results[key] = data[0][key]
+                    else:
+                        merged_results[key] = data[key]
+                # convert anything exposing .numpy() to its numpy value
+                for key, value in merged_results.items():
+                    if hasattr(value, 'numpy'):
+                        merged_results[key] = value.numpy()
+                results.append(merged_results)
+
+        for _m in metrics:
+            _m.accumulate()
+            _m.reset()
+
+        if visualize:
+            for outs in results:
+                batch_res = get_infer_results(outs, clsid2catid)
+                bbox_num = outs['bbox_num']
+
+                # [start, end) is the slice of batch_res owned by image i
+                start = 0
+                for i, im_id in enumerate(outs['im_id']):
+                    image_path = imid2path[int(im_id)]
+                    image = Image.open(image_path).convert('RGB')
+                    image = ImageOps.exif_transpose(image)
+                    self.status['original_image'] = np.array(image.copy())
+
+                    end = start + bbox_num[i]
+                    bbox_res = batch_res['bbox'][start:end] \
+                            if 'bbox' in batch_res else None
+                    mask_res = batch_res['mask'][start:end] \
+                            if 'mask' in batch_res else None
+                    segm_res = batch_res['segm'][start:end] \
+                            if 'segm' in batch_res else None
+                    keypoint_res = batch_res['keypoint'][start:end] \
+                            if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
+                    image = visualize_results(
+                        image, bbox_res, mask_res, segm_res, keypoint_res,
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
+                    self.status['result_image'] = np.array(image.copy())
+                    if self._compose_callback:
+                        self._compose_callback.on_step_end(self.status)
+                    # save image with detection
+                    save_name = self._get_save_image_name(output_dir,
+                                                          image_path)
+                    logger.info("Detection bbox results save in {}".format(
+                        save_name))
+                    image.save(save_name, quality=95)
+
+                    start = end
+
+    def predict(self,
+                images,
+                draw_threshold=0.5,
+                output_dir='output',
+                save_results=False,
+                visualize=True):
+        """Run inference on ``images`` and optionally save the results.
+
+        Args:
+            images: passed to ``self.dataset.set_images``.
+            draw_threshold: forwarded to ``visualize_results``.
+            output_dir (str): directory for rendered images and, when
+                ``save_results`` is set, the metrics' output.
+            save_results (bool): build test-mode metrics and feed every
+                batch of predictions to them.
+            visualize (bool): draw the results and save one image per input.
+
+        Returns:
+            list: one dict per batch holding the model outputs plus the
+            ``im_shape``, ``scale_factor`` and ``im_id`` entries of the
+            batch, with every value exposing ``.numpy()`` converted.
+        """
+        # exist_ok avoids the check-then-create race when several processes
+        # target the same output directory
+        os.makedirs(output_dir, exist_ok=True)
+
+        self.dataset.set_images(images)
+        loader = create('TestReader')(self.dataset, 0)
+
+        imid2path = self.dataset.get_imid2path()
+
+        def setup_metrics_for_loader():
+            # Build metrics configured for prediction dumping without
+            # leaving any trace on self.mode / self.cfg / self._metrics.
+            # mem
+            metrics = copy.deepcopy(self._metrics)
+            mode = self.mode
+            save_prediction_only = self.cfg[
+                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
+            output_eval = self.cfg[
+                'output_eval'] if 'output_eval' in self.cfg else None
+
+            # modify
+            self.mode = '_test'
+            self.cfg['save_prediction_only'] = True
+            self.cfg['output_eval'] = output_dir
+            self.cfg['imid2path'] = imid2path
+            self._init_metrics()
+
+            # restore
+            self.mode = mode
+            self.cfg.pop('save_prediction_only')
+            if save_prediction_only is not None:
+                self.cfg['save_prediction_only'] = save_prediction_only
+
+            self.cfg.pop('output_eval')
+            if output_eval is not None:
+                self.cfg['output_eval'] = output_eval
+
+            self.cfg.pop('imid2path')
+
+            # hand back the freshly built metrics, put the old ones back
+            _metrics = copy.deepcopy(self._metrics)
+            self._metrics = metrics
+
+            return _metrics
+
+        if save_results:
+            metrics = setup_metrics_for_loader()
+        else:
+            metrics = []
+
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+
+        # Run Infer
+        self.status['mode'] = 'test'
+        self.model.eval()
+
+        results = []
+        for step_id, data in enumerate(tqdm(loader)):
+            self.status['step_id'] = step_id
+            # forward
+            with paddle.no_grad():
+                # a model carrying a `modelTeacher` attribute is run through
+                # that attribute instead of its own forward
+                if hasattr(self.model, 'modelTeacher'):
+                    outs = self.model.modelTeacher(data)
+                else:
+                    outs = self.model(data)
+
+            for _m in metrics:
+                _m.update(data, outs)
+
+            # copy the image meta data next to the predictions
+            for key in ['im_shape', 'scale_factor', 'im_id']:
+                if isinstance(data, typing.Sequence):
+                    outs[key] = data[0][key]
+                else:
+                    outs[key] = data[key]
+            # convert anything exposing .numpy() to its numpy value
+            for key, value in outs.items():
+                if hasattr(value, 'numpy'):
+                    outs[key] = value.numpy()
+            results.append(outs)
+
+        for _m in metrics:
+            _m.accumulate()
+            _m.reset()
+
+        if visualize:
+            for outs in results:
+                batch_res = get_infer_results(outs, clsid2catid)
+                bbox_num = outs['bbox_num']
+
+                # [start, end) is the slice of batch_res owned by image i
+                start = 0
+                for i, im_id in enumerate(outs['im_id']):
+                    image_path = imid2path[int(im_id)]
+                    image = Image.open(image_path).convert('RGB')
+                    image = ImageOps.exif_transpose(image)
+                    self.status['original_image'] = np.array(image.copy())
+
+                    end = start + bbox_num[i]
+                    bbox_res = batch_res['bbox'][start:end] \
+                            if 'bbox' in batch_res else None
+                    mask_res = batch_res['mask'][start:end] \
+                            if 'mask' in batch_res else None
+                    segm_res = batch_res['segm'][start:end] \
+                            if 'segm' in batch_res else None
+                    keypoint_res = batch_res['keypoint'][start:end] \
+                            if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
+                    image = visualize_results(
+                        image, bbox_res, mask_res, segm_res, keypoint_res,
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
+                    self.status['result_image'] = np.array(image.copy())
+                    if self._compose_callback:
+                        self._compose_callback.on_step_end(self.status)
+                    # save image with detection
+                    save_name = self._get_save_image_name(output_dir,
+                                                          image_path)
+                    logger.info("Detection bbox results save in {}".format(
+                        save_name))
+                    image.save(save_name, quality=95)
+
+                    start = end
+        return results
+
+    def _get_save_image_name(self, output_dir, image_path):
+        """
+        Build the output path of a rendered image: the file name of
+        ``image_path`` (stem and extension kept) placed inside ``output_dir``.
+        """
+        base_name = os.path.basename(image_path)
+        stem, suffix = os.path.splitext(base_name)
+        target_stem = os.path.join(output_dir, "{}".format(stem))
+        return target_stem + suffix
+
+    def _get_infer_cfg_and_input_spec(self,
+                                      save_dir,
+                                      prune_input=True,
+                                      kl_quant=False):
+        """Prepare the model for export and build its input specification.
+
+        Switches the model (and any sublayer offering ``convert_to_deploy``)
+        to deploy form, applies the ``export`` options from the config,
+        writes ``infer_cfg.yml`` into ``save_dir`` and assembles the
+        ``InputSpec`` entries for ``image``, ``im_shape`` and
+        ``scale_factor``.
+
+        Args:
+            save_dir (str): directory that receives ``infer_cfg.yml``.
+            prune_input (bool): when True the model is converted with
+                ``paddle.jit.to_static`` and the input spec is pruned
+                against the converted program.
+            kl_quant (bool): not read inside this method.
+
+        Returns:
+            tuple: ``(static_model, pruned_input_spec)``; ``static_model``
+            is None when ``prune_input`` is False.
+        """
+        image_shape = None
+        im_shape = [None, 2]
+        scale_factor = [None, 2]
+        test_reader_name = 'TestReader'
+        # an explicit image_shape may be configured under TestReader.inputs_def
+        if 'inputs_def' in self.cfg[test_reader_name]:
+            inputs_def = self.cfg[test_reader_name]['inputs_def']
+            image_shape = inputs_def.get('image_shape', None)
+        # set image_shape=[None, 3, -1, -1] as default
+        if image_shape is None:
+            image_shape = [None, 3, -1, -1]
+    
+        # a 3-element shape has no leading dimension: prepend None for it;
+        # otherwise reuse the leading dimension for im_shape / scale_factor
+        if len(image_shape) == 3:
+            image_shape = [None] + image_shape
+        else:
+            im_shape = [image_shape[0], 2]
+            scale_factor = [image_shape[0], 2]
+    
+        if hasattr(self.model, 'deploy'):
+            self.model.deploy = True
+    
+        # let every sublayer that supports it switch to its deploy form
+        for layer in self.model.sublayers():
+            if hasattr(layer, 'convert_to_deploy'):
+                layer.convert_to_deploy()
+    
+        if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
+                'export'] and self.cfg['export']['fuse_conv_bn']:
+            self.model = fuse_conv_bn(self.model)
+    
+        # Without an `export` section post_process and nms default to True and
+        # benchmark to False; with a section, a missing key means False.
+        export_post_process = self.cfg['export'].get(
+            'post_process', False) if hasattr(self.cfg, 'export') else True
+        export_nms = self.cfg['export'].get('nms', False) if hasattr(
+            self.cfg, 'export') else True
+        export_benchmark = self.cfg['export'].get(
+            'benchmark', False) if hasattr(self.cfg, 'export') else False
+        # benchmark export turns both post process and nms off
+        if hasattr(self.model, 'export_post_process'):
+            self.model.export_post_process = export_post_process if not export_benchmark else False
+        if hasattr(self.model, 'export_nms'):
+            self.model.export_nms = export_nms if not export_benchmark else False
+        # with post process exported, the leading dimension is reset to None
+        if export_post_process and not export_benchmark:
+            image_shape = [None] + image_shape[1:]
+    
+        # Save infer cfg
+        _dump_infer_config(self.cfg,
+                           os.path.join(save_dir, 'infer_cfg.yml'), image_shape,
+                           self.model)
+    
+        input_spec = [{
+            "image": InputSpec(
+                shape=image_shape, name='image'),
+            "im_shape": InputSpec(
+                shape=im_shape, name='im_shape'),
+            "scale_factor": InputSpec(
+                shape=scale_factor, name='scale_factor')
+        }]
+    
+        if prune_input:
+            static_model = paddle.jit.to_static(
+                self.model, input_spec=input_spec, full_graph=True)
+            # NOTE: dy2st do not pruned program, but jit.save will prune program
+            # input spec, prune input spec here and save with pruned input spec
+            pruned_input_spec = _prune_input_spec(
+                input_spec, static_model.forward.main_program,
+                static_model.forward.outputs)
+        else:
+            # no conversion requested: return the unpruned spec as-is
+            static_model = None
+            pruned_input_spec = input_spec
+    
+        return static_model, pruned_input_spec
+
+    def export(self, output_dir='output_inference'):
+        """Export the model as a static inference model.
+
+        The model is saved with ``paddle.jit.save`` under
+        ``<output_dir>/<config file name without extension>/model``; the
+        inference config is written to the same directory by
+        ``_get_infer_cfg_and_input_spec``.
+
+        Args:
+            output_dir (str): root directory of the exported model.
+        """
+        # auxiliary branches are removed from the model before export
+        for aux_name in ('aux_neck', 'aux_head'):
+            if hasattr(self.model, aux_name):
+                delattr(self.model, aux_name)
+        self.model.eval()
+
+        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
+        save_dir = os.path.join(output_dir, model_name)
+        # exist_ok avoids the check-then-create race when several processes
+        # export into the same directory
+        os.makedirs(save_dir, exist_ok=True)
+
+        static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec(
+            save_dir)
+
+        # dy2st and save model
+        paddle.jit.save(
+            static_model,
+            os.path.join(save_dir, 'model'),
+            input_spec=pruned_input_spec)
+
+        logger.info("Export model and saved in {}".format(save_dir))
--- a/rtdetr_paddle/ppdet/metrics/init.py
+++ b/rtdetr_paddle/ppdet/metrics/init.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import metrics
+
+from .metrics import *
+from .pose3d_metrics import *
+
+from . import mot_metrics
+from .mot_metrics import *
+
+from . import mcmot_metrics
+from .mcmot_metrics import *
+
+# Build __all__ once from all three modules. Assigning it after each import
+# made the last assignment win and silently dropped the mot_metrics names.
+__all__ = metrics.__all__ + mot_metrics.__all__ + mcmot_metrics.__all__
--- a/rtdetr_paddle/ppdet/metrics/coco_utils.py
+++ b/rtdetr_paddle/ppdet/metrics/coco_utils.py
@@ -0,0 +1,188 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import numpy as np
+import itertools
+
+from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
+from ppdet.metrics.map_utils import draw_pr_curve
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+def get_infer_results(outs, catid, bias=0):
+    """
+    Convert raw model outputs into per-task result records at inference time.
+
+    The returned dictionary has one entry per result type found in `outs`
+    ('bbox', 'mask', 'segm', 'keypoint', 'pose3d'). For example, the bbox
+    entry is a list whose elements contain image_id, category_id, bbox and
+    score.
+
+    Note: for 'keypoint' and 'pose3d' outputs, `outs['bbox_num']` is
+    overwritten with the number of converted results.
+    """
+    if outs is None or len(outs) == 0:
+        raise ValueError(
+            'The number of valid detection result if zero. Please use reasonable model and check input data.'
+        )
+
+    image_ids = outs['im_id']
+    results = {}
+
+    if 'bbox' in outs:
+        boxes = outs['bbox']
+        # rows wider than 6 values are handled by the polygon converter
+        is_poly = len(boxes) > 0 and len(boxes[0]) > 6
+        to_records = get_det_poly_res if is_poly else get_det_res
+        results['bbox'] = to_records(
+            boxes, outs['bbox_num'], image_ids, catid, bias=bias)
+
+    if 'mask' in outs:
+        # mask post process
+        results['mask'] = get_seg_res(outs['mask'], outs['bbox'],
+                                      outs['bbox_num'], image_ids, catid)
+
+    if 'segm' in outs:
+        results['segm'] = get_solov2_segm_res(outs, image_ids, catid)
+
+    if 'keypoint' in outs:
+        keypoint_records = get_keypoint_res(outs, image_ids)
+        results['keypoint'] = keypoint_records
+        outs['bbox_num'] = [len(keypoint_records)]
+
+    if 'pose3d' in outs:
+        pose3d_records = get_pose3d_res(outs, image_ids)
+        results['pose3d'] = pose3d_records
+        outs['bbox_num'] = [len(pose3d_records)]
+
+    return results
+
+
+def cocoapi_eval(jsonfile,
+                 style,
+                 coco_gt=None,
+                 anno_file=None,
+                 max_dets=(100, 300, 1000),
+                 classwise=False,
+                 sigmas=None,
+                 use_area=True):
+    """
+    Evaluate a result json file with the COCO API and return its statistics.
+
+    Args:
+        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
+        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`.
+        coco_gt (COCO): Already loaded ground-truth COCO object. When it is
+                 None, one is built from anno_file: coco_gt = COCO(anno_file)
+        anno_file (str): COCO annotations file.
+        max_dets (tuple): COCO evaluation maxDets, applied to the `proposal` style.
+        classwise (bool): Whether per-category AP and draw P-R Curve or not.
+        sigmas (nparray): keypoint labelling sigmas.
+        use_area (bool): If gt annotations (eg. CrowdPose, AIC)
+                         do not have 'area', please set use_area=False.
+
+    Returns:
+        The `stats` attribute of the COCOeval object after summarize().
+    """
+    assert coco_gt is not None or anno_file is not None
+    if style == 'keypoints_crowd':
+        #please install xtcocotools==1.6
+        from xtcocotools.coco import COCO
+        from xtcocotools.cocoeval import COCOeval
+    else:
+        from pycocotools.coco import COCO
+        from pycocotools.cocoeval import COCOeval
+
+    if coco_gt is None:
+        coco_gt = COCO(anno_file)
+    logger.info("Start evaluate...")
+    coco_dt = coco_gt.loadRes(jsonfile)
+    if style == 'proposal':
+        # proposals are evaluated as class-agnostic boxes
+        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
+        coco_eval.params.useCats = 0
+        coco_eval.params.maxDets = list(max_dets)
+    elif style == 'keypoints_crowd':
+        coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)
+    else:
+        coco_eval = COCOeval(coco_gt, coco_dt, style)
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    if classwise:
+        # Compute per-category AP and PR curve
+        try:
+            from terminaltables import AsciiTable
+        except Exception as e:
+            logger.error(
+                'terminaltables not found, plaese install terminaltables. '
+                'for example: `pip install terminaltables`.')
+            raise e
+        precisions = coco_eval.eval['precision']
+        cat_ids = coco_gt.getCatIds()
+        # precision: (iou, recall, cls, area range, max dets)
+        assert len(cat_ids) == precisions.shape[2]
+        pr_curve_dir = style + '_pr_curve'
+        results_per_category = []
+        for idx, catId in enumerate(cat_ids):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            nm = coco_gt.loadCats(catId)[0]
+            precision = precisions[:, :, idx, 0, -1]
+            # entries of -1 are excluded from the mean
+            precision = precision[precision > -1]
+            if precision.size:
+                ap = np.mean(precision)
+            else:
+                ap = float('nan')
+            results_per_category.append(
+                (str(nm["name"]), '{:0.3f}'.format(float(ap))))
+            # Use the last maxDets entry here as well: a hard-coded index 2
+            # fails when the max-dets axis has fewer than three entries.
+            pr_array = precisions[0, :, idx, 0, -1]
+            recall_array = np.arange(0.0, 1.01, 0.01)
+            draw_pr_curve(
+                pr_array,
+                recall_array,
+                out_dir=pr_curve_dir,
+                file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))
+
+        # lay the (category, AP) pairs out in at most three pairs per row
+        num_columns = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        headers = ['category', 'AP'] * (num_columns // 2)
+        results_2d = itertools.zip_longest(
+            * [results_flatten[i::num_columns] for i in range(num_columns)])
+        table_data = [headers]
+        table_data.extend(results_2d)
+        table = AsciiTable(table_data)
+        logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
+        logger.info("per-category PR curve has output to {} folder.".format(
+            pr_curve_dir))
+    # flush coco evaluation result
+    sys.stdout.flush()
+    return coco_eval.stats
+
+
+def json_eval_results(metric, json_directory, dataset):
+    """
+    Run the COCO API evaluation on already existing result files
+    (proposal.json, bbox.json and mask.json). Files that do not exist
+    are only reported in the log.
+    """
+    assert metric == 'COCO'
+    anno_file = dataset.get_anno()
+    json_names = ['proposal.json', 'bbox.json', 'mask.json']
+    eval_styles = ['proposal', 'bbox', 'segm']
+
+    if json_directory:
+        assert os.path.exists(
+            json_directory), "The json directory:{} does not exist".format(
+                json_directory)
+        json_paths = [
+            os.path.join(str(json_directory), name) for name in json_names
+        ]
+    else:
+        # no directory given: look the files up by their bare names
+        json_paths = json_names
+
+    for json_path, eval_style in zip(json_paths, eval_styles):
+        if not os.path.exists(json_path):
+            logger.info("{} not exists!".format(json_path))
+            continue
+        cocoapi_eval(json_path, eval_style, anno_file=anno_file)
--- a/rtdetr_paddle/ppdet/metrics/json_results.py
+++ b/rtdetr_paddle/ppdet/metrics/json_results.py
@@ -0,0 +1,175 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import six
+import numpy as np
+
+
+def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
+    """
+    Turn a flat array of detections into a list of COCO-style result dicts.
+
+    Rows of ``bboxes`` are consumed image by image (``bbox_nums[i]`` rows for
+    the i-th image) and each row is unpacked as
+    ``[label, score, xmin, ymin, xmax, ymax]``. Rows whose label is negative
+    are dropped. The emitted box is ``[xmin, ymin, w, h]`` where ``bias`` is
+    added to both the width and the height.
+
+    Returns:
+        list[dict]: dicts with keys image_id, category_id, bbox and score.
+    """
+    records = []
+    cursor = 0  # position of the next unread row in ``bboxes``
+    for img_idx in range(len(bbox_nums)):
+        img_id = int(image_id[img_idx][0])
+        for _ in range(bbox_nums[img_idx]):
+            label, score, xmin, ymin, xmax, ymax = bboxes[cursor].tolist()
+            cursor += 1
+            label = int(label)
+            if label < 0:
+                # negative labels mark rows that must not be exported
+                continue
+            records.append({
+                'image_id': img_id,
+                'category_id': label_to_cat_id_map[label],
+                'bbox': [xmin, ymin, xmax - xmin + bias, ymax - ymin + bias],
+                'score': score
+            })
+    return records
+
+
+def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
+    """
+    Turn a flat array of quadrilateral detections into COCO-style dicts.
+
+    Rows of ``bboxes`` are consumed image by image (``bbox_nums[i]`` rows for
+    the i-th image) and each row is unpacked as
+    ``[label, score, x1, y1, x2, y2, x3, y3, x4, y4]``. Rows whose label is
+    negative are dropped. The eight corner coordinates are stored unchanged
+    under the ``bbox`` key; ``bias`` is accepted but not used.
+
+    Returns:
+        list[dict]: dicts with keys image_id, category_id, bbox and score.
+    """
+    records = []
+    cursor = 0  # position of the next unread row in ``bboxes``
+    for img_idx in range(len(bbox_nums)):
+        img_id = int(image_id[img_idx][0])
+        for _ in range(bbox_nums[img_idx]):
+            row = bboxes[cursor].tolist()
+            cursor += 1
+            label, score, x1, y1, x2, y2, x3, y3, x4, y4 = row
+            label = int(label)
+            if label < 0:
+                continue
+            records.append({
+                'image_id': img_id,
+                'category_id': label_to_cat_id_map[label],
+                'bbox': [x1, y1, x2, y2, x3, y3, x4, y4],
+                'score': score
+            })
+    return records
+
+
+def strip_mask(mask):
+    """
+    Crop away the ``-1`` padding of a stack of masks.
+
+    The number of padded rows/columns is read from the first column and the
+    first row of the first mask (entries equal to ``-1``); every mask in the
+    stack is then cut to the remaining height and width.
+    """
+    first_row = mask[0, 0, :]
+    first_col = mask[0, :, 0]
+    pad_h = np.count_nonzero(first_col == -1)
+    pad_w = np.count_nonzero(first_row == -1)
+    valid_h = len(first_col) - pad_h
+    valid_w = len(first_row) - pad_w
+    return mask[:, :valid_h, :valid_w]
+
+
+def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
+    """
+    Encode instance masks as RLE and build COCO-style segmentation results.
+
+    ``masks`` and ``bboxes`` are consumed image by image (``mask_nums[i]``
+    entries for the i-th image). The masks of one image are first cropped
+    with ``strip_mask``; score and label are read from columns 1 and 0 of
+    the matching ``bboxes`` row. Entries whose label is ``-1`` are skipped.
+
+    Returns:
+        list[dict]: dicts with keys image_id, category_id, segmentation
+            and score.
+    """
+    import pycocotools.mask as mask_util
+    results = []
+    cursor = 0  # position of the next unread mask / bbox row
+    for img_idx in range(len(mask_nums)):
+        img_id = int(image_id[img_idx][0])
+        count = mask_nums[img_idx]
+        img_masks = strip_mask(masks[cursor:cursor + count])
+        for local_idx in range(count):
+            binary_mask = img_masks[local_idx].astype(np.uint8)
+            det_row = bboxes[cursor]
+            score = float(det_row[1])
+            label = int(det_row[0])
+            cursor += 1
+            if label == -1:
+                continue
+            cat_id = label_to_cat_id_map[label]
+            # encode() works on an (h, w, n) Fortran-ordered uint8 array
+            fortran_mask = np.array(
+                binary_mask[:, :, None], order="F", dtype="uint8")
+            rle = mask_util.encode(fortran_mask)[0]
+            if six.PY3 and 'counts' in rle:
+                # bytes -> str so that the result can be dumped as JSON
+                rle['counts'] = rle['counts'].decode("utf8")
+            results.append({
+                'image_id': img_id,
+                'category_id': cat_id,
+                'segmentation': rle,
+                'score': score
+            })
+    return results
+
+
+def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map):
+    """
+    Build COCO-style segmentation results from SOLOv2 outputs.
+
+    Reads ``results['segm']``, ``results['cate_label']`` and
+    ``results['cate_score']``; every result is attributed to the image id
+    ``image_id[0][0]``.
+
+    Returns:
+        list[dict]|None: dicts with keys image_id, category_id,
+            segmentation and score, or None when there are no masks.
+    """
+    import pycocotools.mask as mask_util
+    # for each batch
+    masks = results['segm'].astype(np.uint8)
+    labels = results['cate_label']
+    scores = results['cate_score']
+    num_masks = masks.shape[0]
+    img_id = int(image_id[0][0])
+    if num_masks == 0 or masks is None:
+        return None
+
+    coco_results = []
+    # for each sample
+    # NOTE(review): the last entry is never exported; presumably it is a
+    # placeholder appended upstream - confirm against the SOLOv2 head.
+    for idx in range(num_masks - 1):
+        cat_id = num_id_to_cat_id_map[int(labels[idx])]
+        score = float(scores[idx])
+        fortran_mask = np.array(masks[idx][:, :, np.newaxis], order='F')
+        rle = mask_util.encode(fortran_mask)[0]
+        rle['counts'] = rle['counts'].decode('utf8')
+        coco_results.append({
+            'image_id': img_id,
+            'category_id': cat_id,
+            'segmentation': rle,
+            'score': score
+        })
+    return coco_results
+
+
+def get_keypoint_res(results, im_id):
+    """
+    Build COCO-style keypoint annotations from ``results['keypoint']``.
+
+    For every image index ``idx`` in ``im_id``, ``results['keypoint'][idx]``
+    is unpacked as ``(kpts, scores)``. Each keypoint array is flattened and
+    read with stride 3 (x at offset 0, y at offset 1); ``bbox`` is the
+    ``[x, y, w, h]`` extent of those coordinates and ``area`` is ``w * h``.
+
+    Returns:
+        list[dict]: dicts with keys image_id, category_id, keypoints,
+            score, area and bbox.
+    """
+    annotations = []
+    predictions = results['keypoint']
+    for batch_idx in range(im_id.shape[0]):
+        img_id = im_id[batch_idx].item()
+        instances, inst_scores = predictions[batch_idx]
+        for joints, inst_score in zip(instances, inst_scores):
+            flat = joints.flatten()
+            record = {
+                'image_id': img_id,
+                'category_id': 1,  # XXX hard code
+                'keypoints': flat.tolist(),
+                'score': float(inst_score)
+            }
+            xs, ys = flat[0::3], flat[1::3]
+            left, right = np.min(xs).item(), np.max(xs).item()
+            top, bottom = np.min(ys).item(), np.max(ys).item()
+            width, height = right - left, bottom - top
+            record['area'] = width * height
+            record['bbox'] = [left, top, width, height]
+            annotations.append(record)
+    return annotations
+
+
+def get_pose3d_res(results, im_id):
+    """
+    Build one annotation per image from ``results['pose3d']``.
+
+    The i-th prediction is paired with the i-th entry of ``im_id``; the
+    category id is fixed to 1 and the score is fixed to 1.0.
+
+    Returns:
+        list[dict]: dicts with keys image_id, category_id, pose3d and score.
+    """
+    predictions = results['pose3d']
+    return [
+        {
+            'image_id': im_id[batch_idx].item(),
+            'category_id': 1,  # XXX hard code
+            'pose3d': predictions[batch_idx].tolist(),
+            'score': float(1.)
+        } for batch_idx in range(im_id.shape[0])
+    ]
--- a/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py
+++ b/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py
@@ -0,0 +1,410 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+import paddle
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from ..modeling.keypoint_utils import oks_nms
+from scipy.io import loadmat, savemat
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['KeyPointTopDownCOCOEval', 'KeyPointTopDownMPIIEval']
+
+
+class KeyPointTopDownCOCOEval(object):
+    """Collect top-down keypoint predictions and evaluate them with COCO.
+
+    refer to
+        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+        Copyright (c) Microsoft, under the MIT License.
+
+    Args:
+        anno_file (str): annotation file loaded with ``COCO``.
+        num_samples (int): number of samples; sizes the result buffers.
+        num_joints (int): number of keypoints per instance.
+        output_eval (str): directory in which ``keypoints_results.json``
+            is written.
+        iou_type (str): stored on the instance. Default 'keypoints'.
+        in_vis_thre (float): only joints scoring above this value take part
+            in the rescoring average. Default 0.2.
+        oks_thre (float): threshold handed to ``oks_nms``. Default 0.9.
+        save_prediction_only (bool): when True the result file is written
+            but no evaluation is run. Default False.
+    """
+
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 iou_type='keypoints',
+                 in_vis_thre=0.2,
+                 oks_thre=0.9,
+                 save_prediction_only=False):
+        super(KeyPointTopDownCOCOEval, self).__init__()
+        self.coco = COCO(anno_file)
+        self.num_samples = num_samples
+        self.num_joints = num_joints
+        self.iou_type = iou_type
+        self.in_vis_thre = in_vis_thre
+        self.oks_thre = oks_thre
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Drop every accumulated prediction and evaluation result."""
+        self.results = {
+            'all_preds': np.zeros(
+                (self.num_samples, self.num_joints, 3), dtype=np.float32),
+            # columns: center x, center y, scale x, scale y, area, score
+            'all_boxes': np.zeros((self.num_samples, 6)),
+            'image_path': []
+        }
+        self.eval_results = {}
+        self.idx = 0
+
+    @staticmethod
+    def _as_numpy(value):
+        """Return ``value.numpy()`` for paddle tensors, ``value`` otherwise."""
+        return value.numpy() if isinstance(value, paddle.Tensor) else value
+
+    def update(self, inputs, outputs):
+        """Store the predictions of one batch.
+
+        Args:
+            inputs (dict): reads 'image' (only its batch size), 'center',
+                'scale', 'score' and 'im_id'; the latter four may be paddle
+                tensors or numpy arrays.
+            outputs (dict): ``outputs['keypoint'][0]`` is unpacked as
+                ``(kpts, _)``.
+        """
+        kpts, _ = outputs['keypoint'][0]
+
+        num_images = inputs['image'].shape[0]
+        rows = slice(self.idx, self.idx + num_images)
+        center = self._as_numpy(inputs['center'])
+        scale = self._as_numpy(inputs['scale'])
+        score = self._as_numpy(inputs['score'])
+
+        self.results['all_preds'][rows, :, 0:3] = kpts[:, :, 0:3]
+        boxes = self.results['all_boxes']
+        boxes[rows, 0:2] = center[:, 0:2]
+        boxes[rows, 2:4] = scale[:, 0:2]
+        boxes[rows, 4] = np.prod(scale * 200, 1)
+        boxes[rows, 5] = np.squeeze(score)
+        self.results['image_path'].extend(self._as_numpy(inputs['im_id']))
+        self.idx += num_images
+
+    def _write_coco_keypoint_results(self, keypoints):
+        """Dump ``keypoints`` (per-image lists of dicts) to ``self.res_file``."""
+        data_pack = {
+            'cat_id': 1,
+            'cls': 'person',
+            'ann_type': 'keypoints',
+            'keypoints': keypoints
+        }
+        results = self._coco_keypoint_results_one_category_kernel(data_pack)
+        if self.output_eval:
+            # exist_ok avoids the check-then-create race between processes
+            os.makedirs(self.output_eval, exist_ok=True)
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+        logger.info(f'The keypoint result is saved to {self.res_file}.')
+        try:
+            with open(self.res_file) as f:
+                json.load(f)
+        except Exception:
+            # Best-effort repair kept from the reference implementation:
+            # force the last line of an unparsable file to close the list.
+            with open(self.res_file, 'r') as f:
+                content = f.readlines()
+            content[-1] = ']'
+            with open(self.res_file, 'w') as f:
+                f.writelines(content)
+
+    def _coco_keypoint_results_one_category_kernel(self, data_pack):
+        """Flatten per-image keypoint dicts into COCO result entries.
+
+        Args:
+            data_pack (dict): reads 'cat_id' and 'keypoints' (a list with
+                one list of instance dicts per image).
+
+        Returns:
+            list[dict]: one entry per instance with keys image_id,
+                category_id, keypoints, score, center and scale.
+        """
+        cat_id = data_pack['cat_id']
+        keypoints = data_pack['keypoints']
+        cat_results = []
+
+        for img_kpts in keypoints:
+            if len(img_kpts) == 0:
+                continue
+
+            # (num_instances, num_joints * 3) after the reshape
+            _key_points = np.array([inst['keypoints'] for inst in img_kpts])
+            _key_points = _key_points.reshape(_key_points.shape[0], -1)
+
+            cat_results.extend({
+                'image_id': inst['image'],
+                'category_id': cat_id,
+                'keypoints': _key_points[k].tolist(),
+                'score': inst['score'],
+                'center': list(inst['center']),
+                'scale': list(inst['scale'])
+            } for k, inst in enumerate(img_kpts))
+
+        return cat_results
+
+    def get_final_results(self, preds, all_boxes, img_path):
+        """Rescore, apply OKS NMS per image and write the result file.
+
+        Args:
+            preds (np.ndarray): keypoints indexed as [sample, joint, 3]; the
+                last entry of the final axis is used as the joint score.
+            all_boxes (np.ndarray): per-sample rows laid out as in ``reset``.
+            img_path (list): per-sample image ids (converted with ``int``).
+        """
+        _kpts = []
+        for idx, kpt in enumerate(preds):
+            _kpts.append({
+                'keypoints': kpt,
+                'center': all_boxes[idx][0:2],
+                'scale': all_boxes[idx][2:4],
+                'area': all_boxes[idx][4],
+                'score': all_boxes[idx][5],
+                'image': int(img_path[idx])
+            })
+        # image x person x (keypoints)
+        kpts = defaultdict(list)
+        for kpt in _kpts:
+            kpts[kpt['image']].append(kpt)
+
+        # rescoring and oks nms
+        num_joints = preds.shape[1]
+        in_vis_thre = self.in_vis_thre
+        oks_thre = self.oks_thre
+        oks_nmsed_kpts = []
+        for img_kpts in kpts.values():
+            for n_p in img_kpts:
+                box_score = n_p['score']
+                kpt_score = 0
+                valid_num = 0
+                for n_jt in range(0, num_joints):
+                    t_s = n_p['keypoints'][n_jt][2]
+                    if t_s > in_vis_thre:
+                        kpt_score = kpt_score + t_s
+                        valid_num = valid_num + 1
+                if valid_num != 0:
+                    kpt_score = kpt_score / valid_num
+                # rescoring: mean visible-joint score times the box score
+                n_p['score'] = kpt_score * box_score
+
+            keep = oks_nms(list(img_kpts), oks_thre)
+
+            if len(keep) == 0:
+                oks_nmsed_kpts.append(img_kpts)
+            else:
+                oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep])
+
+        self._write_coco_keypoint_results(oks_nmsed_kpts)
+
+    def accumulate(self):
+        """Write the result file and, unless disabled, run COCO evaluation."""
+        self.get_final_results(self.results['all_preds'],
+                               self.results['all_boxes'],
+                               self.results['image_path'])
+        if self.save_prediction_only:
+            logger.info(f'The keypoint result is saved to {self.res_file} '
+                        'and do not evaluate the mAP.')
+            return
+        coco_dt = self.coco.loadRes(self.res_file)
+        coco_eval = COCOeval(self.coco, coco_dt, 'keypoints')
+        coco_eval.params.useSegm = None
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        coco_eval.summarize()
+
+        self.eval_results['keypoint'] = list(coco_eval.stats)
+
+    def log(self):
+        """Print the evaluation statistics as a three-line text table."""
+        if self.save_prediction_only:
+            return
+        stats_names = [
+            'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',
+            'AR .75', 'AR (M)', 'AR (L)'
+        ]
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        # one separator cell per header cell
+        print('|---' * len(stats_names) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(value) for value in self.eval_results['keypoint']
+        ]) + ' |')
+
+    def get_results(self):
+        """Return the dict filled by ``accumulate``."""
+        return self.eval_results
+
+
+class KeyPointTopDownMPIIEval(object):
+    """Collect top-down keypoint predictions and compute PCKh on MPII.
+
+    Args:
+        anno_file (str): annotation file; ``mpii_gt_val.mat`` is read from
+            the same directory during evaluation.
+        num_samples (int): accepted for interface parity, not used.
+        num_joints (int): accepted for interface parity, not used.
+        output_eval (str): directory in which ``keypoints_results.json``
+            is written.
+        oks_thre (float): accepted for interface parity, not used.
+        save_prediction_only (bool): when True the result file is written
+            but no evaluation is run. Default False.
+    """
+
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 oks_thre=0.9,
+                 save_prediction_only=False):
+        super(KeyPointTopDownMPIIEval, self).__init__()
+        self.ann_file = anno_file
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Drop every accumulated prediction and evaluation result."""
+        self.results = []
+        self.eval_results = {}
+        self.idx = 0
+
+    @staticmethod
+    def _as_numpy(value):
+        """Return ``value.numpy()`` for paddle tensors, ``value`` otherwise."""
+        return value.numpy() if isinstance(value, paddle.Tensor) else value
+
+    def update(self, inputs, outputs):
+        """Store the predictions of one batch.
+
+        Args:
+            inputs (dict): reads 'image' (only its batch size), 'center',
+                'scale', 'score' (paddle tensors or numpy arrays) and
+                'image_file'.
+            outputs (dict): ``outputs['keypoint'][0]`` is unpacked as
+                ``(kpts, _)``.
+        """
+        kpts, _ = outputs['keypoint'][0]
+
+        num_images = inputs['image'].shape[0]
+        center = self._as_numpy(inputs['center'])
+        scale = self._as_numpy(inputs['scale'])
+        score = self._as_numpy(inputs['score'])
+
+        # columns: center x, center y, scale x, scale y, area, score
+        boxes = np.zeros((num_images, 6))
+        boxes[:, 0:2] = center[:, 0:2]
+        boxes[:, 2:4] = scale[:, 0:2]
+        boxes[:, 4] = np.prod(scale * 200, 1)
+        boxes[:, 5] = np.squeeze(score)
+
+        self.results.append({
+            'preds': kpts[:, :, 0:3],
+            'boxes': boxes,
+            'image_path': inputs['image_file']
+        })
+
+    def accumulate(self):
+        """Write the result file and, unless disabled, run the evaluation."""
+        self._mpii_keypoint_results_save()
+        if self.save_prediction_only:
+            logger.info(f'The keypoint result is saved to {self.res_file} '
+                        'and do not evaluate the mAP.')
+            return
+
+        self.eval_results = self.evaluate(self.results)
+
+    def _flatten_results(self):
+        """Turn the per-batch results into one JSON-friendly dict per sample."""
+        flat = []
+        for res in self.results:
+            # ``res`` is a dict, so its own length is the number of keys;
+            # the number of samples is the length of the prediction array.
+            num_samples = len(res['preds'])
+            flat.extend({
+                'preds': res['preds'][k].tolist(),
+                'boxes': res['boxes'][k].tolist(),
+                'image_path': res['image_path'][k],
+            } for k in range(num_samples))
+        return flat
+
+    def _mpii_keypoint_results_save(self):
+        """Dump every stored sample to ``self.res_file`` as JSON."""
+        results = self._flatten_results()
+        out_dir = os.path.dirname(self.res_file)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+        logger.info(f'The keypoint result is saved to {self.res_file}.')
+
+    def log(self):
+        """Print every evaluation result as ``name : value``."""
+        if self.save_prediction_only:
+            return
+        for item, value in self.eval_results.items():
+            print("{} : {}".format(item, value))
+
+    def get_results(self):
+        """Return the mapping produced by ``accumulate``."""
+        return self.eval_results
+
+    def evaluate(self, outputs, savepath=None):
+        """Evaluate PCKh for MPII dataset. refer to
+        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+        Copyright (c) Microsoft, under the MIT License.
+
+        Args:
+            outputs(list(preds, boxes)):
+
+                * preds (np.ndarray[N,K,3]): The first two dimensions are
+                  coordinates, score is the third dimension of the array.
+                * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]
+                  , scale[1],area, score]
+            savepath (str|None): when given, the shifted predictions are
+                also saved to ``pred.mat`` in this directory.
+
+        Returns:
+            dict: PCKh for each joint
+        """
+
+        per_sample = []
+        for output in outputs:
+            batch_preds = output['preds']
+            for i in range(batch_preds.shape[0]):
+                per_sample.append(batch_preds[i])
+
+        preds = np.stack(per_sample)
+
+        # convert 0-based index to 1-based index,
+        # and get the first two dimensions.
+        preds = preds[..., :2] + 1.0
+
+        if savepath is not None:
+            pred_file = os.path.join(savepath, 'pred.mat')
+            savemat(pred_file, mdict={'preds': preds})
+
+        SC_BIAS = 0.6
+        threshold = 0.5
+
+        gt_file = os.path.join(
+            os.path.dirname(self.ann_file), 'mpii_gt_val.mat')
+        gt_dict = loadmat(gt_file)
+        dataset_joints = gt_dict['dataset_joints']
+        jnt_missing = gt_dict['jnt_missing']
+        pos_gt_src = gt_dict['pos_gt_src']
+        headboxes_src = gt_dict['headboxes_src']
+
+        # [N, K, 2] -> [K, 2, N]
+        pos_pred_src = np.transpose(preds, [1, 2, 0])
+
+        head = np.where(dataset_joints == 'head')[1][0]
+        lsho = np.where(dataset_joints == 'lsho')[1][0]
+        lelb = np.where(dataset_joints == 'lelb')[1][0]
+        lwri = np.where(dataset_joints == 'lwri')[1][0]
+        lhip = np.where(dataset_joints == 'lhip')[1][0]
+        lkne = np.where(dataset_joints == 'lkne')[1][0]
+        lank = np.where(dataset_joints == 'lank')[1][0]
+
+        rsho = np.where(dataset_joints == 'rsho')[1][0]
+        relb = np.where(dataset_joints == 'relb')[1][0]
+        rwri = np.where(dataset_joints == 'rwri')[1][0]
+        rkne = np.where(dataset_joints == 'rkne')[1][0]
+        rank = np.where(dataset_joints == 'rank')[1][0]
+        rhip = np.where(dataset_joints == 'rhip')[1][0]
+
+        jnt_visible = 1 - jnt_missing
+        uv_error = pos_pred_src - pos_gt_src
+        uv_err = np.linalg.norm(uv_error, axis=1)
+        # per-sample normalisation length derived from the head boxes
+        headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :]
+        headsizes = np.linalg.norm(headsizes, axis=0)
+        headsizes *= SC_BIAS
+        scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32)
+        scaled_uv_err = uv_err / scale
+        scaled_uv_err = scaled_uv_err * jnt_visible
+        jnt_count = np.sum(jnt_visible, axis=1)
+        less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible
+        PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count
+
+        # PCKh at every threshold from 0 to 0.5 in steps of 0.01
+        rng = np.arange(0, 0.5 + 0.01, 0.01)
+        pckAll = np.zeros((len(rng), 16), dtype=np.float32)
+
+        for r, thr in enumerate(rng):
+            less_than_threshold = (scaled_uv_err <= thr) * jnt_visible
+            pckAll[r, :] = 100. * np.sum(less_than_threshold,
+                                         axis=1) / jnt_count
+
+        # joints 6 and 7 are masked out of the weighted averages below
+        PCKh = np.ma.array(PCKh, mask=False)
+        PCKh.mask[6:8] = True
+
+        jnt_count = np.ma.array(jnt_count, mask=False)
+        jnt_count.mask[6:8] = True
+        jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64)
+
+        # NOTE(review): row 11 of pckAll corresponds to rng[11] (0.11), not
+        # 0.10; kept as in the reference implementation - confirm intent.
+        name_value = [  #noqa
+            ('Head', PCKh[head]),
+            ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])),
+            ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])),
+            ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])),
+            ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])),
+            ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])),
+            ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])),
+            ('PCKh', np.sum(PCKh * jnt_ratio)),
+            ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio))
+        ]
+        name_value = OrderedDict(name_value)
+
+        return name_value
+
+    def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
+        """sort kpts and remove the repeated ones."""
+        kpts = sorted(kpts, key=lambda x: x[key])
+        num = len(kpts)
+        # walk backwards so that deleting an item never shifts unseen ones
+        for i in range(num - 1, 0, -1):
+            if kpts[i][key] == kpts[i - 1][key]:
+                del kpts[i]
+
+        return kpts
--- a/rtdetr_paddle/ppdet/metrics/map_utils.py
+++ b/rtdetr_paddle/ppdet/metrics/map_utils.py
@@ -0,0 +1,397 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os
+import sys
+import numpy as np
+import itertools
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'draw_pr_curve',
+    'bbox_area',
+    'jaccard_overlap',
+    'prune_zero_padding',
+    'DetectionMAP',
+    'ap_per_class',
+    'compute_ap',
+]
+
+
+def draw_pr_curve(precision,
+                  recall,
+                  iou=0.5,
+                  out_dir='pr_curve',
+                  file_name='precision_recall_curve.jpg'):
+    """
+    Plot a precision/recall curve and save it as an image.
+
+    Args:
+        precision: precision values (y axis).
+        recall: recall values (x axis).
+        iou (float): IoU threshold shown in the plot title. Default 0.5.
+        out_dir (str): output directory, created when missing.
+        file_name (str): name of the image file inside ``out_dir``.
+
+    Raises:
+        Exception: re-raised unchanged when matplotlib cannot be imported.
+    """
+    # exist_ok avoids the check-then-create race between processes
+    os.makedirs(out_dir, exist_ok=True)
+    output_path = os.path.join(out_dir, file_name)
+    try:
+        import matplotlib.pyplot as plt
+    except Exception:
+        logger.error('Matplotlib not found, please install matplotlib. '
+                     'for example: `pip install matplotlib`.')
+        raise
+    plt.cla()
+    plt.figure('P-R Curve')
+    plt.title('Precision/Recall Curve(IoU={})'.format(iou))
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.grid(True)
+    plt.plot(recall, precision)
+    plt.savefig(output_path)
+
+
+def bbox_area(bbox, is_bbox_normalized):
+    """
+    Return the area of a ``[xmin, ymin, xmax, ymax]`` box.
+
+    When ``is_bbox_normalized`` is False, 1 is added to both the width and
+    the height before they are multiplied; when it is True nothing is added.
+    """
+    pad = 1. - float(is_bbox_normalized)
+    xmin, ymin, xmax, ymax = bbox[0], bbox[1], bbox[2], bbox[3]
+    return (xmax - xmin + pad) * (ymax - ymin + pad)
+
+
+def jaccard_overlap(pred, gt, is_bbox_normalized=False):
+    """
+    Return intersection over union of two ``[xmin, ymin, xmax, ymax]`` boxes.
+
+    Boxes that do not overlap, or that only touch along an edge, give 0.
+    All areas are computed with ``bbox_area`` using ``is_bbox_normalized``.
+    """
+    disjoint = (pred[0] >= gt[2] or pred[2] <= gt[0] or
+                pred[1] >= gt[3] or pred[3] <= gt[1])
+    if disjoint:
+        return 0.
+    intersection = [
+        max(pred[0], gt[0]),
+        max(pred[1], gt[1]),
+        min(pred[2], gt[2]),
+        min(pred[3], gt[3]),
+    ]
+    inter_size = bbox_area(intersection, is_bbox_normalized)
+    pred_size = bbox_area(pred, is_bbox_normalized)
+    gt_size = bbox_area(gt, is_bbox_normalized)
+    union_size = pred_size + gt_size - inter_size
+    return float(inter_size) / union_size
+
+
+def prune_zero_padding(gt_box, gt_label, difficult=None):
+    """
+    Cut ground-truth arrays at the first all-zero box.
+
+    Returns:
+        tuple: ``(gt_box, gt_label, difficult)`` truncated to the entries
+            before the first box whose values are all zero; ``difficult``
+            stays None when it was not given.
+    """
+    total = len(gt_box)
+    # index of the first padded (all-zero) box, or ``total`` if there is none
+    valid_cnt = next(
+        (idx for idx in range(total) if (gt_box[idx] == 0).all()), total)
+    kept_difficult = None
+    if difficult is not None:
+        kept_difficult = difficult[:valid_cnt]
+    return (gt_box[:valid_cnt], gt_label[:valid_cnt], kept_difficult)
+
+
+class DetectionMAP(object):
+    """
+    Calculate detection mean average precision.
+    Currently support two types: 11point and integral
+
+    Args:
+        class_num (int): The class number.
+        overlap_thresh (float): The threshold of overlap
+            ratio between prediction bounding box and 
+            ground truth bounding box for deciding 
+            true/false positive. Default 0.5.
+        map_type (str): Calculation method of mean average
+            precision, currently support '11point' and
+            'integral'. Default '11point'.
+        is_bbox_normalized (bool): Whether bounding boxes
+            is normalized to range[0, 1]. Default False.
+        evaluate_difficult (bool): Whether to evaluate
+            difficult bounding boxes. Default False.
+        catid2name (dict): Mapping between category id and category name.
+        classwise (bool): Whether per-category AP and draw
+            P-R Curve or not.
+    """
+
+    def __init__(self,
+                 class_num,
+                 overlap_thresh=0.5,
+                 map_type='11point',
+                 is_bbox_normalized=False,
+                 evaluate_difficult=False,
+                 catid2name=None,
+                 classwise=False):
+        self.class_num = class_num
+        self.overlap_thresh = overlap_thresh
+        assert map_type in ['11point', 'integral'], \
+                "map_type currently only support '11point' "\
+                "and 'integral'"
+        self.map_type = map_type
+        self.is_bbox_normalized = is_bbox_normalized
+        self.evaluate_difficult = evaluate_difficult
+        self.classwise = classwise
+        self.classes = []
+        for cname in catid2name.values():
+            self.classes.append(cname)
+        self.reset()
+
+    def update(self, bbox, score, label, gt_box, gt_label, difficult=None):
+        """
+        Update metric statics from given prediction and ground
+        truth infomations.
+        """
+        if difficult is None:
+            difficult = np.zeros_like(gt_label)
+
+        # record class gt count
+        for gtl, diff in zip(gt_label, difficult):
+            if self.evaluate_difficult or int(diff) == 0:
+                self.class_gt_counts[int(np.array(gtl))] += 1
+
+        # record class score positive
+        visited = [False] * len(gt_label)
+        for b, s, l in zip(bbox, score, label):
+            pred = b.tolist() if isinstance(b, np.ndarray) else b
+            max_idx = -1
+            max_overlap = -1.0
+            for i, gl in enumerate(gt_label):
+                if int(gl) == int(l):
+                    if len(gt_box[i]) == 8:
+                        overlap = calc_rbox_iou(pred, gt_box[i])
+                    else:
+                        overlap = jaccard_overlap(pred, gt_box[i],
+                                                  self.is_bbox_normalized)
+                    if overlap > max_overlap:
+                        max_overlap = overlap
+                        max_idx = i
+
+            if max_overlap > self.overlap_thresh:
+                if self.evaluate_difficult or \
+                        int(np.array(difficult[max_idx])) == 0:
+                    if not visited[max_idx]:
+                        self.class_score_poss[int(l)].append([s, 1.0])
+                        visited[max_idx] = True
+                    else:
+                        self.class_score_poss[int(l)].append([s, 0.0])
+            else:
+                self.class_score_poss[int(l)].append([s, 0.0])
+
+    def reset(self):
+        """
+        Reset metric statics
+        """
+        self.class_score_poss = [[] for _ in range(self.class_num)]
+        self.class_gt_counts = [0] * self.class_num
+        self.mAP = 0.0
+
+    def accumulate(self):
+        """
+        Accumulate metric results and calculate mAP
+        """
+        mAP = 0.
+        valid_cnt = 0
+        eval_results = []
+        for score_pos, count in zip(self.class_score_poss,
+                                    self.class_gt_counts):
+            if count == 0: continue
+            if len(score_pos) == 0:
+                valid_cnt += 1
+                continue
+
+            accum_tp_list, accum_fp_list = \
+                    self._get_tp_fp_accum(score_pos)
+            precision = []
+            recall = []
+            for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):
+                precision.append(float(ac_tp) / (ac_tp + ac_fp))
+                recall.append(float(ac_tp) / count)
+
+            one_class_ap = 0.0
+            if self.map_type == '11point':
+                max_precisions = [0.] * 11
+                start_idx = len(precision) - 1
+                for j in range(10, -1, -1):
+                    for i in range(start_idx, -1, -1):
+                        if recall[i] < float(j) / 10.:
+                            start_idx = i
+                            if j > 0:
+                                max_precisions[j - 1] = max_precisions[j]
+                                break
+                        else:
+                            if max_precisions[j] < precision[i]:
+                                max_precisions[j] = precision[i]
+                one_class_ap = sum(max_precisions) / 11.
+                mAP += one_class_ap
+                valid_cnt += 1
+            elif self.map_type == 'integral':
+                import math
+                prev_recall = 0.
+                for i in range(len(precision)):
+                    recall_gap = math.fabs(recall[i] - prev_recall)
+                    if recall_gap > 1e-6:
+                        one_class_ap += precision[i] * recall_gap
+                        prev_recall = recall[i]
+                mAP += one_class_ap
+                valid_cnt += 1
+            else:
+                logger.error("Unspported mAP type {}".format(self.map_type))
+                sys.exit(1)
+            eval_results.append({
+                'class': self.classes[valid_cnt - 1],
+                'ap': one_class_ap,
+                'precision': precision,
+                'recall': recall,
+            })
+        self.eval_results = eval_results
+        self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP
+
+    def get_map(self):
+        """
+        Return the stored mAP value (``self.mAP``).
+
+        An error is logged when ``self.mAP`` is still None. When
+        ``self.classwise`` is truthy, a per-category AP table is logged and
+        ``draw_pr_curve`` is called for every entry of ``self.eval_results``
+        with ``out_dir='voc_pr_curve'``.
+        """
+        if self.mAP is None:
+            logger.error("mAP is not calculated.")
+        if not self.classwise:
+            return self.mAP
+
+        # terminaltables is imported lazily: only this report needs it
+        try:
+            from terminaltables import AsciiTable
+        except Exception:
+            logger.error(
+                'terminaltables not found, plaese install terminaltables. '
+                'for example: `pip install terminaltables`.')
+            raise
+
+        # flat sequence: category, AP, category, AP, ...
+        cells = []
+        for entry in self.eval_results:
+            category = entry['class']
+            cells.append(str(category))
+            cells.append('{:0.3f}'.format(float(entry['ap'])))
+            draw_pr_curve(
+                entry['precision'],
+                entry['recall'],
+                out_dir='voc_pr_curve',
+                file_name='{}_precision_recall_curve.jpg'.format(category))
+
+        # at most three (category, AP) pairs per table row
+        width = min(6, len(cells))
+        header = ['category', 'AP'] * (width // 2)
+        columns = [cells[offset::width] for offset in range(width)]
+        body = list(itertools.zip_longest(*columns))
+        table = AsciiTable([header] + body)
+        logger.info('Per-category of VOC AP: \n{}'.format(table.table))
+        logger.info(
+            "per-category PR curve has output to voc_pr_curve folder.")
+        return self.mAP
+
+    def _get_tp_fp_accum(self, score_pos_list):
+        """
+        Build cumulative true/false positive counts from [score, pos]
+        records, visiting the records in order of decreasing score.
+
+        Returns two lists with one entry per record: the running count of
+        positive records and the running count of non-positive records.
+        """
+        ranked = sorted(score_pos_list, key=lambda rec: rec[0], reverse=True)
+        tp_total, fp_total = 0, 0
+        tp_running, fp_running = [], []
+        for _, pos in ranked:
+            hit = int(pos)
+            tp_total += hit
+            fp_total += 1 - hit
+            tp_running.append(tp_total)
+            fp_running.append(fp_total)
+        return tp_running, fp_running
+
+
+def ap_per_class(tp, conf, pred_cls, target_cls):
+    """
+    Computes the average precision, given the recall and precision curves.
+    Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.
+
+    Args:
+        tp (list): True positives.
+        conf (list): Objectness value from 0-1.
+        pred_cls (list): Predicted object classes.
+        target_cls (list): Target object classes.
+
+    Returns:
+        tuple: ``(ap, classes, recall, precision)``. ``classes`` holds every
+        class found in ``pred_cls`` or ``target_cls`` as int32; the other
+        three arrays hold one value per class. A class that has predictions
+        but no ground truth (or the other way round) gets 0 for all three.
+    """
+    tp = np.array(tp)
+    conf = np.array(conf)
+    pred_cls = np.array(pred_cls)
+    target_cls = np.array(target_cls)
+
+    # Sort by objectness, highest first
+    order = np.argsort(-conf)
+    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]
+
+    # Find unique classes
+    unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))
+
+    # Create Precision-Recall curve and compute AP for each class
+    ap, p, r = [], [], []
+    for c in unique_classes:
+        mask = pred_cls == c
+        # vectorised counts instead of the builtin sum(), which walks the
+        # boolean array element by element in Python
+        n_gt = int((target_cls == c).sum())  # Number of ground truth objects
+        n_p = int(mask.sum())  # Number of predicted objects
+
+        if n_p == 0 and n_gt == 0:
+            continue
+        if n_p == 0 or n_gt == 0:
+            ap.append(0)
+            r.append(0)
+            p.append(0)
+            continue
+
+        # Accumulate FPs and TPs
+        fpc = np.cumsum(1 - tp[mask])
+        tpc = np.cumsum(tp[mask])
+
+        # Recall (the epsilon guards the division)
+        recall_curve = tpc / (n_gt + 1e-16)
+        r.append(tpc[-1] / (n_gt + 1e-16))
+
+        # Precision
+        precision_curve = tpc / (tpc + fpc)
+        p.append(tpc[-1] / (tpc[-1] + fpc[-1]))
+
+        # AP from recall-precision curve
+        ap.append(compute_ap(recall_curve, precision_curve))
+
+    return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(
+        p)
+
+
+def compute_ap(recall, precision):
+    """
+    Computes the average precision, given the recall and precision curves.
+    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
+
+    Args:
+        recall (list): The recall curve.
+        precision (list): The precision curve.
+
+    Returns:
+        The average precision as computed in py-faster-rcnn.
+    """
+    # pad both curves with sentinel points: recall 0 and 1, precision 0
+    rec = np.concatenate(([0.], recall, [1.]))
+    pre = np.concatenate(([0.], precision, [0.]))
+
+    # precision envelope: sweep right to left so each point carries the
+    # maximum precision found at or after it
+    for k in reversed(range(1, pre.size)):
+        pre[k - 1] = np.maximum(pre[k - 1], pre[k])
+
+    # only positions where recall changes contribute area
+    steps = np.where(rec[1:] != rec[:-1])[0]
+
+    # sum (\Delta recall) * prec
+    widths = rec[steps + 1] - rec[steps]
+    return np.sum(widths * pre[steps + 1])
--- a/rtdetr_paddle/ppdet/metrics/mcmot_metrics.py
+++ b/rtdetr_paddle/ppdet/metrics/mcmot_metrics.py
@@ -0,0 +1,473 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import copy
+import sys
+import math
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+
+from .metrics import Metric
+try:
+    import motmetrics as mm
+    from motmetrics.math_util import quiet_divide
+    metrics = mm.metrics.motchallenge_metrics
+    mh = mm.metrics.create()
+except:
+    print(
+        'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
+    )
+    pass
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['MCMOTEvaluator', 'MCMOTMetric']
+
+# Metric names requested from MCMOTEvaluator.get_summary by
+# parse_accs_metrics.
+METRICS_LIST = [
+    'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend',
+    'num_migrate', 'num_false_positives', 'num_misses', 'num_detections',
+    'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked',
+    'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota',
+    'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1'
+]
+
+# Metric name -> column label; passed as `namemap` to
+# mm.io.render_summary when a summary is printed.
+NAME_MAP = {
+    'num_frames': 'num_frames',
+    'num_matches': 'num_matches',
+    'num_switches': 'IDs',
+    'num_transfer': 'IDt',
+    'num_ascend': 'IDa',
+    'num_migrate': 'IDm',
+    'num_false_positives': 'FP',
+    'num_misses': 'FN',
+    'num_detections': 'num_detections',
+    'num_objects': 'num_objects',
+    'num_predictions': 'num_predictions',
+    'num_unique_objects': 'GT',
+    'mostly_tracked': 'MT',
+    'partially_tracked': 'partially_tracked',
+    'mostly_lost': 'ML',
+    'num_fragmentations': 'FM',
+    'motp': 'MOTP',
+    'mota': 'MOTA',
+    'precision': 'Prcn',
+    'recall': 'Rcll',
+    'idfp': 'idfp',
+    'idfn': 'idfn',
+    'idtp': 'idtp',
+    'idp': 'IDP',
+    'idr': 'IDR',
+    'idf1': 'IDF1'
+}
+
+
+def parse_accs_metrics(seq_acc, index_name, verbose=False):
+    """
+    Parse the evaluation indicators of multiple MOTAccumulator
+
+    Args:
+        seq_acc (list): Accumulators handed to MCMOTEvaluator.get_summary.
+        index_name (list): Row names for the accumulators.
+        verbose (bool): Print the rendered summary when True.
+
+    Returns:
+        The summary produced by MCMOTEvaluator.get_summary for METRICS_LIST,
+        with the 'OVERALL' motp replaced by a detection-weighted value.
+    """
+    metrics_host = mm.metrics.create()
+    summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST)
+
+    # NOTE(review): the sum below runs over every row of the summary, which
+    # appears to include the 'OVERALL' row itself - confirm this is intended.
+    weighted_motp = (summary['motp'] * summary['num_detections']).sum()
+    overall_detections = summary.loc['OVERALL', 'num_detections']
+    summary.loc['OVERALL', 'motp'] = weighted_motp / overall_detections
+
+    if verbose:
+        print(
+            mm.io.render_summary(
+                summary, formatters=metrics_host.formatters, namemap=NAME_MAP))
+
+    return summary
+
+
+def seqs_overall_metrics(summary_df, verbose=False):
+    """
+    Calculate overall metrics for multiple sequences
+
+    Count-like columns are summed over the rows of `summary_df`; ratio
+    columns are recomputed by the matching ``<name>_overall`` function of
+    MCMOTMetricOverall. The result is a copy of `summary_df` with one extra
+    row labelled 'overall_calc'.
+
+    Args:
+        summary_df (pandas.DataFrame): One row per sequence.
+        verbose (bool): Print the rendered table when True.
+    """
+    summed_columns = [
+        'num_frames', 'num_matches', 'num_switches', 'num_transfer',
+        'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses',
+        'num_detections', 'num_objects', 'num_predictions',
+        'num_unique_objects', 'mostly_tracked', 'partially_tracked',
+        'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp'
+    ]
+    derived_columns = [
+        'motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1'
+    ]
+    calc_df = summary_df.copy()
+
+    overall = {name: calc_df[name].sum() for name in summed_columns}
+    for name in derived_columns:
+        # each function reads the totals gathered in `overall` so far
+        compute = getattr(MCMOTMetricOverall, name + '_overall')
+        overall[name] = compute(calc_df, overall)
+
+    overall_row = pd.DataFrame(overall, index=['overall_calc'])
+    calc_df = pd.concat([calc_df, overall_row])
+
+    if verbose:
+        metrics_host = mm.metrics.create()
+        print(
+            mm.io.render_summary(
+                calc_df, formatters=metrics_host.formatters, namemap=NAME_MAP))
+
+    return calc_df
+
+
+class MCMOTMetricOverall(object):
+    """
+    Overall versions of the ratio metrics, computed from summed counts.
+
+    Every ``<metric>_overall`` function takes ``(summary_df, overall_dic)``
+    so that `seqs_overall_metrics` can look it up by name with ``getattr``
+    and call it uniformly. They are static methods: none of them needs an
+    instance, and the decorator keeps them callable through the class as
+    well as through an instance.
+
+    Args (shared by all functions):
+        summary_df (pandas.DataFrame): Per-sequence summary rows; only
+            `motp_overall` reads it.
+        overall_dic (dict): Column name -> value summed over the rows.
+    """
+
+    @staticmethod
+    def motp_overall(summary_df, overall_dic):
+        """Mean of the per-row 'motp' weighted by 'num_detections'."""
+        weighted = (summary_df['motp'] * summary_df['num_detections']).sum()
+        return quiet_divide(weighted, overall_dic['num_detections'])
+
+    @staticmethod
+    def mota_overall(summary_df, overall_dic):
+        """1 - (misses + switches + false positives) / objects."""
+        errors = (overall_dic['num_misses'] + overall_dic['num_switches'] +
+                  overall_dic['num_false_positives'])
+        return 1. - quiet_divide(errors, overall_dic['num_objects'])
+
+    @staticmethod
+    def precision_overall(summary_df, overall_dic):
+        """detections / (false positives + detections)."""
+        return quiet_divide(overall_dic['num_detections'], (
+            overall_dic['num_false_positives'] + overall_dic['num_detections']))
+
+    @staticmethod
+    def recall_overall(summary_df, overall_dic):
+        """detections / objects."""
+        return quiet_divide(overall_dic['num_detections'],
+                            overall_dic['num_objects'])
+
+    @staticmethod
+    def idp_overall(summary_df, overall_dic):
+        """idtp / (idtp + idfp)."""
+        return quiet_divide(overall_dic['idtp'],
+                            (overall_dic['idtp'] + overall_dic['idfp']))
+
+    @staticmethod
+    def idr_overall(summary_df, overall_dic):
+        """idtp / (idtp + idfn)."""
+        return quiet_divide(overall_dic['idtp'],
+                            (overall_dic['idtp'] + overall_dic['idfn']))
+
+    @staticmethod
+    def idf1_overall(summary_df, overall_dic):
+        """2 * idtp / (objects + predictions)."""
+        return quiet_divide(2. * overall_dic['idtp'], (
+            overall_dic['num_objects'] + overall_dic['num_predictions']))
+
+
+def read_mcmot_results_union(filename, is_gt, is_ignore):
+    """
+    Read an MCMOT result or ground-truth file with all classes merged into
+    one per-frame dict.
+
+    Each comma-separated row is read as
+    ``frame, track_id, <4 box values (tlwh)>, score, class_id``. Track ids
+    are shifted class by class so that ids of different classes never
+    collide. Rows whose frame id is smaller than 1 are skipped.
+
+    Args:
+        filename (str): Path of the text file.
+        is_gt (bool): Read the file as ground truth: rows with class id 0
+            are dropped, the remaining class ids are decreased by one and
+            every score is set to 1.
+        is_ignore (bool): When True nothing is read and an empty dict is
+            returned.
+
+    Returns:
+        dict: ``{frame_id: [(tlwh, track_id, class_id, score), ...]}``. It
+        is empty when the file is missing, has no rows or has fewer than
+        8 columns.
+    """
+    results_dict = dict()
+    if is_ignore or not os.path.isfile(filename):
+        return results_dict
+
+    # ndmin=2 keeps a file with a single row two-dimensional
+    all_result = np.loadtxt(filename, delimiter=',', ndmin=2)
+    # column 7 (class id) is read below, so 8 columns are required
+    if all_result.shape[0] == 0 or all_result.shape[1] < 8:
+        return results_dict
+
+    if is_gt:
+        # only for test use
+        all_result = all_result[all_result[:, 7] != 0]
+        all_result[:, 7] = all_result[:, 7] - 1
+        if all_result.shape[0] == 0:
+            return results_dict
+
+    last_max_id = 0
+    result_cls_list = []
+    for cls in np.unique(all_result[:, 7]):
+        result_cls_split = all_result[all_result[:, 7] == cls]
+        # make sure track id different between every category
+        result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id
+        last_max_id = result_cls_split[:, 1].max() + 1
+        result_cls_list.append(result_cls_split)
+
+    for row in np.concatenate(result_cls_list):
+        fid = int(row[0])
+        if fid < 1:
+            continue
+
+        score = 1 if is_gt else float(row[6])
+        tlwh = tuple(map(float, row[2:6]))
+        target_id = int(row[1])
+        cls = int(row[7])
+
+        results_dict.setdefault(fid, list()).append(
+            (tlwh, target_id, cls, score))
+
+    return results_dict
+
+
+def read_mcmot_results(filename, is_gt, is_ignore):
+    """
+    Read an MCMOT result or ground-truth file grouped by class id.
+
+    Each comma-separated line is read as
+    ``frame, track_id, <4 box values (tlwh)>, score, class_id``. Lines with
+    fewer than 8 fields or a frame id smaller than 1 are skipped.
+
+    Args:
+        filename (str): Path of the text file.
+        is_gt (bool): Read the file as ground truth: the class id is
+            decreased by one and the score is set to 1.
+        is_ignore (bool): Not used by this reader; kept so both readers
+            share one call signature.
+
+    Returns:
+        dict: ``{class_id: {frame_id: [(tlwh, track_id, score), ...]}}``,
+        empty when the file does not exist.
+    """
+    results_dict = dict()
+    if not os.path.isfile(filename):
+        return results_dict
+
+    with open(filename, 'r') as f:
+        for line in f:
+            linelist = line.strip().split(',')
+            # field 7 (class id) is read below, so 8 fields are required
+            if len(linelist) < 8:
+                continue
+            fid = int(linelist[0])
+            if fid < 1:
+                continue
+            cid = int(linelist[7])
+            if is_gt:
+                score = 1
+                # only for test use
+                cid -= 1
+            else:
+                score = float(linelist[6])
+
+            tlwh = tuple(map(float, linelist[2:6]))
+            target_id = int(linelist[1])
+            frames = results_dict.setdefault(cid, dict())
+            frames.setdefault(fid, list()).append((tlwh, target_id, score))
+    return results_dict
+
+
+def read_results(filename,
+                 data_type,
+                 is_gt=False,
+                 is_ignore=False,
+                 multi_class=False,
+                 union=False):
+    """
+    Pick the MCMOT reader that matches the arguments and run it on
+    `filename`.
+
+    Args:
+        filename (str): File to read.
+        data_type (str): 'mcmot' or 'lab'.
+        is_gt (bool): Forwarded to the reader.
+        is_ignore (bool): Forwarded to the reader.
+        multi_class (bool): Must be truthy; MCMOT data carries class ids.
+        union (bool): Truthy selects `read_mcmot_results_union` (all
+            categories evaluated together, track ids unique across
+            categories); otherwise `read_mcmot_results` (categories
+            evaluated separately).
+
+    Raises:
+        ValueError: For an unknown `data_type` or a falsy `multi_class`.
+    """
+    if data_type not in ('mcmot', 'lab'):
+        raise ValueError('Unknown data type: {}'.format(data_type))
+    if not multi_class:
+        raise ValueError('multi_class: {}, MCMOT should have cls_id.'.
+                         format(multi_class))
+
+    reader = read_mcmot_results_union if union else read_mcmot_results
+    return reader(filename, is_gt, is_ignore)
+
+
+def unzip_objs(objs):
+    """
+    Split a list of ``(tlwh, id, score)`` records into three parallel
+    sequences. The boxes come back as a float array reshaped to (-1, 4);
+    ids and scores are returned as produced by ``zip`` (tuples), or as
+    empty lists when `objs` is empty.
+    """
+    tlwhs, ids, scores = zip(*objs) if len(objs) > 0 else ([], [], [])
+    boxes = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
+    return boxes, ids, scores
+
+
+def unzip_objs_cls(objs):
+    """
+    Split a list of ``(tlwh, id, cls, score)`` records into four NumPy
+    arrays. The boxes are a float array reshaped to (-1, 4); ids, classes
+    and scores are converted with ``np.array``.
+    """
+    if len(objs) > 0:
+        columns = zip(*objs)
+    else:
+        columns = ([], [], [], [])
+    tlwhs, ids, cls, scores = columns
+    boxes = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
+    return boxes, np.array(ids), np.array(cls), np.array(scores)
+
+
+class MCMOTEvaluator(object):
+    """
+    Evaluate the tracking result of one sequence against its ground truth,
+    using one motmetrics accumulator per class.
+
+    Args:
+        data_root (str): Directory used to locate the ground truth, which
+            is looked up at ``<data_root>/../sequences/<seq_name>.txt``.
+        seq_name (str): Name of the sequence.
+        data_type (str): Must be 'mcmot'.
+        num_classes (int): Number of class ids evaluated by `eval_file`.
+
+    Raises:
+        RuntimeError: When motmetrics cannot be imported or configured.
+    """
+
+    def __init__(self, data_root, seq_name, data_type, num_classes):
+        self.data_root = data_root
+        self.seq_name = seq_name
+        self.data_type = data_type
+        self.num_classes = num_classes
+
+        self.load_annotations()
+        try:
+            import motmetrics as mm
+            mm.lap.default_solver = 'lap'
+        except Exception as e:
+            # chain the original error so the real cause stays visible
+            raise RuntimeError(
+                'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
+            ) from e
+        self.reset_accumulator()
+
+        self.class_accs = []
+
+    def load_annotations(self):
+        """Resolve the ground-truth path and warn when it does not exist."""
+        assert self.data_type == 'mcmot'
+        self.gt_filename = os.path.join(self.data_root, '../', 'sequences',
+                                        '{}.txt'.format(self.seq_name))
+        if not os.path.exists(self.gt_filename):
+            # the placeholder is now filled in with the missing path
+            logger.warning(
+                "gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF."
+                .format(self.gt_filename))
+
+    def reset_accumulator(self):
+        """Start a fresh accumulator with automatically assigned frame ids."""
+        self.acc = mm.MOTAccumulator(auto_id=True)
+
+    def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False):
+        """
+        Feed one frame into the current accumulator.
+
+        Args:
+            trk_objs (list): Tracked objects of the frame.
+            gt_objs (list): Ground-truth objects of the frame.
+            rtn_events (bool): Return the accumulator's events if available.
+            union (bool): Objects carry a class id; pairs of different
+                classes get a NaN distance so they are never matched.
+
+        Returns:
+            ``self.acc.mot_events`` when `rtn_events` is set, the distance
+            matrix is not empty and the attribute exists; otherwise None.
+        """
+        if union:
+            trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]
+            gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]
+        else:
+            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
+            gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
+
+        # get distance matrix
+        iou_distance = mm.distances.iou_matrix(
+            gt_tlwhs, trk_tlwhs, max_iou=0.5)
+
+        # Set the distance between objects of different categories to nan.
+        # When the number of GT or Trk is 0, iou_distance dimension is (0,0),
+        # so the mask is only applied when both sides are non-empty.
+        if union and len(gt_cls) != 0 and len(trk_cls) != 0:
+            same_class = gt_cls.reshape(-1, 1) == trk_cls.reshape(1, -1)
+            iou_distance = np.where(same_class, iou_distance, np.nan)
+
+        self.acc.update(gt_ids, trk_ids, iou_distance)
+
+        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
+                                                            'mot_events'):
+            # only supported by https://github.com/longcw/py-motmetrics
+            return self.acc.mot_events
+        return None
+
+    def eval_file(self, result_filename):
+        """
+        Evaluate `result_filename` class by class.
+
+        One accumulator per class id in ``range(self.num_classes)`` is
+        appended to ``self.class_accs``; the list is not cleared between
+        calls.
+
+        Returns:
+            list: ``self.class_accs``.
+        """
+        # evaluation of each category
+        gt_frame_dict = read_results(
+            self.gt_filename,
+            self.data_type,
+            is_gt=True,
+            multi_class=True,
+            union=False)
+        result_frame_dict = read_results(
+            result_filename,
+            self.data_type,
+            is_gt=False,
+            multi_class=True,
+            union=False)
+
+        for cid in range(self.num_classes):
+            self.reset_accumulator()
+            cls_result_frame_dict = result_frame_dict.get(cid, {})
+            cls_gt_frame_dict = gt_frame_dict.get(cid, {})
+
+            # only labeled frames will be evaluated
+            for frame_id in sorted(cls_gt_frame_dict):
+                trk_objs = cls_result_frame_dict.get(frame_id, [])
+                gt_objs = cls_gt_frame_dict.get(frame_id, [])
+                self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False)
+
+            self.class_accs.append(self.acc)
+
+        return self.class_accs
+
+    @staticmethod
+    def get_summary(accs,
+                    names,
+                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
+                             'precision', 'recall')):
+        """
+        Compute the metrics of several accumulators plus an overall row.
+
+        Args:
+            accs (list): Accumulators to summarise.
+            names (list): Row name of each accumulator.
+            metrics: Metric names; None selects
+                ``mm.metrics.motchallenge_metrics``.
+        """
+        # work on copies so the caller's containers are never touched
+        names = copy.deepcopy(names)
+        if metrics is None:
+            metrics = mm.metrics.motchallenge_metrics
+        metrics = copy.deepcopy(metrics)
+
+        mh = mm.metrics.create()
+        return mh.compute_many(
+            accs, metrics=metrics, names=names, generate_overall=True)
+
+    @staticmethod
+    def save_summary(summary, filename):
+        """Write `summary` to the Excel file `filename`."""
+        import pandas as pd
+        # the context manager saves and closes the workbook;
+        # ExcelWriter.save() is no longer available in current pandas
+        with pd.ExcelWriter(filename) as writer:
+            summary.to_excel(writer)
+
+
+class MCMOTMetric(Metric):
+    """
+    Metric that evaluates multi-class MOT results sequence by sequence and
+    aggregates the per-class summaries over all sequences.
+
+    Args:
+        num_classes (int): Number of classes evaluated per sequence.
+        save_summary (bool): Stored on the instance as ``save_summary``.
+    """
+
+    def __init__(self, num_classes, save_summary=False):
+        self.num_classes = num_classes
+        self.save_summary = save_summary
+        self.MCMOTEvaluator = MCMOTEvaluator
+        self.result_root = None
+        self.reset()
+
+        # summary row position -> list of one-row DataFrames (one per update)
+        self.seqs_overall = defaultdict(list)
+
+    def reset(self):
+        """Forget the accumulators and sequence names gathered so far."""
+        self.accs = []
+        self.seqs = []
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        """Evaluate one sequence and store every row of its summary."""
+        evaluator = self.MCMOTEvaluator(data_root, seq, data_type,
+                                        self.num_classes)
+        seq_acc = evaluator.eval_file(result_filename)
+        self.accs.append(seq_acc)
+        self.seqs.append(seq)
+        self.result_root = result_root
+
+        row_names = ['{}_{}'.format(seq, cid) for cid in range(self.num_classes)]
+        summary = parse_accs_metrics(seq_acc, row_names)
+        summary.rename(
+            index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True)
+        for position in range(len(summary)):
+            one_row = summary.iloc[position:position + 1]
+            self.seqs_overall[position].append(one_row)
+
+    def accumulate(self):
+        """Build one overall row per class across all stored sequences."""
+        self.cls_summary_list = []
+        for cid in range(self.num_classes):
+            per_class = seqs_overall_metrics(
+                pd.concat(self.seqs_overall[cid]))
+            overall_row = per_class.iloc[-1:].copy()
+            overall_row.rename(
+                index={'overall_calc': 'overall_calc_{}'.format(cid)},
+                inplace=True)
+            self.cls_summary_list.append(overall_row)
+
+    def log(self):
+        """Print the per-sequence and the per-class overall tables."""
+        # rows stored at position num_classes are presumably the renamed
+        # 'OVERALL' rows of each sequence - TODO confirm
+        seqs_overall_metrics(
+            pd.concat(self.seqs_overall[self.num_classes]), verbose=True)
+        seqs_overall_metrics(pd.concat(self.cls_summary_list), verbose=True)
+
+    def get_results(self):
+        """Return the constant 1."""
+        return 1
--- a/rtdetr_paddle/ppdet/metrics/metrics.py
+++ b/rtdetr_paddle/ppdet/metrics/metrics.py
@@ -0,0 +1,505 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import json
+import paddle
+import numpy as np
+import typing
+from collections import defaultdict
+from pathlib import Path
+
+from .map_utils import prune_zero_padding, DetectionMAP
+from .coco_utils import get_infer_results, cocoapi_eval
+from .widerface_utils import face_eval_run
+from ppdet.data.source.category import get_categories
+
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results',
+    'RBoxMetric', 'SNIPERCOCOMetric'
+]
+
+# Per-keypoint sigma constants handed to cocoapi_eval as `sigmas` when
+# COCOMetric.accumulate evaluates keypoint results.
+# COCO_SIGMAS holds 17 values and is the default choice.
+COCO_SIGMAS = np.array([
+    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87,
+    .89, .89
+]) / 10.0
+# CROWD_SIGMAS holds 14 values and is selected when the metric's iou_type
+# is 'keypoints_crowd'.
+CROWD_SIGMAS = np.array(
+    [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79,
+     .79]) / 10.0
+
+
+class Metric(paddle.metric.Metric):
+    """
+    Base class of the metrics in this module.
+
+    paddle.metric.Metric defines :meth:`update`, :meth:`accumulate` and
+    :meth:`reset`; in ppdet two more methods are needed, :meth:`log` and
+    :meth:`get_results`. Every method except :meth:`name` is a no-op here.
+    """
+
+    def name(self):
+        """Return the class name of the metric."""
+        return type(self).__name__
+
+    def reset(self):
+        """Does nothing by default."""
+
+    def accumulate(self):
+        """Does nothing by default."""
+
+    def log(self):
+        """Abstract method for logging metric results."""
+
+    def get_results(self):
+        """Abstract method for getting metric results."""
+
+
+class COCOMetric(Metric):
+    """
+    Collect inference results in COCO format, dump them to json files and
+    evaluate them with `cocoapi_eval`.
+
+    Args:
+        anno_file (str): Annotation file; it must exist unless
+            `save_prediction_only` is set.
+        **kwargs: Optional settings read with ``kwargs.get``:
+            clsid2catid (default: taken from ``get_categories('COCO',
+            anno_file)``), classwise (False), output_eval (None; directory
+            for the json files, created if missing), bias (0),
+            save_prediction_only (False) and IouType ('bbox').
+    """
+
+    def __init__(self, anno_file, **kwargs):
+        self.anno_file = anno_file
+        self.clsid2catid = kwargs.get('clsid2catid', None)
+        if self.clsid2catid is None:
+            self.clsid2catid, _ = get_categories('COCO', anno_file)
+        self.classwise = kwargs.get('classwise', False)
+        self.output_eval = kwargs.get('output_eval', None)
+        # TODO: bias should be unified
+        self.bias = kwargs.get('bias', 0)
+        self.save_prediction_only = kwargs.get('save_prediction_only', False)
+        self.iou_type = kwargs.get('IouType', 'bbox')
+
+        if not self.save_prediction_only:
+            assert os.path.isfile(anno_file), \
+                    "anno_file {} not a file".format(anno_file)
+
+        if self.output_eval is not None:
+            # parents=True: a nested output directory no longer fails
+            Path(self.output_eval).mkdir(parents=True, exist_ok=True)
+
+        self.reset()
+
+    def reset(self):
+        """Drop all collected results and evaluation statistics."""
+        # only bbox and mask evaluation support currently
+        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
+        self.eval_results = {}
+
+    def update(self, inputs, outputs):
+        """Convert one batch of outputs and append it to ``self.results``."""
+        # outputs Tensor -> numpy.ndarray
+        outs = {
+            k: v.numpy() if isinstance(v, paddle.Tensor) else v
+            for k, v in outputs.items()
+        }
+
+        # multi-scale inputs: all inputs have same im_id
+        if isinstance(inputs, typing.Sequence):
+            im_id = inputs[0]['im_id']
+        else:
+            im_id = inputs['im_id']
+        outs['im_id'] = im_id.numpy() if isinstance(im_id,
+                                                    paddle.Tensor) else im_id
+
+        infer_results = get_infer_results(
+            outs, self.clsid2catid, bias=self.bias)
+        for key in ('bbox', 'mask', 'segm', 'keypoint'):
+            if key in infer_results:
+                self.results[key] += infer_results[key]
+
+    def _save_results(self, name):
+        """Dump ``self.results[name]`` to '<name>.json'; return its path."""
+        output = '{}.json'.format(name)
+        if self.output_eval:
+            output = os.path.join(self.output_eval, output)
+        with open(output, 'w') as f:
+            json.dump(self.results[name], f)
+        # report the path that was really written, not just the file name
+        logger.info('The {} result is saved to {}.'.format(name, output))
+        if self.save_prediction_only:
+            logger.info('The {} result is saved to {} and do not '
+                        'evaluate the mAP.'.format(name, output))
+        return output
+
+    def accumulate(self):
+        """
+        Write every non-empty result list to its json file and, unless
+        `save_prediction_only` is set, evaluate it into
+        ``self.eval_results``.
+        """
+        # (result list, cocoapi_eval style, key in self.eval_results);
+        # both 'mask' and 'segm' are stored under 'mask', as before
+        box_like = (('bbox', 'bbox', 'bbox'), ('mask', 'segm', 'mask'),
+                    ('segm', 'segm', 'mask'))
+        for name, style, result_key in box_like:
+            if len(self.results[name]) == 0:
+                continue
+            output = self._save_results(name)
+            if self.save_prediction_only:
+                continue
+            self.eval_results[result_key] = cocoapi_eval(
+                output,
+                style,
+                anno_file=self.anno_file,
+                classwise=self.classwise)
+            sys.stdout.flush()
+
+        if len(self.results['keypoint']) > 0:
+            output = self._save_results('keypoint')
+            if not self.save_prediction_only:
+                if self.iou_type == 'keypoints_crowd':
+                    style, use_area, sigmas = ('keypoints_crowd', False,
+                                               CROWD_SIGMAS)
+                else:
+                    style, use_area, sigmas = 'keypoints', True, COCO_SIGMAS
+                self.eval_results['keypoint'] = cocoapi_eval(
+                    output,
+                    style,
+                    anno_file=self.anno_file,
+                    classwise=self.classwise,
+                    sigmas=sigmas,
+                    use_area=use_area)
+                sys.stdout.flush()
+
+    def log(self):
+        """Nothing is logged by this metric."""
+        pass
+
+    def get_results(self):
+        """Return the statistics gathered by `accumulate`."""
+        return self.eval_results
+
+
+class VOCMetric(Metric):
+    def __init__(self,
+                 label_list,
+                 class_num=20,
+                 overlap_thresh=0.5,
+                 map_type='11point',
+                 is_bbox_normalized=False,
+                 evaluate_difficult=False,
+                 classwise=False,
+                 output_eval=None,
+                 save_prediction_only=False):
+        assert os.path.isfile(label_list), \
+                "label_list {} not a file".format(label_list)
+        self.clsid2catid, self.catid2name = get_categories('VOC', label_list)
+
+        self.overlap_thresh = overlap_thresh
+        self.map_type = map_type
+        self.evaluate_difficult = evaluate_difficult
+        self.output_eval = output_eval
+        self.save_prediction_only = save_prediction_only
+        self.detection_map = DetectionMAP(
+            class_num=class_num,
+            overlap_thresh=overlap_thresh,
+            map_type=map_type,
+            is_bbox_normalized=is_bbox_normalized,
+            evaluate_difficult=evaluate_difficult,
+            catid2name=self.catid2name,
+            classwise=classwise)
+
+        self.reset()
+
+    def reset(self):
+        self.results = {'bbox': [], 'score': [], 'label': []}
+        self.detection_map.reset()
+
+    def update(self, inputs, outputs):
+        """Cache one batch of predictions and feed it to the mAP accumulator.
+
+        Args:
+            inputs (dict): batch data. Reads 'gt_bbox' and 'gt_class', reads
+                'difficult' only when difficult objects are not evaluated,
+                and reads 'scale_factor' when it is present.
+            outputs (dict): model outputs. Reads 'bbox' and 'bbox_num'.
+        """
+
+        def _to_numpy(value):
+            # Tensors are converted; anything else is passed through as is.
+            if isinstance(value, paddle.Tensor):
+                return value.numpy()
+            return value
+
+        bbox_np = _to_numpy(outputs['bbox'])
+        # Column 0 is used as the label, column 1 as the score and the
+        # remaining columns as the box.
+        labels = bbox_np[:, 0]
+        scores = bbox_np[:, 1]
+        bboxes = bbox_np[:, 2:]
+        bbox_lengths = _to_numpy(outputs['bbox_num'])
+
+        for key, values in (('bbox', bboxes), ('score', scores),
+                            ('label', labels)):
+            self.results[key].append(values.tolist())
+
+        # Predictions are cached above even when evaluation is skipped.
+        if bboxes.shape == (1, 1) or bboxes is None or \
+                self.save_prediction_only:
+            return
+
+        gt_boxes = inputs['gt_bbox']
+        gt_labels = inputs['gt_class']
+        if self.evaluate_difficult:
+            difficults = None
+        else:
+            difficults = inputs['difficult']
+
+        if 'scale_factor' in inputs:
+            scale_factor = _to_numpy(inputs['scale_factor'])
+        else:
+            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
+
+        # 'bbox_num' gives the number of predictions per image; walk the
+        # flat prediction arrays with a moving [start, stop) window.
+        start = 0
+        for i in range(len(gt_boxes)):
+            gt_box = _to_numpy(gt_boxes[i])
+            h, w = scale_factor[i]
+            gt_box = gt_box / np.array([w, h, w, h])
+            gt_label = _to_numpy(gt_labels[i])
+            if difficults is None:
+                difficult = None
+            else:
+                difficult = _to_numpy(difficults[i])
+            stop = start + bbox_lengths[i]
+            gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label,
+                                                             difficult)
+            self.detection_map.update(bboxes[start:stop], scores[start:stop],
+                                      labels[start:stop], gt_box, gt_label,
+                                      difficult)
+            start = stop
+
+    def accumulate(self):
+        """Optionally dump the cached predictions, then accumulate mAP.
+
+        When ``self.output_eval`` is set, ``self.results`` is written as
+        JSON to ``<output_eval>/bbox.json``. The mAP statistics are
+        accumulated unless ``self.save_prediction_only`` is set.
+        """
+        if self.output_eval:
+            output = os.path.join(self.output_eval, "bbox.json")
+            with open(output, 'w') as f:
+                json.dump(self.results, f)
+            # Report the real destination rather than the bare file name.
+            logger.info('The bbox result is saved to {}.'.format(output))
+        if self.save_prediction_only:
+            return
+
+        logger.info("Accumulating evaluatation results...")
+        self.detection_map.accumulate()
+
+    def log(self):
+        """Log the current mAP, expressed as a percentage."""
+        percent_map = 100. * self.detection_map.get_map()
+        message = "mAP({:.2f}, {}) = {:.2f}%".format(
+            self.overlap_thresh, self.map_type, percent_map)
+        logger.info(message)
+
+    def get_results(self):
+        """Return the current mAP wrapped as ``{'bbox': [mAP]}``."""
+        current_map = self.detection_map.get_map()
+        return {'bbox': [current_map]}
+
+
+class WiderFaceMetric(Metric):
+    """Metric that delegates evaluation to ``face_eval_run``."""
+
+    def __init__(self, image_dir, anno_file, multi_scale=True):
+        self.image_dir = image_dir
+        self.anno_file = anno_file
+        self.multi_scale = multi_scale
+        categories = get_categories('widerface')
+        self.clsid2catid, self.catid2name = categories
+
+    def update(self, model):
+        """Run ``face_eval_run`` on ``model`` with the stored settings."""
+        eval_options = dict(
+            pred_dir='output/pred',
+            eval_mode='widerface',
+            multi_scale=self.multi_scale)
+        face_eval_run(model, self.image_dir, self.anno_file, **eval_options)
+
+
+class RBoxMetric(Metric):
+    """mAP metric whose ground-truth boxes are read from ``gt_poly``."""
+
+    def __init__(self, anno_file, **kwargs):
+        self.anno_file = anno_file
+        self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file)
+        # Reverse lookup: category id -> class id.
+        self.catid2clsid = dict(
+            (catid, clsid) for clsid, catid in self.clsid2catid.items())
+
+        # Optional settings, each with its default.
+        option = kwargs.get
+        self.classwise = option('classwise', False)
+        self.output_eval = option('output_eval', None)
+        self.save_prediction_only = option('save_prediction_only', False)
+        self.overlap_thresh = option('overlap_thresh', 0.5)
+        self.map_type = option('map_type', '11point')
+        self.evaluate_difficult = option('evaluate_difficult', False)
+        self.imid2path = option('imid2path', None)
+
+        self.detection_map = DetectionMAP(
+            class_num=len(self.catid2name),
+            overlap_thresh=self.overlap_thresh,
+            map_type=self.map_type,
+            is_bbox_normalized=False,
+            evaluate_difficult=self.evaluate_difficult,
+            catid2name=self.catid2name,
+            classwise=self.classwise)
+
+        self.reset()
+
+    def reset(self):
+        """Drop the cached predictions and reset the mAP accumulator."""
+        self.results = []
+        self.detection_map.reset()
+
+    def update(self, inputs, outputs):
+        """Cache one batch of predictions and feed it to the mAP accumulator.
+
+        Args:
+            inputs (dict): batch data. Reads 'im_id', 'gt_poly', 'gt_class'
+                and, when present, 'scale_factor'.
+            outputs (dict): model outputs; tensors are converted to numpy.
+        """
+        # outputs Tensor -> numpy.ndarray
+        outs = {
+            key: value.numpy() if isinstance(value, paddle.Tensor) else value
+            for key, value in outputs.items()
+        }
+
+        im_id = inputs['im_id']
+        if isinstance(im_id, paddle.Tensor):
+            im_id = im_id.numpy()
+        outs['im_id'] = im_id
+
+        infer_results = get_infer_results(outs, self.clsid2catid)
+        if 'bbox' in infer_results:
+            infer_results = infer_results['bbox']
+        else:
+            infer_results = []
+        self.results.extend(infer_results)
+        if self.save_prediction_only:
+            return
+
+        gt_boxes = inputs['gt_poly']
+        gt_labels = inputs['gt_class']
+
+        if 'scale_factor' not in inputs:
+            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
+        elif isinstance(inputs['scale_factor'], paddle.Tensor):
+            scale_factor = inputs['scale_factor'].numpy()
+        else:
+            scale_factor = inputs['scale_factor']
+
+        for i in range(len(gt_boxes)):
+            gt_box = gt_boxes[i]
+            if isinstance(gt_box, paddle.Tensor):
+                gt_box = gt_box.numpy()
+            h, w = scale_factor[i]
+            # Eight values per box, divided alternately by w and h.
+            gt_box = gt_box / np.array([w, h] * 4)
+            gt_label = gt_labels[i]
+            if isinstance(gt_label, paddle.Tensor):
+                gt_label = gt_label.numpy()
+            gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label)
+
+            # Predictions of this batch that belong to the i-th image.
+            matched = [
+                res for res in infer_results
+                if int(res['image_id']) == int(im_id[i])
+            ]
+            bbox = [res['bbox'] for res in matched]
+            score = [res['score'] for res in matched]
+            label = [
+                self.catid2clsid[int(res['category_id'])] for res in matched
+            ]
+            self.detection_map.update(bbox, score, label, gt_box, gt_label)
+
+    def save_results(self, results, output_dir, imid2path):
+        """Write predictions to ``output_dir``.
+
+        With a non-empty ``imid2path`` one ``<image basename>.txt`` file is
+        written per entry, holding one ``name score coords...`` line per
+        detection. Otherwise everything goes into a single ``bbox.json``.
+        """
+        if not imid2path:
+            output = os.path.join(output_dir, "bbox.json")
+            with open(output, 'w') as f:
+                json.dump(results, f)
+
+            logger.info('The bbox result is saved to {}.'.format(output))
+            return
+
+        # Group the detections by image id.
+        data_dicts = defaultdict(list)
+        for result in results:
+            data_dicts[result['image_id']].append(result)
+
+        for image_id, image_path in imid2path.items():
+            file_name = os.path.split(image_path)[-1]
+            basename = os.path.splitext(file_name)[0]
+            output = os.path.join(output_dir, "{}.txt".format(basename))
+            dets = data_dicts.get(image_id, [])
+            with open(output, 'w') as f:
+                for det in dets:
+                    catid, bbox, score = det['category_id'], det[
+                        'bbox'], det['score']
+                    prefix = '{} {} '.format(self.catid2name[catid], score)
+                    coords = ' '.join([str(e) for e in bbox])
+                    f.write(prefix + coords + '\n')
+
+        logger.info('The bbox result is saved to {}.'.format(output_dir))
+
+    def accumulate(self):
+        """Optionally save the predictions, then accumulate mAP."""
+        if self.output_eval:
+            self.save_results(self.results, self.output_eval, self.imid2path)
+
+        if self.save_prediction_only:
+            return
+        logger.info("Accumulating evaluatation results...")
+        self.detection_map.accumulate()
+
+    def log(self):
+        """Log the current mAP, expressed as a percentage."""
+        percent_map = 100. * self.detection_map.get_map()
+        message = "mAP({:.2f}, {}) = {:.2f}%".format(
+            self.overlap_thresh, self.map_type, percent_map)
+        logger.info(message)
+
+    def get_results(self):
+        """Return the current mAP wrapped as ``{'bbox': [mAP]}``."""
+        current_map = self.detection_map.get_map()
+        return {'bbox': [current_map]}
+
+
+class SNIPERCOCOMetric(COCOMetric):
+    """COCO metric that buffers per-chip outputs until ``accumulate``."""
+
+    def __init__(self, anno_file, **kwargs):
+        super().__init__(anno_file, **kwargs)
+        self.dataset = kwargs["dataset"]
+        self.chip_results = []
+
+    def reset(self):
+        """Clear the result containers and the buffered chip outputs."""
+        # only bbox and mask evaluation support currently
+        self.results = dict(bbox=[], mask=[], segm=[], keypoint=[])
+        self.eval_results = {}
+        self.chip_results = []
+
+    def update(self, inputs, outputs):
+        """Buffer one batch of outputs (tensors converted to numpy)."""
+        # outputs Tensor -> numpy.ndarray
+        outs = {
+            key: value.numpy() if isinstance(value, paddle.Tensor) else value
+            for key, value in outputs.items()
+        }
+
+        im_id = inputs['im_id']
+        if isinstance(im_id, paddle.Tensor):
+            outs['im_id'] = im_id.numpy()
+        else:
+            outs['im_id'] = im_id
+
+        self.chip_results.append(outs)
+
+    def accumulate(self):
+        """Aggregate the buffered chip outputs, then run the parent step."""
+        cropper = self.dataset.anno_cropper
+        aggregated = cropper.aggregate_chips_detections(self.chip_results)
+        for outs in aggregated:
+            infer_results = get_infer_results(
+                outs, self.clsid2catid, bias=self.bias)
+            if 'bbox' in infer_results:
+                self.results['bbox'] += infer_results['bbox']
+
+        super().accumulate()
--- a/rtdetr_paddle/ppdet/metrics/mot_metrics.py
+++ b/rtdetr_paddle/ppdet/metrics/mot_metrics.py
--- a/rtdetr_paddle/ppdet/metrics/munkres.py
+++ b/rtdetr_paddle/ppdet/metrics/munkres.py
@@ -0,0 +1,428 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py
+"""
+
+import sys
+
+__all__ = ['Munkres', 'make_cost_matrix']
+
+
+class Munkres:
+    """
+    Calculate the Munkres solution to the classical assignment problem.
+    See the module documentation for usage.
+    """
+
+    def __init__(self):
+        """Create a new instance"""
+        self.C = None
+        self.row_covered = []
+        self.col_covered = []
+        self.n = 0
+        self.Z0_r = 0
+        self.Z0_c = 0
+        self.marked = None
+        self.path = None
+
+    @staticmethod
+    def make_cost_matrix(profit_matrix, inversion_function):
+        """
+        **DEPRECATED**
+
+        Please use the module function ``make_cost_matrix()``.
+        """
+        # Class attributes are not part of a method body's name lookup, so
+        # the bare name below resolves to the module-level function. No
+        # external ``munkres`` package is required.
+        return make_cost_matrix(profit_matrix, inversion_function)
+
+    def pad_matrix(self, matrix, pad_value=0):
+        """
+        Pad a possibly non-square matrix to make it square.
+
+        :Parameters:
+            matrix : list of lists
+                matrix to pad
+
+            pad_value : int
+                value to use to pad the matrix
+
+        :rtype: list of lists
+        :return: a new, possibly padded, matrix
+        """
+        max_columns = 0
+        total_rows = len(matrix)
+
+        for row in matrix:
+            max_columns = max(max_columns, len(row))
+
+        total_rows = max(max_columns, total_rows)
+
+        new_matrix = []
+        for row in matrix:
+            row_len = len(row)
+            new_row = row[:]
+            if total_rows > row_len:
+                # Row too short. Pad it.
+                new_row += [pad_value] * (total_rows - row_len)
+            new_matrix += [new_row]
+
+        while len(new_matrix) < total_rows:
+            new_matrix += [[pad_value] * total_rows]
+
+        return new_matrix
+
+    def compute(self, cost_matrix):
+        """
+        Compute the indexes for the lowest-cost pairings between rows and
+        columns in the database. Returns a list of (row, column) tuples
+        that can be used to traverse the matrix.
+
+        :Parameters:
+            cost_matrix : list of lists
+                The cost matrix. If this cost matrix is not square, it
+                will be padded with zeros, via a call to ``pad_matrix()``.
+                (This method does *not* modify the caller's matrix. It
+                operates on a copy of the matrix.)
+
+                **WARNING**: This code handles square and rectangular
+                matrices. It does *not* handle irregular matrices.
+
+        :rtype: list
+        :return: A list of ``(row, column)`` tuples that describe the lowest
+                 cost path through the matrix. An empty cost matrix yields
+                 an empty list.
+
+        """
+        # Nothing to assign; also avoids indexing row 0 of an empty matrix.
+        if len(cost_matrix) == 0:
+            return []
+
+        self.C = self.pad_matrix(cost_matrix)
+        self.n = len(self.C)
+        self.original_length = len(cost_matrix)
+        self.original_width = len(cost_matrix[0])
+        self.row_covered = [False for i in range(self.n)]
+        self.col_covered = [False for i in range(self.n)]
+        self.Z0_r = 0
+        self.Z0_c = 0
+        self.path = self.__make_matrix(self.n * 2, 0)
+        self.marked = self.__make_matrix(self.n, 0)
+
+        steps = {
+            1: self.__step1,
+            2: self.__step2,
+            3: self.__step3,
+            4: self.__step4,
+            5: self.__step5,
+            6: self.__step6
+        }
+
+        # Every step returns the number of the next step. A number without
+        # an entry in the table (step 3 returns 7 when finished) ends the
+        # loop. Testing membership keeps a KeyError raised inside a step
+        # from being mistaken for normal termination.
+        step = 1
+        while step in steps:
+            step = steps[step]()
+
+        # Look for the starred columns
+        results = []
+        for i in range(self.original_length):
+            for j in range(self.original_width):
+                if self.marked[i][j] == 1:
+                    results += [(i, j)]
+
+        return results
+
+    def __copy_matrix(self, matrix):
+        """Return an exact copy of the supplied matrix"""
+        # Imported locally: the module does not import ``copy`` itself.
+        import copy
+        return copy.deepcopy(matrix)
+
+    def __make_matrix(self, n, val):
+        """Create an *n*x*n* matrix, populating it with the specific value."""
+        matrix = []
+        for i in range(n):
+            matrix += [[val for j in range(n)]]
+        return matrix
+
+    def __step1(self):
+        """
+        For each row of the matrix, find the smallest element and
+        subtract it from every element in its row. Go to Step 2.
+        """
+        n = self.n
+        for i in range(n):
+            minval = min(self.C[i])
+            # Find the minimum value for this row and subtract that minimum
+            # from every element in the row.
+            for j in range(n):
+                self.C[i][j] -= minval
+
+        return 2
+
+    def __step2(self):
+        """
+        Find a zero (Z) in the resulting matrix. If there is no starred
+        zero in its row or column, star Z. Repeat for each element in the
+        matrix. Go to Step 3.
+        """
+        n = self.n
+        for i in range(n):
+            for j in range(n):
+                if (self.C[i][j] == 0) and \
+                   (not self.col_covered[j]) and \
+                   (not self.row_covered[i]):
+                    self.marked[i][j] = 1
+                    self.col_covered[j] = True
+                    self.row_covered[i] = True
+
+        # The covers were only used to remember which lines hold a star.
+        self.__clear_covers()
+        return 3
+
+    def __step3(self):
+        """
+        Cover each column containing a starred zero. If K columns are
+        covered, the starred zeros describe a complete set of unique
+        assignments. In this case, Go to DONE, otherwise, Go to Step 4.
+        """
+        n = self.n
+        count = 0
+        for i in range(n):
+            for j in range(n):
+                if self.marked[i][j] == 1:
+                    self.col_covered[j] = True
+                    count += 1
+
+        if count >= n:
+            step = 7  # done
+        else:
+            step = 4
+
+        return step
+
+    def __step4(self):
+        """
+        Find a noncovered zero and prime it. If there is no starred zero
+        in the row containing this primed zero, Go to Step 5. Otherwise,
+        cover this row and uncover the column containing the starred
+        zero. Continue in this manner until there are no uncovered zeros
+        left. Save the smallest uncovered value and Go to Step 6.
+        """
+        step = 0
+        done = False
+        row = -1
+        col = -1
+        star_col = -1
+        while not done:
+            (row, col) = self.__find_a_zero()
+            if row < 0:
+                done = True
+                step = 6
+            else:
+                self.marked[row][col] = 2
+                star_col = self.__find_star_in_row(row)
+                if star_col >= 0:
+                    col = star_col
+                    self.row_covered[row] = True
+                    self.col_covered[col] = False
+                else:
+                    done = True
+                    self.Z0_r = row
+                    self.Z0_c = col
+                    step = 5
+
+        return step
+
+    def __step5(self):
+        """
+        Construct a series of alternating primed and starred zeros as
+        follows. Let Z0 represent the uncovered primed zero found in Step 4.
+        Let Z1 denote the starred zero in the column of Z0 (if any).
+        Let Z2 denote the primed zero in the row of Z1 (there will always
+        be one). Continue until the series terminates at a primed zero
+        that has no starred zero in its column. Unstar each starred zero
+        of the series, star each primed zero of the series, erase all
+        primes and uncover every line in the matrix. Return to Step 3
+        """
+        count = 0
+        path = self.path
+        path[count][0] = self.Z0_r
+        path[count][1] = self.Z0_c
+        done = False
+        while not done:
+            row = self.__find_star_in_col(path[count][1])
+            if row >= 0:
+                count += 1
+                path[count][0] = row
+                path[count][1] = path[count - 1][1]
+            else:
+                done = True
+
+            if not done:
+                col = self.__find_prime_in_row(path[count][0])
+                count += 1
+                path[count][0] = path[count - 1][0]
+                path[count][1] = col
+
+        self.__convert_path(path, count)
+        self.__clear_covers()
+        self.__erase_primes()
+        return 3
+
+    def __step6(self):
+        """
+        Add the value found in Step 4 to every element of each covered
+        row, and subtract it from every element of each uncovered column.
+        Return to Step 4 without altering any stars, primes, or covered
+        lines.
+        """
+        minval = self.__find_smallest()
+        for i in range(self.n):
+            for j in range(self.n):
+                if self.row_covered[i]:
+                    self.C[i][j] += minval
+                if not self.col_covered[j]:
+                    self.C[i][j] -= minval
+        return 4
+
+    def __find_smallest(self):
+        """Find the smallest uncovered value in the matrix."""
+        # Start above any realistic cost so that large costs still compare
+        # as smaller than the sentinel.
+        minval = sys.maxsize
+        for i in range(self.n):
+            for j in range(self.n):
+                if (not self.row_covered[i]) and (not self.col_covered[j]):
+                    if minval > self.C[i][j]:
+                        minval = self.C[i][j]
+        return minval
+
+    def __find_a_zero(self):
+        """
+        Find an uncovered element with value 0. Rows are scanned in order;
+        within the first row that holds a match the scan runs on to the end
+        of that row, so the last uncovered zero of that row is reported.
+        Returns ``(row, col)``, or ``(-1, -1)`` if there is none.
+        """
+        row = -1
+        col = -1
+        i = 0
+        n = self.n
+        done = False
+
+        while not done:
+            j = 0
+            while True:
+                if (self.C[i][j] == 0) and \
+                   (not self.row_covered[i]) and \
+                   (not self.col_covered[j]):
+                    row = i
+                    col = j
+                    done = True
+                j += 1
+                if j >= n:
+                    break
+            i += 1
+            if i >= n:
+                done = True
+
+        return (row, col)
+
+    def __find_star_in_row(self, row):
+        """
+        Find the first starred element in the specified row. Returns
+        the column index, or -1 if no starred element was found.
+        """
+        col = -1
+        for j in range(self.n):
+            if self.marked[row][j] == 1:
+                col = j
+                break
+
+        return col
+
+    def __find_star_in_col(self, col):
+        """
+        Find the first starred element in the specified column. Returns
+        the row index, or -1 if no starred element was found.
+        """
+        row = -1
+        for i in range(self.n):
+            if self.marked[i][col] == 1:
+                row = i
+                break
+
+        return row
+
+    def __find_prime_in_row(self, row):
+        """
+        Find the first primed element in the specified row. Returns
+        the column index, or -1 if no primed element was found.
+        """
+        col = -1
+        for j in range(self.n):
+            if self.marked[row][j] == 2:
+                col = j
+                break
+
+        return col
+
+    def __convert_path(self, path, count):
+        """
+        Flip the marks along the first ``count + 1`` path entries: starred
+        cells are unmarked, every other cell on the path becomes starred.
+        """
+        for i in range(count + 1):
+            if self.marked[path[i][0]][path[i][1]] == 1:
+                self.marked[path[i][0]][path[i][1]] = 0
+            else:
+                self.marked[path[i][0]][path[i][1]] = 1
+
+    def __clear_covers(self):
+        """Clear all covered matrix cells"""
+        for i in range(self.n):
+            self.row_covered[i] = False
+            self.col_covered[i] = False
+
+    def __erase_primes(self):
+        """Erase all prime markings"""
+        for i in range(self.n):
+            for j in range(self.n):
+                if self.marked[i][j] == 2:
+                    self.marked[i][j] = 0
+
+
+def make_cost_matrix(profit_matrix, inversion_function):
+    """
+    Build a cost matrix from a profit matrix by applying
+    'inversion_function' to every entry. The function receives one
+    numeric value and returns the value to store in its place, which is
+    presumed to be the cost inverse of the original profit.
+
+    For example:
+
+    .. python::
+
+        cost_matrix = make_cost_matrix(matrix, lambda x : sys.maxsize - x)
+
+    :Parameters:
+        profit_matrix : list of lists
+            The matrix to convert from a profit to a cost matrix
+
+        inversion_function : function
+            The function to use to invert each entry in the profit matrix
+
+    :rtype: list of lists
+    :return: The converted matrix
+    """
+    return [[inversion_function(entry) for entry in profit_row]
+            for profit_row in profit_matrix]
--- a/rtdetr_paddle/ppdet/metrics/pose3d_metrics.py
+++ b/rtdetr_paddle/ppdet/metrics/pose3d_metrics.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+from paddle.distributed import ParallelEnv
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['Pose3DEval']
+
+
+class AverageMeter(object):
+    """Track the latest value and the weighted running average of a series.
+
+    Attributes set by ``reset``/``update``: ``val`` (last value), ``sum``
+    (weighted sum), ``count`` (total weight) and ``avg`` (``sum / count``).
+    """
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set every statistic back to zero."""
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        """Record ``val`` with weight ``n`` and refresh the average."""
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        # With a total weight of zero there is nothing to average; keep the
+        # reset value instead of dividing by zero.
+        self.avg = self.sum / self.count if self.count else 0
+
+
+def mean_per_joint_position_error(pred, gt, has_3d_joints):
+    """
+    Compute mPJPE for the samples flagged by ``has_3d_joints == 1``.
+
+    Both joint sets are re-centred on the midpoint of joints 2 and 3
+    before the per-joint distances are averaged for each sample.
+    Returns the per-sample errors as a numpy array.
+    """
+    keep = has_3d_joints == 1
+    # Only the first three components of the ground truth are used.
+    gt = gt[keep][:, :, :3]
+    pred = pred[keep]
+
+    with paddle.no_grad():
+        gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
+        pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
+        gt = gt - gt_pelvis[:, None, :]
+        pred = pred - pred_pelvis[:, None, :]
+        joint_distance = paddle.sqrt(((pred - gt)**2).sum(axis=-1))
+        return joint_distance.mean(axis=-1).numpy()
+
+
+def compute_similarity_transform(S1, S2):
+    """Align the point set S1 to S2 with a similarity transform.
+
+    Solves the orthogonal Procrustes problem for a scale s, a rotation R
+    and a translation t, and returns S1 after applying them. Inputs whose
+    first dimension is neither 3 nor 2 are transposed before solving and
+    the result is transposed back.
+    """
+    transposed = S1.shape[0] not in (3, 2)
+    if transposed:
+        S1, S2 = S1.T, S2.T
+    assert (S2.shape[1] == S1.shape[1])
+
+    # 1. Centre both point sets on their means.
+    mean1 = S1.mean(axis=1, keepdims=True)
+    mean2 = S2.mean(axis=1, keepdims=True)
+    centered1 = S1 - mean1
+    centered2 = S2 - mean2
+
+    # 2. Total squared norm of the centred S1, used to recover the scale.
+    variance1 = np.sum(centered1**2)
+
+    # 3. Product of the centred sets.
+    K = centered1.dot(centered2.T)
+
+    # 4. The rotation maximizing trace(R'K) comes from the singular
+    # vectors of K.
+    U, _, Vh = np.linalg.svd(K)
+    V = Vh.T
+    # Flip the last axis when needed so that det(R) = 1.
+    orientation = np.eye(U.shape[0])
+    orientation[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
+    R = V.dot(orientation.dot(U.T))
+
+    # 5. Scale, 6. translation, 7. transformed points.
+    scale = np.trace(R.dot(K)) / variance1
+    t = mean2 - scale * (R.dot(mean1))
+    aligned = scale * R.dot(S1) + t
+
+    return aligned.T if transposed else aligned
+
+
+def compute_similarity_transform_batch(S1, S2):
+    """Apply ``compute_similarity_transform`` to each pair along axis 0."""
+    aligned = np.zeros_like(S1)
+    for index, source in enumerate(S1):
+        aligned[index] = compute_similarity_transform(source, S2[index])
+    return aligned
+
+
+def reconstruction_error(S1, S2, reduction='mean'):
+    """Do Procrustes alignment and compute reconstruction error.
+
+    ``reduction`` may be 'mean' or 'sum'; any other value returns the
+    per-sample errors unreduced.
+    """
+    aligned = compute_similarity_transform_batch(S1, S2)
+    per_sample = np.sqrt(((aligned - S2)**2).sum(axis=-1)).mean(axis=-1)
+    if reduction == 'mean':
+        return per_sample.mean()
+    if reduction == 'sum':
+        return per_sample.sum()
+    return per_sample
+
+
+def all_gather(data):
+    """Gather ``data`` from all ranks and concatenate it along axis 0.
+
+    With a world size of 1 the input is returned untouched.
+    """
+    if paddle.distributed.get_world_size() == 1:
+        return data
+    gathered = []
+    paddle.distributed.all_gather(gathered, data)
+    return paddle.concat(gathered, 0)
+
+
+class Pose3DEval(object):
+    """Running mPJPE / PAmPJPE evaluation of 3D pose predictions."""
+
+    def __init__(self, output_eval, save_prediction_only=False):
+        super(Pose3DEval, self).__init__()
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "pose3d_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Start fresh meters and clear the evaluation results."""
+        self.PAmPJPE = AverageMeter()
+        self.mPJPE = AverageMeter()
+        self.eval_results = {}
+
+    def get_human36m_joints(self, input):
+        """Select joints 0-12 and 18 along axis 1 (24 -> 14 joints)."""
+        # Only the 14-joint index list is needed, so no other index tensor
+        # is built here.
+        J24_TO_J14 = paddle.to_tensor(
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
+        return paddle.index_select(input, J24_TO_J14, axis=1)
+
+    def update(self, inputs, outputs):
+        """Fold one batch into the running mPJPE and PAmPJPE meters.
+
+        Args:
+            inputs (dict): reads 'joints_3d' and 'has_3d_joints'.
+            outputs (dict): reads 'pose3d'.
+        """
+        local_rank = ParallelEnv().local_rank
+        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(local_rank))
+        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(local_rank))
+        pred_3d_joints = all_gather(outputs['pose3d'])
+        # 24-joint inputs are reduced to the 14-joint subset first.
+        if gt_3d_joints.shape[1] == 24:
+            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
+        if pred_3d_joints.shape[1] == 24:
+            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
+        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
+                                                  has_3d_joints).mean()
+        PAmPJPE_val = reconstruction_error(
+            pred_3d_joints.numpy(),
+            gt_3d_joints[:, :, :3].numpy(),
+            reduction=None).mean()
+        # Each batch is weighted by its number of flagged samples.
+        count = int(np.sum(has_3d_joints.numpy()))
+        # NOTE(review): the factor 1000 presumably converts metres to
+        # millimetres - confirm against the dataset units.
+        self.PAmPJPE.update(PAmPJPE_val * 1000., count)
+        self.mPJPE.update(mPJPE_val * 1000., count)
+
+    def accumulate(self):
+        """Store the negated meter averages under ``eval_results['pose3d']``."""
+        if self.save_prediction_only:
+            logger.info(f'The pose3d result is saved to {self.res_file} '
+                        'and do not evaluate the model.')
+            return
+        # The averages are stored negated; log() prints their absolute
+        # values.
+        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]
+
+    def log(self):
+        """Print the accumulated results as a small table."""
+        if self.save_prediction_only:
+            return
+        stats_names = ['mPJPE', 'PAmPJPE']
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(abs(value))
+            for value in self.eval_results['pose3d']
+        ]) + ' |')
+
+    def get_results(self):
+        """Return the dictionary filled in by ``accumulate``."""
+        return self.eval_results
--- a/rtdetr_paddle/ppdet/metrics/widerface_utils.py
+++ b/rtdetr_paddle/ppdet/metrics/widerface_utils.py
@@ -0,0 +1,391 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cv2
+import numpy as np
+from collections import OrderedDict
+
+import paddle
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['face_eval_run', 'lmk2out']
+
+
+def face_eval_run(model,
+                  image_dir,
+                  gt_file,
+                  pred_dir='output/pred',
+                  eval_mode='widerface',
+                  multi_scale=False):
+    """
+    Run face detection over every image listed in a ground-truth file and
+    write the predictions to disk.
+
+    Args:
+        model: detector, forwarded to `detect_face` and the test helpers.
+        image_dir (str): directory the image names are joined onto.
+        gt_file (str): ground-truth list. Each record is read as an image-name
+            line, a count line, and then `count` lines that are skipped.
+        pred_dir (str): output directory for the prediction files.
+        eval_mode (str): 'widerface' writes one file per image; 'fddb' appends
+            '.jpg' to the image names and writes one combined file at the end.
+        multi_scale (bool): combine flipped / multi-scale / pyramid detections
+            with `bbox_vote` instead of running a single pass at scale 1.
+
+    Raises:
+        FileNotFoundError: a listed image does not exist.
+        ValueError: a listed image exists but cannot be decoded.
+    """
+    # load ground truth files
+    with open(gt_file, 'r') as f:
+        gt_lines = f.readlines()
+    imid2path = []
+    pos_gt = 0
+    while pos_gt < len(gt_lines):
+        name_gt = gt_lines[pos_gt].strip('\n\t').split()[0]
+        imid2path.append(name_gt)
+        pos_gt += 1
+        n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0])
+        # skip the count line itself plus the n_gt annotation lines
+        pos_gt += 1 + n_gt
+    logger.info('The ground truth file load {} images'.format(len(imid2path)))
+
+    dets_dist = OrderedDict()
+    for iter_id, im_path in enumerate(imid2path):
+        image_path = os.path.join(image_dir, im_path)
+        if eval_mode == 'fddb':
+            image_path += '.jpg'
+        # Explicit checks instead of `assert`, which is stripped under -O.
+        if not os.path.exists(image_path):
+            raise FileNotFoundError(
+                'Image file not found: {}'.format(image_path))
+        image = cv2.imread(image_path)
+        # cv2.imread signals an unreadable file by returning None.
+        if image is None:
+            raise ValueError('Failed to decode image: {}'.format(image_path))
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        if multi_scale:
+            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
+            det0 = detect_face(model, image, shrink)
+            det1 = flip_test(model, image, shrink)
+            [det2, det3] = multi_scale_test(model, image, max_shrink)
+            det4 = multi_scale_test_pyramid(model, image, max_shrink)
+            # np.vstack is the non-deprecated spelling of np.row_stack
+            det = np.vstack((det0, det1, det2, det3, det4))
+            dets = bbox_vote(det)
+        else:
+            dets = detect_face(model, image, 1)
+        if eval_mode == 'widerface':
+            save_widerface_bboxes(image_path, dets, pred_dir)
+        else:
+            dets_dist[im_path] = dets
+        if iter_id % 100 == 0:
+            logger.info('Test iter {}'.format(iter_id))
+    if eval_mode == 'fddb':
+        save_fddb_bboxes(dets_dist, pred_dir)
+    logger.info("Finish evaluation.")
+
+
+def detect_face(model, image, shrink):
+    """
+    Run the detector on one image, optionally resized by `shrink`.
+
+    Args:
+        model: callable detector; it is switched to eval mode and called with
+            a dict holding `image`, `im_shape` and `scale_factor` tensors.
+        image (np.ndarray): image with height and width in its first two axes.
+        shrink (float): resize factor; 1 leaves the image untouched.
+
+    Returns:
+        np.ndarray: one row per detection, built from columns 2, 3, 4, 5 and 1
+        of the model's `bbox` output (xmin, ymin, xmax, ymax, score). A single
+        all-zero row is returned when the output holds exactly one element.
+    """
+    h, w = image.shape[0], image.shape[1]
+    if shrink != 1:
+        h, w = int(h * shrink), int(w * shrink)
+        image = cv2.resize(image, (w, h))
+
+    batch = face_img_process(image)
+    feed = {
+        "image": paddle.to_tensor(
+            batch, dtype='float32'),
+        "im_shape": paddle.to_tensor(
+            np.asarray([[h, w]]), dtype='float32'),
+        "scale_factor": paddle.to_tensor(
+            np.asarray([[shrink, shrink]]), dtype='float32')
+    }
+    model.eval()
+    raw = model(feed)['bbox'].numpy()
+    # layout: xmin, ymin, xmax, ymax, score
+    if np.prod(raw.shape) == 1:
+        logger.info("No face detected")
+        return np.array([[0, 0, 0, 0, 0]])
+
+    # Move the score (column 1) behind the four box coordinates.
+    return np.column_stack(
+        (raw[:, 2], raw[:, 3], raw[:, 4], raw[:, 5], raw[:, 1]))
+
+
+def flip_test(model, image, shrink):
+    """
+    Detect faces on the horizontally flipped image and map the boxes back.
+
+    Args:
+        model: detector forwarded to `detect_face`.
+        image (np.ndarray): input image; its second axis is the width.
+        shrink (float): scale factor forwarded to `detect_face`.
+
+    Returns:
+        np.ndarray: detections with the same shape as `detect_face` returned,
+        with x-coordinates mirrored back into the unflipped frame.
+    """
+    width = image.shape[1]
+    mirrored = detect_face(model, cv2.flip(image, 1), shrink)
+
+    restored = np.zeros(mirrored.shape)
+    # Mirroring swaps the roles of xmin and xmax.
+    restored[:, 0] = width - mirrored[:, 2]
+    restored[:, 2] = width - mirrored[:, 0]
+    # ymin, ymax and score are unaffected by a horizontal flip.
+    restored[:, [1, 3, 4]] = mirrored[:, [1, 3, 4]]
+    return restored
+
+
+def multi_scale_test(model, image, max_shrink):
+    """
+    Detect faces on one shrunken and one or more enlarged copies of an image.
+
+    Args:
+        model: detector forwarded to `detect_face`.
+        image (np.ndarray): input image, forwarded unchanged to `detect_face`.
+        max_shrink (float): largest scale factor that will be used.
+
+    Returns:
+        tuple: `(det_s, det_b)`. `det_s` holds detections from the shrunken
+        pass whose longer side exceeds 30. `det_b` holds detections from the
+        other pass(es), filtered by size depending on the final scale.
+    """
+    # Shrink detecting is only used to detect big faces
+    st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink
+    det_s = detect_face(model, image, st)
+    index = np.where(
+        np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1)
+        > 30)[0]
+    det_s = det_s[index, :]
+    # Enlarge one times
+    bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
+    det_b = detect_face(model, image, bt)
+
+    # Enlarge small image x times for small faces
+    if max_shrink > 2:
+        bt *= 2
+        while bt < max_shrink:
+            # np.vstack replaces the deprecated alias np.row_stack
+            det_b = np.vstack((det_b, detect_face(model, image, bt)))
+            bt *= 2
+        det_b = np.vstack((det_b, detect_face(model, image, max_shrink)))
+
+    # Enlarged images are only used to detect small faces.
+    if bt > 1:
+        index = np.where(
+            np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
+                       det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
+        det_b = det_b[index, :]
+    # Shrinked images are only used to detect big faces.
+    else:
+        index = np.where(
+            np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
+                       det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
+        det_b = det_b[index, :]
+    return det_s, det_b
+
+
+def multi_scale_test_pyramid(model, image, max_shrink):
+    """
+    Detect faces over a fixed pyramid of image scales.
+
+    A pass at scale 0.25 always runs; scales 0.75, 1.25, 1.5 and 1.75 run only
+    when they do not exceed `max_shrink`.
+
+    Args:
+        model: detector forwarded to `detect_face`.
+        image (np.ndarray): input image, forwarded unchanged to `detect_face`.
+        max_shrink (float): largest scale factor that may be used.
+
+    Returns:
+        np.ndarray: size-filtered detections of all passes stacked row-wise.
+    """
+    # Use image pyramids to detect faces
+    det_b = detect_face(model, image, 0.25)
+    index = np.where(
+        np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
+        > 30)[0]
+    det_b = det_b[index, :]
+
+    for scale in (0.75, 1.25, 1.5, 1.75):
+        if scale > max_shrink:
+            continue
+        det_temp = detect_face(model, image, scale)
+        box_w = det_temp[:, 2] - det_temp[:, 0] + 1
+        box_h = det_temp[:, 3] - det_temp[:, 1] + 1
+        # Enlarged images are only used to detect small faces.
+        if scale > 1:
+            index = np.where(np.minimum(box_w, box_h) < 100)[0]
+        # Shrinked images are only used to detect big faces.
+        else:
+            index = np.where(np.maximum(box_w, box_h) > 30)[0]
+        det_temp = det_temp[index, :]
+        # np.vstack replaces the deprecated alias np.row_stack
+        det_b = np.vstack((det_b, det_temp))
+    return det_b
+
+
+def to_chw(image):
+    """
+    Move the last axis of a 3-D image to the front (HWC -> CHW).
+
+    Args:
+        image (np.array): an image with HWC layout.
+
+    Returns:
+        np.array: the CHW view for 3-D input; any other rank is returned as is.
+    """
+    if len(image.shape) != 3:
+        return image
+    # One transpose replaces the two consecutive axis swaps.
+    return np.transpose(image, (2, 0, 1))
+
+
+def face_img_process(image,
+                     mean=(104., 117., 123.),
+                     std=(127.502231, 127.502231, 127.502231)):
+    """
+    Convert an image into a normalized float32 batch of one.
+
+    Args:
+        image: array-like image; it is copied with `np.array` and passed
+            through `to_chw`.
+        mean (sequence of float): per-channel values subtracted after the
+            layout change. Tuples are used as defaults so that no mutable
+            object is shared between calls.
+        std (sequence of float): per-channel divisors applied after `mean`.
+
+    Returns:
+        np.ndarray: float32 array with one extra leading axis of length 1.
+    """
+    img = np.array(image)
+    img = to_chw(img)
+    img = img.astype('float32')
+    # mean/std are reshaped to (C, 1, 1) so they broadcast over both spatial axes
+    img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
+    img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32')
+    img = [img]
+    img = np.array(img)
+    return img
+
+
+def get_shrink(height, width):
+    """
+    Derive the scale limits used by multi-scale face testing.
+
+    Args:
+        height (int): image height.
+        width (int): image width.
+
+    Returns:
+        tuple: `(shrink, max_shrink)`. `max_shrink` is the largest scale
+        factor allowed for this image size; `shrink` is `max_shrink` capped
+        at 1.
+    """
+    # avoid out of memory
+    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
+    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
+
+    def get_round(x, loc):
+        # Truncate (not round) x to `loc` decimals using its decimal repr.
+        str_x = str(x)
+        # Exponent-form reprs ('1.66e-05') and 'inf'/'nan' cannot be truncated
+        # textually: cutting '1.66e-05' would yield 1.66. Pass them through.
+        if '.' not in str_x or 'e' in str_x:
+            return x
+        str_before, str_after = str_x.split('.')
+        if len(str_after) >= 3:
+            return float(str_before + '.' + str_after[0:loc])
+        return x
+
+    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
+    # Larger limits get a larger safety margin; tiny ones are clamped to 0.1.
+    if max_shrink >= 1.5 and max_shrink < 2:
+        max_shrink = max_shrink - 0.1
+    elif max_shrink >= 2 and max_shrink < 3:
+        max_shrink = max_shrink - 0.2
+    elif max_shrink >= 3 and max_shrink < 4:
+        max_shrink = max_shrink - 0.3
+    elif max_shrink >= 4 and max_shrink < 5:
+        max_shrink = max_shrink - 0.4
+    elif max_shrink >= 5:
+        max_shrink = max_shrink - 0.5
+    elif max_shrink <= 0.1:
+        max_shrink = 0.1
+
+    shrink = max_shrink if max_shrink < 1 else 1
+    return shrink, max_shrink
+
+
+def bbox_vote(det):
+    """
+    Merge overlapping detections by score-weighted box voting.
+
+    Detections are processed in descending score order. All boxes whose IoU
+    with the current top box is at least 0.3 are replaced by one box whose
+    coordinates are the score-weighted mean and whose score is the maximum.
+    A box with no merge partner is dropped, unless it is the last one left.
+
+    Args:
+        det (np.ndarray): rows of xmin, ymin, xmax, ymax, score.
+
+    Returns:
+        np.ndarray: at most the first 750 voted rows, keeping only scores
+        >= 0.01.
+    """
+    order = det[:, 4].ravel().argsort()[::-1]
+    det = det[order, :]
+    # Results are collected in a list and stacked once at the end; this
+    # replaces the bare `except:` that was used to detect the first append.
+    voted = []
+    if det.shape[0] == 0:
+        # placeholder row; its score is below the final 0.01 cut-off
+        voted.append(np.array([[10, 10, 20, 20, 0.002]]))
+    while det.shape[0] > 0:
+        # IOU
+        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
+        xx1 = np.maximum(det[0, 0], det[:, 0])
+        yy1 = np.maximum(det[0, 1], det[:, 1])
+        xx2 = np.minimum(det[0, 2], det[:, 2])
+        yy2 = np.minimum(det[0, 3], det[:, 3])
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        o = inter / (area[0] + area[:] - inter)
+
+        # nms
+        merge_index = np.where(o >= 0.3)[0]
+        if merge_index.shape[0] == 0:
+            # A degenerate top box (e.g. non-positive area) does not even
+            # match itself; consume it anyway so the loop always terminates.
+            merge_index = np.array([0])
+        det_accu = det[merge_index, :]
+        det = np.delete(det, merge_index, 0)
+        if merge_index.shape[0] <= 1:
+            if det.shape[0] == 0:
+                voted.append(det_accu)
+            continue
+        # weight each box's coordinates by its score
+        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
+        max_score = np.max(det_accu[:, 4])
+        det_accu_sum = np.zeros((1, 5))
+        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
+                                      axis=0) / np.sum(det_accu[:, -1:])
+        det_accu_sum[:, 4] = max_score
+        voted.append(det_accu_sum)
+    dets = np.vstack(voted)
+    dets = dets[0:750, :]
+    keep_index = np.where(dets[:, 4] >= 0.01)[0]
+    dets = dets[keep_index, :]
+    return dets
+
+
+def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
+    """
+    Write the detections of one image to `<output_dir>/<class>/<name>.txt`.
+
+    The file holds a `<class>/<file name>` line, the number of boxes, and one
+    `xmin ymin width height score` line per box.
+
+    Args:
+        image_path (str): '/'-separated path; the last component is the file
+            name and the one before it the class folder.
+        bboxes_scores (np.ndarray): rows of xmin, ymin, xmax, ymax, score.
+        output_dir (str): root directory for the prediction files.
+    """
+    image_name = image_path.split('/')[-1]
+    image_class = image_path.split('/')[-2]
+    odir = os.path.join(output_dir, image_class)
+    # exist_ok avoids the race between an existence check and the creation
+    os.makedirs(odir, exist_ok=True)
+
+    # the last four characters are dropped as the file extension
+    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
+    # `with` closes the file even when a write raises
+    with open(ofname, 'w') as f:
+        f.write('{:s}\n'.format(image_class + '/' + image_name))
+        f.write('{:d}\n'.format(bboxes_scores.shape[0]))
+        for box_score in bboxes_scores:
+            xmin, ymin, xmax, ymax, score = box_score
+            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
+                xmax - xmin + 1), (ymax - ymin + 1), score))
+    logger.info("The predicted result is saved as {}".format(ofname))
+
+
+def save_fddb_bboxes(bboxes_scores,
+                     output_dir,
+                     output_fname='pred_fddb_res.txt'):
+    """
+    Write the detections of all images to one combined text file.
+
+    For each image the file holds the image path, the number of boxes, and one
+    `xmin ymin width height score` line per box.
+
+    Args:
+        bboxes_scores (dict): maps an image path to an array whose rows are
+            xmin, ymin, xmax, ymax, score.
+        output_dir (str): directory for the result file; created if missing.
+        output_fname (str): name of the result file.
+
+    Returns:
+        str: path of the written file.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    predict_file = os.path.join(output_dir, output_fname)
+    # `with` guarantees the file is flushed and closed before returning
+    with open(predict_file, 'w') as f:
+        # dict.iteritems() does not exist on Python 3; items() is equivalent
+        for image_path, dets in bboxes_scores.items():
+            f.write('{:s}\n'.format(image_path))
+            f.write('{:d}\n'.format(dets.shape[0]))
+            for box_score in dets:
+                xmin, ymin, xmax, ymax, score = box_score
+                width, height = xmax - xmin, ymax - ymin
+                f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
+                        .format(xmin, ymin, width, height, score))
+    logger.info("The predicted result is saved as {}".format(predict_file))
+    return predict_file
+
+
+def lmk2out(results, is_bbox_normalized=False):
+    """
+    Decode predicted face landmarks into a flat list of per-face records.
+
+    Args:
+        results: iterable of dicts, each with `bbox`, `im_id`, `face_index`,
+                 `prior_boxes` and `landmark`;
+                 if is_bbox_normalized=True, also need `im_shape`.
+        is_bbox_normalized: whether or not landmark is normalized.
+
+    Returns:
+        list of dict: one entry per face with keys `image_id`, `landmark`
+        (10 values, x/y interleaved) and `score`.
+    """
+    xywh_res = []
+    for t in results:
+        bboxes = t['bbox'][0]
+        lengths = t['bbox'][1][0]
+        im_ids = np.array(t['im_id'][0]).flatten()
+        # None has to be ruled out before `.shape` is touched
+        if bboxes is None or bboxes.shape == (1, 1):
+            continue
+        face_index = t['face_index'][0]
+        prior_box = t['prior_boxes'][0]
+        predict_lmk = t['landmark'][0]
+        prior = np.reshape(prior_box, (-1, 4))
+        predictlmk = np.reshape(predict_lmk, (-1, 10))
+
+        # k counts faces across the whole batch, i within one image
+        k = 0
+        for a in range(len(lengths)):
+            num = lengths[a]
+            im_id = int(im_ids[a])
+            for i in range(num):
+                score = bboxes[k][1]
+                # NOTE(review): face_index is indexed with the per-image
+                # counter `i` while bboxes uses the batch counter `k` --
+                # confirm this is intended for batches with several images.
+                theindex = face_index[i][0]
+                me_prior = prior[theindex, :]
+                lmk_pred = predictlmk[theindex, :]
+                prior_w = me_prior[2] - me_prior[0]
+                prior_h = me_prior[3] - me_prior[1]
+                prior_w_center = (me_prior[2] + me_prior[0]) / 2
+                prior_h_center = (me_prior[3] + me_prior[1]) / 2
+                lmk_decode = np.zeros((10))
+                # even slots are x offsets, odd slots are y offsets
+                for j in range(0, 10, 2):
+                    lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center
+                for j in range(1, 10, 2):
+                    lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center
+                if is_bbox_normalized:
+                    # im_shape is only required (and only read) in this mode
+                    im_shape = t['im_shape'][0][a].tolist()
+                    image_h, image_w = int(im_shape[0]), int(im_shape[1])
+                    lmk_decode = lmk_decode * np.array([image_w, image_h] * 5)
+                lmk_res = {
+                    'image_id': im_id,
+                    'landmark': lmk_decode,
+                    'score': score,
+                }
+                xywh_res.append(lmk_res)
+                k += 1
+    return xywh_res
--- a/rtdetr_paddle/ppdet/modeling/init.py
+++ b/rtdetr_paddle/ppdet/modeling/init.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import warnings
+warnings.filterwarnings(
+    action='ignore', category=DeprecationWarning, module='ops')
+
+
+from .ops import *
+from .backbones import *
+from .heads import *
+from .losses import *
+from .architectures import *
+from .post_process import *
+from .layers import *
+from .transformers import *
--- a/rtdetr_paddle/ppdet/modeling/architectures/init.py
+++ b/rtdetr_paddle/ppdet/modeling/architectures/init.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from .meta_arch import *
+from .detr import *
--- a/rtdetr_paddle/ppdet/modeling/architectures/detr.py
+++ b/rtdetr_paddle/ppdet/modeling/architectures/detr.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from .meta_arch import BaseArch
+from ppdet.core.workspace import register, create
+
+__all__ = ['DETR']
+# Deformable DETR, DINO use the same architecture as DETR
+
+
+@register
+class DETR(BaseArch):
+    """
+    Detector assembled from a backbone, an optional neck, a transformer and a
+    DETR head, with optional post-processing at inference time.
+
+    Args:
+        backbone: feature extractor, called with the input dict.
+        transformer: module called with features, pad mask and inputs.
+        detr_head: head returning losses in training, predictions otherwise.
+        neck: optional module applied to the backbone features.
+        post_process: converts head predictions to boxes (injected).
+        with_mask (bool): include `mask` in the inference output.
+        exclude_post_process (bool): return head predictions unprocessed.
+    """
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+    __shared__ = ['with_mask', 'exclude_post_process']
+
+    def __init__(self,
+                 backbone,
+                 transformer='DETRTransformer',
+                 detr_head='DETRHead',
+                 neck=None,
+                 post_process='DETRPostProcess',
+                 with_mask=False,
+                 exclude_post_process=False):
+        super(DETR, self).__init__()
+        # sub-modules, assigned in a fixed order
+        self.backbone = backbone
+        self.transformer = transformer
+        self.detr_head = detr_head
+        self.neck = neck
+        self.post_process = post_process
+        # plain flags
+        self.with_mask = with_mask
+        self.exclude_post_process = exclude_post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Build backbone, neck, transformer and head from `cfg`."""
+        backbone = create(cfg['backbone'])
+
+        # The neck (if configured) is sized from the backbone output.
+        shape_kwargs = {'input_shape': backbone.out_shape}
+        neck = None
+        if cfg['neck']:
+            neck = create(cfg['neck'], **shape_kwargs)
+
+        # The transformer is sized from the neck when there is one.
+        if neck is not None:
+            shape_kwargs = {'input_shape': neck.out_shape}
+        transformer = create(cfg['transformer'], **shape_kwargs)
+
+        # The head always receives the backbone output shape.
+        head_kwargs = {
+            'hidden_dim': transformer.hidden_dim,
+            'nhead': transformer.nhead,
+            'input_shape': backbone.out_shape
+        }
+        detr_head = create(cfg['detr_head'], **head_kwargs)
+
+        return {
+            'backbone': backbone,
+            'transformer': transformer,
+            "detr_head": detr_head,
+            "neck": neck
+        }
+
+    def _forward(self):
+        """Run the full pipeline on `self.inputs` (losses or predictions)."""
+        # Backbone, then the optional neck
+        feats = self.backbone(self.inputs)
+        if self.neck is not None:
+            feats = self.neck(feats)
+
+        # Transformer
+        pad_mask = self.inputs.get('pad_mask', None)
+        out_transformer = self.transformer(feats, pad_mask, self.inputs)
+
+        # DETR Head, training: sum every entry whose key lacks 'log'
+        if self.training:
+            losses = self.detr_head(out_transformer, feats, self.inputs)
+            total = paddle.add_n(
+                [v for k, v in losses.items() if 'log' not in k])
+            losses['loss'] = total
+            return losses
+
+        # DETR Head, inference
+        preds = self.detr_head(out_transformer, feats)
+        if not self.exclude_post_process:
+            preds = self.post_process(
+                preds, self.inputs['im_shape'], self.inputs['scale_factor'],
+                paddle.shape(self.inputs['image'])[2:])
+        bbox, bbox_num, mask = preds
+
+        output = {'bbox': bbox, 'bbox_num': bbox_num}
+        if self.with_mask:
+            output['mask'] = mask
+        return output
+
+    def get_loss(self):
+        """Training entry point used by `BaseArch.forward`."""
+        return self._forward()
+
+    def get_pred(self):
+        """Inference entry point used by `BaseArch.forward`."""
+        return self._forward()
--- a/rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py
+++ b/rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py
@@ -0,0 +1,132 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import typing
+
+from ppdet.core.workspace import register
+from ppdet.modeling.post_process import nms
+
+__all__ = ['BaseArch']
+
+
+@register
+class BaseArch(nn.Layer):
+    """
+    Common base of detector architectures.
+
+    `forward` stores the inputs on the instance and dispatches to `get_loss`
+    in training mode or `get_pred` in eval mode; subclasses implement both.
+
+    Args:
+        data_format (str): 'NCHW' or 'NHWC'; with 'NHWC' the image is
+            transposed at the start of `forward`.
+        use_extra_data (bool): stored on the instance; not used in this class.
+    """
+
+    def __init__(self, data_format='NCHW', use_extra_data=False):
+        super(BaseArch, self).__init__()
+        self.data_format = data_format
+        self.inputs = {}
+        self.fuse_norm = False
+        self.use_extra_data = use_extra_data
+
+    def load_meanstd(self, cfg_transform):
+        """
+        Derive `self.scale` / `self.bias` from the first `NormalizeImage`
+        entry in `cfg_transform`, so that `image * scale + bias` normalizes.
+        Fixed default mean/std values are used when no such entry exists.
+        """
+        scale = 1.
+        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+        for item in cfg_transform:
+            if 'NormalizeImage' in item:
+                mean = np.array(
+                    item['NormalizeImage']['mean'], dtype=np.float32)
+                std = np.array(item['NormalizeImage']['std'], dtype=np.float32)
+                if item['NormalizeImage'].get('is_scale', True):
+                    scale = 1. / 255.
+                break
+        # the channel axis is last for NHWC and second for NCHW
+        if self.data_format == 'NHWC':
+            self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3))
+            self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3))
+        else:
+            self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1))
+            self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1))
+
+    def forward(self, inputs):
+        """
+        Run the model on `inputs`.
+
+        Returns the result of `get_loss` in training mode. In eval mode
+        `get_pred` runs once per input (a sequence of inputs is treated as
+        multi-scale) and several outputs are merged.
+        """
+        if self.data_format == 'NHWC':
+            image = inputs['image']
+            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])
+
+        if self.fuse_norm:
+            image = inputs['image']
+            self.inputs['image'] = image * self.scale + self.bias
+            self.inputs['im_shape'] = inputs['im_shape']
+            self.inputs['scale_factor'] = inputs['scale_factor']
+        else:
+            self.inputs = inputs
+
+        self.model_arch()
+
+        if self.training:
+            out = self.get_loss()
+        else:
+            inputs_list = []
+            # multi-scale input
+            if not isinstance(inputs, typing.Sequence):
+                inputs_list.append(inputs)
+            else:
+                inputs_list.extend(inputs)
+            outs = []
+            for inp in inputs_list:
+                if self.fuse_norm:
+                    self.inputs['image'] = inp['image'] * self.scale + self.bias
+                    self.inputs['im_shape'] = inp['im_shape']
+                    self.inputs['scale_factor'] = inp['scale_factor']
+                else:
+                    self.inputs = inp
+                outs.append(self.get_pred())
+
+            # multi-scale test
+            if len(outs) > 1:
+                out = self.merge_multi_scale_predictions(outs)
+            else:
+                out = outs[0]
+        return out
+
+    def merge_multi_scale_predictions(self, outs):
+        """
+        Merge the `bbox` outputs of several scales with per-class NMS and
+        keep the `keep_top_k` highest-scoring boxes.
+
+        Raises:
+            Exception: the architecture is not CascadeRCNN, FasterRCNN or
+                MaskRCNN.
+        """
+        if self.__class__.__name__ not in ('CascadeRCNN', 'FasterRCNN',
+                                           'MaskRCNN'):
+            raise Exception(
+                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
+            )
+        num_classes = self.bbox_head.num_classes
+        keep_top_k = self.bbox_post_process.nms.keep_top_k
+        nms_threshold = self.bbox_post_process.nms.nms_threshold
+
+        final_boxes = []
+        all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy()
+        for c in range(num_classes):
+            # column 0 holds the class id
+            idxs = all_scale_outs[:, 0] == c
+            if np.count_nonzero(idxs) == 0:
+                continue
+            r = nms(all_scale_outs[idxs, 1:], nms_threshold)
+            final_boxes.append(
+                np.concatenate([np.full((r.shape[0], 1), c), r], 1))
+        if final_boxes:
+            out = np.concatenate(final_boxes)
+        else:
+            # np.concatenate rejects an empty list; emit an empty result
+            out = np.zeros((0, 6), dtype=all_scale_outs.dtype)
+        if out.shape[0] > 0:
+            # ascending sort on the score column, then keep the tail
+            out = np.concatenate(sorted(
+                out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6))
+        out = {
+            'bbox': paddle.to_tensor(out),
+            'bbox_num': paddle.to_tensor(np.array([out.shape[0], ]))
+        }
+
+        return out
+
+    def build_inputs(self, data, input_def):
+        """Pair the names in `input_def` with the entries of `data` by position."""
+        inputs = {}
+        for i, k in enumerate(input_def):
+            inputs[k] = data[i]
+        return inputs
+
+    def model_arch(self, ):
+        """Hook called by `forward` after the inputs are stored; no-op here."""
+        pass
+
+    def get_loss(self, ):
+        """Return the training losses; must be implemented by subclasses."""
+        raise NotImplementedError("Should implement get_loss method!")
+
+    def get_pred(self, ):
+        """Return the predictions; must be implemented by subclasses."""
+        raise NotImplementedError("Should implement get_pred method!")
--- a/rtdetr_paddle/ppdet/modeling/backbones/init.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/init.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from .resnet import *
+from .darknet import *
+from .mobilenet_v1 import *
+from .mobilenet_v3 import *
+from .shufflenet_v2 import *
+from .swin_transformer import *
+from .lcnet import *
+from .cspresnet import *
+from .csp_darknet import *
+from .convnext import *
+from .vision_transformer import *
+from .mobileone import *
+from .trans_encoder import *
+from .focalnet import *
+from .vit_mae import *
+from .hgnet_v2 import *
--- a/rtdetr_paddle/ppdet/modeling/backbones/convnext.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/convnext.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+'''
+Modified from https://github.com/facebookresearch/ConvNeXt
+Copyright (c) Meta Platforms, Inc. and affiliates.
+All rights reserved.
+This source code is licensed under the license found in the
+LICENSE file in the root directory of this source tree.
+'''
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+
+import numpy as np
+
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+from .transformer_utils import DropPath, trunc_normal_, zeros_
+
+__all__ = ['ConvNeXt']
+
+
+class Block(nn.Layer):
+    r""" ConvNeXt residual block.
+
+    A 7x7 depthwise conv is followed, in channels-last layout, by LayerNorm,
+    a 4x expanding Linear, GELU and a projecting Linear. The result is
+    optionally scaled per channel, passed through drop path and added to the
+    block input.
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+    """
+
+    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+        super().__init__()
+        hidden_dim = 4 * dim
+        # depthwise conv
+        self.dwconv = nn.Conv2D(
+            dim, dim, kernel_size=7, padding=3, groups=dim)
+        self.norm = LayerNorm(dim, eps=1e-6)
+        # pointwise/1x1 convs, implemented with linear layers
+        self.pwconv1 = nn.Linear(dim, hidden_dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(hidden_dim, dim)
+
+        # learnable per-channel scale, disabled for non-positive init values
+        if layer_scale_init_value > 0:
+            scale_attr = ParamAttr(
+                initializer=Constant(layer_scale_init_value))
+            self.gamma = self.create_parameter(shape=(dim, ), attr=scale_attr)
+        else:
+            self.gamma = None
+
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, x):
+        shortcut = x
+        # (N, C, H, W) -> (N, H, W, C) for the norm and linear layers
+        y = self.dwconv(x).transpose([0, 2, 3, 1])
+        y = self.pwconv2(self.act(self.pwconv1(self.norm(y))))
+        if self.gamma is not None:
+            y = self.gamma * y
+        # back to (N, C, H, W) before the residual add
+        y = y.transpose([0, 3, 1, 2])
+        return shortcut + self.drop_path(y)
+
+
+class LayerNorm(nn.Layer):
+    r""" Layer normalization over the channel axis for two data layouts.
+
+    "channels_last" (default) expects (batch_size, height, width, channels)
+    and uses `F.layer_norm`; "channels_first" expects
+    (batch_size, channels, height, width) and normalizes over axis 1 by hand.
+    Any other `data_format` raises NotImplementedError.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+
+        param_shape = (normalized_shape, )
+        # affine parameters start as the identity transform
+        self.weight = self.create_parameter(
+            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
+        self.bias = self.create_parameter(
+            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))
+
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_first":
+            mean = x.mean(1, keepdim=True)
+            centered = x - mean
+            var = centered.pow(2).mean(1, keepdim=True)
+            normed = (x - mean) / paddle.sqrt(var + self.eps)
+            # weight/bias gain two trailing axes to broadcast over H and W
+            return self.weight[:, None, None] * normed + self.bias[:, None,
+                                                                   None]
+        elif self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight,
+                                self.bias, self.eps)
+
+
+@register
+@serializable
+class ConvNeXt(nn.Layer):
+    r""" ConvNeXt
+        A Paddle impl of : `A ConvNet for the 2020s`  -
+          https://arxiv.org/pdf/2201.03545.pdf
+
+    Args:
+        arch (str): key into `arch_settings` selecting depths and dims.
+        in_chans (int): Number of input image channels. Default: 3
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+        return_idx (list(int)): indices of the stages whose outputs are returned.
+        norm_output (bool): apply a LayerNorm to every returned feature map.
+        pretrained (str): URL or local path of weights to load. Default: None
+    """
+
+    arch_settings = {
+        'tiny': {
+            'depths': [3, 3, 9, 3],
+            'dims': [96, 192, 384, 768]
+        },
+        'small': {
+            'depths': [3, 3, 27, 3],
+            'dims': [96, 192, 384, 768]
+        },
+        'base': {
+            'depths': [3, 3, 27, 3],
+            'dims': [128, 256, 512, 1024]
+        },
+        'large': {
+            'depths': [3, 3, 27, 3],
+            'dims': [192, 384, 768, 1536]
+        },
+        'xlarge': {
+            'depths': [3, 3, 27, 3],
+            'dims': [256, 512, 1024, 2048]
+        },
+    }
+
+    def __init__(
+            self,
+            arch='tiny',
+            in_chans=3,
+            drop_path_rate=0.,
+            layer_scale_init_value=1e-6,
+            return_idx=[1, 2, 3],
+            norm_output=True,
+            pretrained=None, ):
+        super().__init__()
+        setting = self.arch_settings[arch]
+        depths, dims = setting['depths'], setting['dims']
+
+        # stem and 3 intermediate downsampling conv layers
+        self.downsample_layers = nn.LayerList()
+        self.downsample_layers.append(
+            nn.Sequential(
+                nn.Conv2D(
+                    in_chans, dims[0], kernel_size=4, stride=4),
+                LayerNorm(
+                    dims[0], eps=1e-6, data_format="channels_first")))
+        for c_in, c_out in zip(dims[:3], dims[1:4]):
+            self.downsample_layers.append(
+                nn.Sequential(
+                    LayerNorm(
+                        c_in, eps=1e-6, data_format="channels_first"),
+                    nn.Conv2D(
+                        c_in, c_out, kernel_size=2, stride=2), ))
+
+        # 4 feature resolution stages, each consisting of multiple residual blocks
+        self.stages = nn.LayerList()
+        # drop-path rate grows linearly from 0 over all blocks of the network
+        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
+        offset = 0
+        for stage_dim, depth in zip(dims[:4], depths[:4]):
+            blocks = [
+                Block(
+                    dim=stage_dim,
+                    drop_path=rate,
+                    layer_scale_init_value=layer_scale_init_value)
+                for rate in dp_rates[offset:offset + depth]
+            ]
+            self.stages.append(nn.Sequential(*blocks))
+            offset += depth
+
+        self.return_idx = return_idx
+        # channel counts of the returned stages, in return order
+        self.dims = [dims[i] for i in return_idx]
+
+        self.norm_output = norm_output
+        if norm_output:
+            self.norms = nn.LayerList([
+                LayerNorm(
+                    c, eps=1e-6, data_format="channels_first")
+                for c in self.dims
+            ])
+
+        self.apply(self._init_weights)
+
+        if pretrained is not None:
+            path = pretrained  # model in local path
+            if 'http' in pretrained:  # URL
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            self.set_state_dict(paddle.load(path))
+
+    def _init_weights(self, m):
+        # only conv and linear layers are re-initialized
+        if not isinstance(m, (nn.Conv2D, nn.Linear)):
+            return
+        trunc_normal_(m.weight)
+        zeros_(m.bias)
+
+    def forward_features(self, x):
+        """Run all stages and return the feature maps picked by `return_idx`."""
+        feats = []
+        for downsample, stage in zip(self.downsample_layers, self.stages):
+            x = stage(downsample(x))
+            feats.append(x)
+
+        selected = [feats[i] for i in self.return_idx]
+        if self.norm_output:
+            # norms[k] belongs to the k-th returned feature map
+            selected = [
+                norm(feat) for norm, feat in zip(self.norms, selected)
+            ]
+
+        return selected
+
+    def forward(self, x):
+        # x is the input dict; only its 'image' entry is used
+        return self.forward_features(x['image'])
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self.dims]
--- a/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
@@ -0,0 +1,404 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.initializer import conv_init_
+from ..shape_spec import ShapeSpec
+
+__all__ = [
+    'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
+]
+
+
+class BaseConv(nn.Layer):
+    """Convolution followed by batch norm and a SiLU activation.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        ksize (int): Kernel size; the padding is ``(ksize - 1) // 2``.
+        stride (int): Convolution stride.
+        groups (int): Number of convolution groups, default 1.
+        bias (bool): Passed to the convolution as ``bias_attr``,
+            default False.
+        act (str): Accepted but not read by this class; ``forward`` always
+            applies ``x * sigmoid(x)``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride,
+                 groups=1,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        pad = (ksize - 1) // 2
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            bias_attr=bias)
+
+        # Batch-norm scale and shift each get their own attr with L2Decay(0.0).
+        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
+        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
+        self.bn = nn.BatchNorm2D(
+            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+        self._init_weights()
+
+    def _init_weights(self):
+        """Initialise the convolution with ``conv_init_``."""
+        conv_init_(self.conv)
+
+    def forward(self, x):
+        """Apply conv and batch norm, then SiLU spelled as ``x * sigmoid(x)``."""
+        normed = self.bn(self.conv(x))
+        return normed * F.sigmoid(normed)
+
+
+class DWConv(nn.Layer):
+    """Depthwise conv followed by a 1x1 pointwise conv, both ``BaseConv``.
+
+    Args:
+        in_channels (int): Number of input channels; also the number of
+            groups and output channels of the depthwise conv.
+        out_channels (int): Number of output channels of the pointwise conv.
+        ksize (int): Kernel size of the depthwise conv.
+        stride (int): Stride of the depthwise conv, default 1.
+        bias (bool): Forwarded to both ``BaseConv`` layers, default False.
+        act (str): Forwarded to both ``BaseConv`` layers, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride=1,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        # groups == in_channels: every input channel is filtered on its own
+        self.dw_conv = BaseConv(
+            in_channels,
+            in_channels,
+            ksize=ksize,
+            stride=stride,
+            groups=in_channels,
+            bias=bias,
+            act=act)
+        # 1x1 conv mixes the channels and sets the output width
+        self.pw_conv = BaseConv(
+            in_channels,
+            out_channels,
+            ksize=1,
+            stride=1,
+            groups=1,
+            bias=bias,
+            act=act)
+
+    def forward(self, x):
+        """Run the depthwise conv, then the pointwise conv."""
+        depthwise_out = self.dw_conv(x)
+        return self.pw_conv(depthwise_out)
+
+
+class Focus(nn.Layer):
+    """Focus width and height information into channel space, used in YOLOX.
+
+    Args:
+        in_channels (int): Channels of the input; the inner conv receives
+            ``in_channels * 4`` channels.
+        out_channels (int): Channels produced by the inner conv.
+        ksize (int): Kernel size of the inner conv, default 3.
+        stride (int): Stride of the inner conv, default 1.
+        bias (bool): Forwarded to ``BaseConv``, default False.
+        act (str): Forwarded to ``BaseConv``, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=3,
+                 stride=1,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        self.conv = BaseConv(
+            in_channels * 4,
+            out_channels,
+            ksize=ksize,
+            stride=stride,
+            bias=bias,
+            act=act)
+
+    def forward(self, inputs):
+        """Interleave-sample the two trailing axes, stack on axis 1, then conv.
+
+        inputs [bs, C, H, W] -> stacked [bs, 4C, H/2, W/2] -> ``self.conv``.
+        """
+        # Every second element along axes 2 and 3, at the four possible offsets.
+        tl = inputs[:, :, 0::2, 0::2]
+        tr = inputs[:, :, 0::2, 1::2]
+        bl = inputs[:, :, 1::2, 0::2]
+        br = inputs[:, :, 1::2, 1::2]
+        # The channel order (tl, bl, tr, br) is what the conv weights expect.
+        stacked = paddle.concat([tl, bl, tr, br], 1)
+        return self.conv(stacked)
+
+
+class BottleNeck(nn.Layer):
+    """1x1 conv followed by a 3x3 conv, with an optional residual add.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        shortcut (bool): Request a residual connection; it is only applied
+            when ``in_channels == out_channels``. Default True.
+        expansion (float): Hidden width is ``int(out_channels * expansion)``,
+            default 0.5.
+        depthwise (bool): Use ``DWConv`` instead of ``BaseConv`` for the 3x3
+            conv, default False.
+        bias (bool): Forwarded to both convs, default False.
+        act (str): Forwarded to both convs, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 shortcut=True,
+                 expansion=0.5,
+                 depthwise=False,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        hidden = int(out_channels * expansion)
+        second_conv_cls = DWConv if depthwise else BaseConv
+        self.conv1 = BaseConv(
+            in_channels, hidden, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = second_conv_cls(
+            hidden, out_channels, ksize=3, stride=1, bias=bias, act=act)
+        self.add_shortcut = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        """Return ``conv2(conv1(x))``, plus ``x`` when the shortcut is active."""
+        out = self.conv2(self.conv1(x))
+        if not self.add_shortcut:
+            return out
+        return out + x
+
+
+class SPPLayer(nn.Layer):
+    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX.
+
+    Args:
+        in_channels (int): Number of input channels; the hidden width is
+            ``in_channels // 2``.
+        out_channels (int): Number of output channels.
+        kernel_sizes (tuple): One stride-1 max pool is built per entry,
+            padded with ``ks // 2``. Default (5, 9, 13).
+        bias (bool): Forwarded to both convs, default False.
+        act (str): Forwarded to both convs, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        hidden = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden, ksize=1, stride=1, bias=bias, act=act)
+
+        pools = []
+        for ks in kernel_sizes:
+            pools.append(nn.MaxPool2D(kernel_size=ks, stride=1, padding=ks // 2))
+        self.maxpoolings = nn.LayerList(pools)
+
+        # conv2 sees the un-pooled branch plus one branch per pooling layer
+        n_branches = len(kernel_sizes) + 1
+        self.conv2 = BaseConv(
+            hidden * n_branches,
+            out_channels,
+            ksize=1,
+            stride=1,
+            bias=bias,
+            act=act)
+
+    def forward(self, x):
+        """Reduce with conv1, concat with every pooled copy, fuse with conv2."""
+        reduced = self.conv1(x)
+        branches = [reduced]
+        for pool in self.maxpoolings:
+            branches.append(pool(reduced))
+        return self.conv2(paddle.concat(branches, axis=1))
+
+
+class SPPFLayer(nn.Layer):
+    """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
+        equivalent to SPP(k=(5, 9, 13))
+
+    A single max-pooling layer is applied three times in a row and every
+    intermediate result is kept.
+
+    Args:
+        in_channels (int): Number of input channels; the hidden width is
+            ``in_channels // 2``.
+        out_channels (int): Number of output channels.
+        ksize (int): Kernel size of the shared max pool, default 5.
+        bias (bool): Forwarded to both convs, default False.
+        act (str): Forwarded to both convs, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=5,
+                 bias=False,
+                 act='silu'):
+        super().__init__()
+        hidden = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden, ksize=1, stride=1, bias=bias, act=act)
+        self.maxpooling = nn.MaxPool2D(
+            kernel_size=ksize, stride=1, padding=ksize // 2)
+        # four branches are concatenated: the reduced input and three pools
+        self.conv2 = BaseConv(
+            hidden * 4, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        """Reduce, pool three times in cascade, concat all four, then fuse."""
+        feats = [self.conv1(x)]
+        for _ in range(3):
+            # each pool consumes the previous pool's output
+            feats.append(self.maxpooling(feats[-1]))
+        return self.conv2(paddle.concat(feats, axis=1))
+
+
+class CSPLayer(nn.Layer):
+    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_blocks (int): Number of ``BottleNeck`` blocks on the first
+            branch, default 1.
+        shortcut (bool): Forwarded to every ``BottleNeck``, default True.
+        expansion (float): Hidden width is ``int(out_channels * expansion)``,
+            default 0.5.
+        depthwise (bool): Forwarded to every ``BottleNeck``, default False.
+        bias (bool): Forwarded to all convs, default False.
+        act (str): Forwarded to all convs, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=1,
+                 shortcut=True,
+                 expansion=0.5,
+                 depthwise=False,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        hidden = int(out_channels * expansion)
+        # two parallel 1x1 reductions, one per branch
+        self.conv1 = BaseConv(
+            in_channels, hidden, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden, ksize=1, stride=1, bias=bias, act=act)
+
+        blocks = []
+        for _ in range(num_blocks):
+            blocks.append(
+                BottleNeck(
+                    hidden,
+                    hidden,
+                    shortcut=shortcut,
+                    expansion=1.0,
+                    depthwise=depthwise,
+                    bias=bias,
+                    act=act))
+        self.bottlenecks = nn.Sequential(*blocks)
+
+        # fuses the concatenation of both branches
+        self.conv3 = BaseConv(
+            hidden * 2, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        """Bottleneck branch and plain branch, concatenated and fused."""
+        deep = self.bottlenecks(self.conv1(x))
+        skip = self.conv2(x)
+        merged = paddle.concat([deep, skip], axis=1)
+        return self.conv3(merged)
+
+
+@register
+@serializable
+class CSPDarkNet(nn.Layer):
+    """
+    CSPDarkNet backbone.
+
+    Args:
+        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
+            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
+        depth_mult (float): Depth multiplier, multiply number of blocks in
+            CSPLayer, default as 1.0.
+        width_mult (float): Width multiplier, multiply number of channels in
+            each layer, default as 1.0.
+        depthwise (bool): Whether to use depth-wise conv layer.
+        act (str): Activation function type, default as 'silu'.
+        trt (bool): Accepted and listed in ``__shared__`` but not read by
+            this class.
+        return_idx (list): Index of stages whose feature maps are returned.
+            Index 0 refers to the stem in ``out_shape``; ``forward`` only
+            emits the CSP stages, numbered from 1.
+
+    Raises:
+        KeyError: If ``arch`` is not a key of ``arch_settings``.
+        AttributeError: If ``arch`` is in ``arch_settings`` but has no stem
+            configuration in ``__init__``.
+    """
+
+    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
+
+    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
+    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
+    arch_settings = {
+        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, True, True]],
+    }
+
+    def __init__(self,
+                 arch='X',
+                 depth_mult=1.0,
+                 width_mult=1.0,
+                 depthwise=False,
+                 act='silu',
+                 trt=False,
+                 return_idx=[2, 3, 4]):
+        super(CSPDarkNet, self).__init__()
+        self.arch = arch
+        # Copy so the instance never aliases the shared default list (or a
+        # list the caller keeps mutating).
+        self.return_idx = list(return_idx)
+        Conv = DWConv if depthwise else BaseConv
+        arch_setting = self.arch_settings[arch]
+        base_channels = int(arch_setting[0][0] * width_mult)
+
+        # Note: differences between the latest YOLOv5 and the original YOLOX
+        # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX)
+        # 2. use SPPF(in YOLOv5) or SPP(in YOLOX)
+        # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer
+        # 4. whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX
+        if arch in ['P5', 'P6']:
+            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernel size)
+            self.stem = Conv(
+                3, base_channels, ksize=6, stride=2, bias=False, act=act)
+            spp_kernel_sizes = 5
+        elif arch in ['X']:
+            # in the original YOLOX, use Focus stem, and SPP (three spp kernel sizes)
+            self.stem = Focus(
+                3, base_channels, ksize=3, stride=1, bias=False, act=act)
+            spp_kernel_sizes = (5, 9, 13)
+        else:
+            # Only reachable for a key that was added to arch_settings without
+            # a matching stem branch above; unknown keys fail earlier with
+            # KeyError at the arch_settings lookup.
+            raise AttributeError("Unsupported arch type: {}".format(arch))
+
+        _out_channels = [base_channels]
+        layers_num = 1
+        # Plain list: the sublayers inside are registered via add_sublayer.
+        self.csp_dark_blocks = []
+
+        for i, (in_channels, out_channels, num_blocks, shortcut,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * width_mult)
+            out_channels = int(out_channels * width_mult)
+            _out_channels.append(out_channels)
+            # every stage keeps at least one block
+            num_blocks = max(round(num_blocks * depth_mult), 1)
+            stage = []
+
+            # stride-2 conv that opens the stage
+            conv_layer = self.add_sublayer(
+                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
+                Conv(
+                    in_channels, out_channels, 3, 2, bias=False, act=act))
+            stage.append(conv_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['X']:
+                # in YOLOX use SPPLayer, placed before the CSPLayer
+                spp_layer = self.add_sublayer(
+                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
+                    SPPLayer(
+                        out_channels,
+                        out_channels,
+                        kernel_sizes=spp_kernel_sizes,
+                        bias=False,
+                        act=act))
+                stage.append(spp_layer)
+                layers_num += 1
+
+            csp_layer = self.add_sublayer(
+                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
+                CSPLayer(
+                    out_channels,
+                    out_channels,
+                    num_blocks=num_blocks,
+                    shortcut=shortcut,
+                    depthwise=depthwise,
+                    bias=False,
+                    act=act))
+            stage.append(csp_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['P5', 'P6']:
+                # in latest YOLOv5 use SPPFLayer instead of SPPLayer,
+                # placed after the CSPLayer
+                sppf_layer = self.add_sublayer(
+                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
+                    SPPFLayer(
+                        out_channels,
+                        out_channels,
+                        ksize=spp_kernel_sizes,
+                        bias=False,
+                        act=act))
+                stage.append(sppf_layer)
+                layers_num += 1
+
+            self.csp_dark_blocks.append(nn.Sequential(*stage))
+
+        self._out_channels = [_out_channels[i] for i in self.return_idx]
+        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
+
+    def forward(self, inputs):
+        """Run the stem and all stages on ``inputs['image']``.
+
+        Returns:
+            list: Output of every stage whose 1-based number is contained in
+            ``self.return_idx``, in stage order.
+        """
+        x = inputs['image']
+        outputs = []
+        x = self.stem(x)
+        for i, layer in enumerate(self.csp_dark_blocks):
+            x = layer(x)
+            # stage numbers are offset by one because index 0 is the stem
+            if i + 1 in self.return_idx:
+                outputs.append(x)
+        return outputs
+
+    @property
+    def out_shape(self):
+        """List of ``ShapeSpec(channels, stride)``, one per ``return_idx`` entry."""
+        return [
+            ShapeSpec(
+                channels=c, stride=s)
+            for c, s in zip(self._out_channels, self.strides)
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant
+
+from ppdet.modeling.ops import get_act_fn
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D, BatchNorm2D and an activation, applied in sequence.
+
+    Args:
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels.
+        filter_size (int): Convolution kernel size, default 3.
+        stride (int): Convolution stride, default 1.
+        groups (int): Number of convolution groups, default 1.
+        padding (int): Convolution padding, default 0.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``;
+            anything else is stored and called as-is.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 act=None):
+        super().__init__()
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias_attr=False)
+
+        # Batch-norm scale and shift each get their own attr with L2Decay(0.0).
+        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
+        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
+        self.bn = nn.BatchNorm2D(
+            ch_out, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+        if act is None or isinstance(act, (str, dict)):
+            self.act = get_act_fn(act)
+        else:
+            self.act = act
+
+    def forward(self, x):
+        """Return ``act(bn(conv(x)))``."""
+        return self.act(self.bn(self.conv(x)))
+
+
+class RepVggBlock(nn.Layer):
+    """Parallel 3x3 and 1x1 conv-BN branches that can be fused into one 3x3 conv.
+
+    Until ``convert_to_deploy`` is called the block computes
+    ``act(conv1(x) + alpha * conv2(x))`` (``alpha`` is omitted when disabled).
+    Afterwards a single fused ``conv`` replaces both branches.
+
+    Args:
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``;
+            anything else is stored and called as-is. Default 'relu'.
+        alpha (bool): Create a learnable scalar (initialised to 1.0) that
+            scales the 1x1 branch. Default False.
+    """
+
+    def __init__(self, ch_in, ch_out, act='relu', alpha=False):
+        super(RepVggBlock, self).__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=None)
+        self.conv2 = ConvBNLayer(
+            ch_in, ch_out, 1, stride=1, padding=0, act=None)
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+        if alpha:
+            self.alpha = self.create_parameter(
+                shape=[1],
+                attr=ParamAttr(initializer=Constant(value=1.)),
+                dtype="float32")
+        else:
+            self.alpha = None
+
+    def forward(self, x):
+        """Use the fused conv when present, otherwise sum the two branches."""
+        if hasattr(self, 'conv'):
+            y = self.conv(x)
+        else:
+            # Test for presence, not truthiness: a learned alpha of exactly 0
+            # must still scale (i.e. silence) the 1x1 branch, and checking
+            # identity avoids converting the tensor to a Python bool.
+            if self.alpha is not None:
+                y = self.conv1(x) + self.alpha * self.conv2(x)
+            else:
+                y = self.conv1(x) + self.conv2(x)
+        y = self.act(y)
+        return y
+
+    def convert_to_deploy(self):
+        """Fuse both branches into ``self.conv`` and drop ``conv1``/``conv2``.
+
+        Calling it again on an already converted block is a no-op.
+        """
+        if hasattr(self, 'conv') and not hasattr(self, 'conv1'):
+            # Already fused: the branches needed to recompute the kernel
+            # are gone.
+            return
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.conv.weight.set_value(kernel)
+        self.conv.bias.set_value(bias)
+        self.__delattr__('conv1')
+        self.__delattr__('conv2')
+
+    def get_equivalent_kernel_bias(self):
+        """Return the (kernel, bias) of the single 3x3 conv equal to both branches."""
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        # Same presence test as in forward so both paths agree for alpha == 0.
+        if self.alpha is not None:
+            return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
+                kernel1x1), bias3x3 + self.alpha * bias1x1
+        else:
+            return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+                kernel1x1), bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        """Pad a kernel by one element on each side of its last two axes."""
+        if kernel1x1 is None:
+            return 0
+        else:
+            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        """Fold a branch's batch-norm statistics into its conv weights.
+
+        Returns:
+            tuple: ``(kernel, bias)`` of a plain convolution equivalent to
+            ``branch.bn(branch.conv(x))`` with the current running
+            statistics, or ``(0, 0)`` when ``branch`` is None.
+        """
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.bn._mean
+        running_var = branch.bn._variance
+        gamma = branch.bn.weight
+        beta = branch.bn.bias
+        eps = branch.bn._epsilon
+        std = (running_var + eps).sqrt()
+        # one scale factor per output channel, broadcast over the kernel
+        t = (gamma / std).reshape((-1, 1, 1, 1))
+        return kernel * t, beta - running_mean * gamma / std
+
+
+class BasicBlock(nn.Layer):
+    """3x3 ``ConvBNLayer`` followed by a ``RepVggBlock``, optionally residual.
+
+    Args:
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels; must equal ``ch_in``.
+        act: Activation forwarded to both sublayers, default 'relu'.
+        shortcut (bool): Add the input to the output, default True.
+        use_alpha (bool): Forwarded to ``RepVggBlock`` as ``alpha``,
+            default False.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 act='relu',
+                 shortcut=True,
+                 use_alpha=False):
+        super().__init__()
+        assert ch_in == ch_out
+        self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        """Return ``conv2(conv1(x))``, plus ``x`` when ``shortcut`` is set."""
+        out = self.conv2(self.conv1(x))
+        if not self.shortcut:
+            return out
+        return paddle.add(x, out)
+
+
+class EffectiveSELayer(nn.Layer):
+    """ Effective Squeeze-Excitation
+    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
+
+    Args:
+        channels (int): Channels of the input and of the 1x1 ``fc`` conv.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``;
+            anything else is stored and called as-is. Default 'hardsigmoid'.
+    """
+
+    def __init__(self, channels, act='hardsigmoid'):
+        super().__init__()
+        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
+        if act is None or isinstance(act, (str, dict)):
+            self.act = get_act_fn(act)
+        else:
+            self.act = act
+
+    def forward(self, x):
+        """Scale ``x`` by a gate computed from its mean over axes 2 and 3."""
+        squeezed = x.mean((2, 3), keepdim=True)
+        gate = self.act(self.fc(squeezed))
+        return x * gate
+
+
+class CSPResStage(nn.Layer):
+    """One CSP stage: optional downsample, two 1x1 branches, blocks, fusion.
+
+    Args:
+        block_fn (callable): Factory for the residual blocks; called as
+            ``block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)``.
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels.
+        n (int): Number of blocks on the second branch.
+        stride (int): A stride-2 3x3 downsample conv is inserted only when
+            this equals 2.
+        act: Activation forwarded to the convs and blocks, default 'relu'.
+        attn: Any truthy value enables an ``EffectiveSELayer`` before the
+            final conv; the value itself is not inspected. Default 'eca'.
+        use_alpha (bool): Forwarded to ``block_fn``, default False.
+    """
+
+    def __init__(self,
+                 block_fn,
+                 ch_in,
+                 ch_out,
+                 n,
+                 stride,
+                 act='relu',
+                 attn='eca',
+                 use_alpha=False):
+        super().__init__()
+
+        ch_mid = (ch_in + ch_out) // 2
+        ch_branch = ch_mid // 2
+
+        self.conv_down = ConvBNLayer(
+            ch_in, ch_mid, 3, stride=2, padding=1,
+            act=act) if stride == 2 else None
+        self.conv1 = ConvBNLayer(ch_mid, ch_branch, 1, act=act)
+        self.conv2 = ConvBNLayer(ch_mid, ch_branch, 1, act=act)
+
+        block_list = []
+        for _ in range(n):
+            block_list.append(
+                block_fn(
+                    ch_branch,
+                    ch_branch,
+                    act=act,
+                    shortcut=True,
+                    use_alpha=use_alpha))
+        self.blocks = nn.Sequential(*block_list)
+
+        self.attn = EffectiveSELayer(
+            ch_mid, act='hardsigmoid') if attn else None
+
+        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
+
+    def forward(self, x):
+        """Downsample (if built), split into two branches, merge and fuse."""
+        if self.conv_down is not None:
+            x = self.conv_down(x)
+        plain = self.conv1(x)
+        deep = self.blocks(self.conv2(x))
+        merged = paddle.concat([plain, deep], axis=1)
+        if self.attn is not None:
+            merged = self.attn(merged)
+        return self.conv3(merged)
+
+
+@register
+@serializable
+class CSPResNet(nn.Layer):
+    """CSPResNet backbone: a conv stem followed by stride-2 ``CSPResStage``s.
+
+    Args:
+        layers (list): Blocks per stage before scaling by ``depth_mult``.
+        channels (list): Stem width followed by one width per stage, before
+            scaling by ``width_mult``.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``
+            (with ``trt``); anything else is used as-is. Default 'swish'.
+        return_idx (list): 0-based indices of the stages whose outputs are
+            returned.
+        depth_wise (bool): Accepted but not read by this class.
+        use_large_stem (bool): Build a three-conv stem instead of two convs.
+        width_mult (float): Channel multiplier, default 1.0.
+        depth_mult (float): Block-count multiplier, default 1.0.
+        trt (bool): Forwarded to ``get_act_fn``.
+        use_checkpoint (bool): Recompute stages during training; also makes
+            the constructor call ``paddle.seed(0)``.
+        use_alpha (bool): Forwarded to every ``CSPResStage``.
+        **args: Extra keyword arguments are accepted and ignored.
+    """
+
+    __shared__ = ['width_mult', 'depth_mult', 'trt']
+
+    def __init__(self,
+                 layers=[3, 6, 6, 3],
+                 channels=[64, 128, 256, 512, 1024],
+                 act='swish',
+                 return_idx=[1, 2, 3],
+                 depth_wise=False,
+                 use_large_stem=False,
+                 width_mult=1.0,
+                 depth_mult=1.0,
+                 trt=False,
+                 use_checkpoint=False,
+                 use_alpha=False,
+                 **args):
+        super().__init__()
+        self.use_checkpoint = use_checkpoint
+        # scaled copies; every width / depth is clamped to at least 1
+        channels = [max(round(c * width_mult), 1) for c in channels]
+        layers = [max(round(l * depth_mult), 1) for l in layers]
+        if act is None or isinstance(act, (str, dict)):
+            act = get_act_fn(act, trt=trt)
+
+        stem_out = channels[0]
+        stem_mid = channels[0] // 2
+        stem_layers = [('conv1', ConvBNLayer(
+            3, stem_mid, 3, stride=2, padding=1, act=act))]
+        if use_large_stem:
+            stem_layers.append(('conv2', ConvBNLayer(
+                stem_mid, stem_mid, 3, stride=1, padding=1, act=act)))
+            stem_layers.append(('conv3', ConvBNLayer(
+                stem_mid, stem_out, 3, stride=1, padding=1, act=act)))
+        else:
+            stem_layers.append(('conv2', ConvBNLayer(
+                stem_mid, stem_out, 3, stride=1, padding=1, act=act)))
+        self.stem = nn.Sequential(*stem_layers)
+
+        n = len(channels) - 1
+        named_stages = []
+        for i in range(n):
+            named_stages.append((str(i), CSPResStage(
+                BasicBlock,
+                channels[i],
+                channels[i + 1],
+                layers[i],
+                2,
+                act=act,
+                use_alpha=use_alpha)))
+        self.stages = nn.Sequential(*named_stages)
+
+        self._out_channels = channels[1:]
+        self._out_strides = [4 * 2**i for i in range(n)]
+        self.return_idx = return_idx
+        if use_checkpoint:
+            paddle.seed(0)
+
+    def forward(self, inputs):
+        """Run stem and stages on ``inputs['image']``.
+
+        Returns:
+            list: Output of every stage whose index is in ``return_idx``.
+        """
+        x = self.stem(inputs['image'])
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            recompute_stage = self.use_checkpoint and self.training
+            if recompute_stage:
+                x = paddle.distributed.fleet.utils.recompute(
+                    stage, x, preserve_rng_state=True)
+            else:
+                x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        """List of ``ShapeSpec(channels, stride)``, one per ``return_idx`` entry."""
+        specs = []
+        for i in self.return_idx:
+            specs.append(
+                ShapeSpec(
+                    channels=self._out_channels[i],
+                    stride=self._out_strides[i]))
+        return specs
--- a/rtdetr_paddle/ppdet/modeling/backbones/darknet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/darknet.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.ops import batch_norm, mish
+from ..shape_spec import ShapeSpec
+
+__all__ = ['DarkNet', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):
+    """conv + bn + activation layer used by DarkNet."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 act="leaky",
+                 freeze_norm=False,
+                 data_format='NCHW',
+                 name=''):
+        """
+        Build the bias-free convolution and its normalisation layer.
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 1
+            groups (int): number of groups of conv layer, default 1
+            padding (int): padding size, default 0
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            act (str): activation function type, default 'leaky', which means leaky_relu;
+                any other value is looked up by name in paddle.nn.functional
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+            name (str): accepted but not read by this class
+        """
+        super().__init__()
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            data_format=data_format,
+            bias_attr=False)
+        self.batch_norm = batch_norm(
+            ch_out,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.act = act
+
+    def forward(self, inputs):
+        """Return ``act(batch_norm(conv(inputs)))``."""
+        out = self.batch_norm(self.conv(inputs))
+        if self.act == 'leaky':
+            # 'leaky' is leaky_relu with a negative slope of 0.1
+            return F.leaky_relu(out, 0.1)
+        activation = getattr(F, self.act)
+        return activation(out)
+
+
+class DownSample(nn.Layer):
+    """Thin wrapper around a single ``ConvBNLayer``, stride 2 by default."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=2,
+                 padding=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        downsample layer
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 2
+            padding (int): padding size, default 1
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super().__init__()
+
+        conv_kwargs = dict(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.conv_bn_layer = ConvBNLayer(**conv_kwargs)
+        self.ch_out = ch_out
+
+    def forward(self, inputs):
+        """Apply the wrapped conv-bn-activation layer."""
+        return self.conv_bn_layer(inputs)
+
+
+class BasicBlock(nn.Layer):
+    """DarkNet residual block: 1x1 squeeze conv, 3x3 expand conv, then add."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        BasicBlock layer of DarkNet
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel, must be the same even int as ch_in
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super().__init__()
+
+        assert ch_in == ch_out and (ch_in % 2) == 0, \
+            f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"
+
+        # settings shared by both convs
+        norm_kwargs = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        ch_half = int(ch_out / 2)
+
+        # channel route, e.g. 10 --{conv1}--> 5 --{conv2}--> 10
+        self.conv1 = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=ch_half,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            **norm_kwargs)
+        self.conv2 = ConvBNLayer(
+            ch_in=ch_half,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            **norm_kwargs)
+
+    def forward(self, inputs):
+        """Return ``inputs + conv2(conv1(inputs))``."""
+        residual = self.conv2(self.conv1(inputs))
+        return paddle.add(x=inputs, y=residual)
+
+
+class Blocks(nn.Layer):
+    """A chain of ``count`` ``BasicBlock`` layers."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 count,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None,
+                 data_format='NCHW'):
+        """
+        Blocks layer, which consists of several BasicBlock layers
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            count (int): number of BasicBlock layer
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            name (str): layer name, used as the prefix of the sublayer names
+                of every block after the first one
+            data_format (str): data format, NCHW or NHWC
+        """
+        super().__init__()
+
+        # settings shared by every block
+        block_kwargs = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        self.basicblock0 = BasicBlock(ch_in, ch_out, **block_kwargs)
+        # Plain list: the blocks inside are registered through add_sublayer.
+        self.res_out_list = [
+            self.add_sublayer('{}.{}'.format(name, i),
+                              BasicBlock(ch_out, ch_out, **block_kwargs))
+            for i in range(1, count)
+        ]
+        self.ch_out = ch_out
+
+    def forward(self, inputs):
+        """Feed ``inputs`` through the first block and then every other block."""
+        out = self.basicblock0(inputs)
+        for block in self.res_out_list:
+            out = block(out)
+        return out
+
+
+DarkNet_cfg = {53: [1, 2, 8, 8, 4]}
+
+
+@register
+@serializable
+class DarkNet(nn.Layer):
+    __shared__ = ['norm_type', 'data_format']
+
+    def __init__(self,
+                 depth=53,
+                 freeze_at=-1,
+                 return_idx=[2, 3, 4],
+                 num_stages=5,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        Darknet backbone, see https://pjreddie.com/darknet/yolo/
+
+        Args:
+            depth (int): depth of network, must be a key of DarkNet_cfg
+            freeze_at (int): index of the stage whose output gets
+                stop_gradient set; the default -1 matches no stage
+            return_idx (list): index of stages whose feature maps are returned
+            num_stages (int): number of stages taken from DarkNet_cfg[depth]
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(DarkNet, self).__init__()
+        self.depth = depth
+        self.freeze_at = freeze_at
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        self.stages = DarkNet_cfg[self.depth][0:num_stages]
+
+        # Keyword arguments shared by every layer built below.
+        norm_kwargs = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        # Stem: 3x3 stride-1 conv to 32 channels, then a DownSample to 64.
+        self.conv0 = ConvBNLayer(
+            ch_in=3,
+            ch_out=32,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            **norm_kwargs)
+        self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2, **norm_kwargs)
+
+        self._out_channels = []
+        self.darknet_conv_block_list = []
+        self.downsample_list = []
+        stage_channels = [64, 128, 256, 512, 1024]
+        for idx, num_blocks in enumerate(self.stages):
+            stage_name = 'stage.{}'.format(idx)
+            width = int(stage_channels[idx])
+            stage_layer = self.add_sublayer(
+                stage_name,
+                Blocks(
+                    width,
+                    width,
+                    num_blocks,
+                    name=stage_name,
+                    **norm_kwargs))
+            self.darknet_conv_block_list.append(stage_layer)
+            if idx in return_idx:
+                self._out_channels.append(width)
+
+        # forward() applies downsample i between stage i and stage i + 1.
+        for idx in range(num_stages - 1):
+            transition = self.add_sublayer(
+                'stage.{}.downsample'.format(idx),
+                DownSample(
+                    ch_in=int(stage_channels[idx]),
+                    ch_out=int(stage_channels[idx + 1]),
+                    **norm_kwargs))
+            self.downsample_list.append(transition)
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (dict): holds the input tensor under the key 'image'
+
+        Returns:
+            list: outputs of the stages listed in return_idx, in stage order
+        """
+        feat = self.downsample0(self.conv0(inputs['image']))
+        collected = []
+        for idx, stage_layer in enumerate(self.darknet_conv_block_list):
+            feat = stage_layer(feat)
+            if idx == self.freeze_at:
+                feat.stop_gradient = True
+            if idx in self.return_idx:
+                collected.append(feat)
+            if idx < self.num_stages - 1:
+                feat = self.downsample_list[idx](feat)
+        return collected
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
@@ -0,0 +1,720 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
+"""
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+from .transformer_utils import DropPath, Identity
+from .transformer_utils import add_parameter, to_2tuple
+from .transformer_utils import ones_, zeros_, trunc_normal_
+from .swin_transformer import Mlp
+
+__all__ = ['FocalNet']
+
+MODEL_cfg = {
+    'focalnet_T_224_1k_srf': dict(
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.2,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',
+    ),
+    'focalnet_S_224_1k_srf': dict(
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.3,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',
+    ),
+    'focalnet_B_224_1k_srf': dict(
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',
+    ),
+    'focalnet_T_224_1k_lrf': dict(
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.2,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',
+    ),
+    'focalnet_S_224_1k_lrf': dict(
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.3,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',
+    ),
+    'focalnet_B_224_1k_lrf': dict(
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',
+    ),
+    'focalnet_L_384_22k_fl3': dict(
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[5, 5, 5, 5],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
+    ),
+    'focalnet_L_384_22k_fl4': dict(
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        focal_levels=[4, 4, 4, 4],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=True,  # the only arch in this table that normalizes the modulator
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
+    ),
+    'focalnet_XL_384_22k_fl3': dict(
+        embed_dim=256,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[5, 5, 5, 5],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
+    ),
+    'focalnet_XL_384_22k_fl4': dict(
+        embed_dim=256,
+        depths=[2, 2, 18, 2],
+        focal_levels=[4, 4, 4, 4],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
+    ),
+    'focalnet_H_224_22k_fl3': dict(
+        embed_dim=352,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=True,  # enabled only for the two huge (H) archs
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
+    ),
+    'focalnet_H_224_22k_fl4': dict(
+        embed_dim=352,
+        depths=[2, 2, 18, 2],
+        focal_levels=[4, 4, 4, 4],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=True,  # enabled only for the two huge (H) archs
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
+    ),
+}
+
+
+class FocalModulation(nn.Layer):
+    """ Focal modulation: a query modulated by gated multi-level context.
+    Args:
+        dim (int): Number of input channels.
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+        focal_level (int): Number of focal levels
+        focal_window (int): Focal window size at focal level 1
+        focal_factor (int): Step to increase the focal window. Default: 2
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm
+        normalize_modulator (bool): Whether use normalize in modulator
+    """
+
+    def __init__(self,
+                 dim,
+                 proj_drop=0.,
+                 focal_level=2,
+                 focal_window=7,
+                 focal_factor=2,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False):
+        super().__init__()
+        self.dim = dim
+        self.focal_level = focal_level
+        self.focal_window = focal_window
+        self.focal_factor = focal_factor
+        self.use_postln_in_modulation = use_postln_in_modulation
+        self.normalize_modulator = normalize_modulator
+
+        # A single projection yields the query (dim), the context (dim) and
+        # focal_level + 1 gates: one per focal level plus one global gate.
+        gate_count = self.focal_level + 1
+        self.f = nn.Linear(dim, 2 * dim + gate_count, bias_attr=True)
+        self.h = nn.Conv2D(
+            dim, dim, kernel_size=1, stride=1, padding=0, groups=1,
+            bias_attr=True)
+
+        self.act = nn.GELU()
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.focal_layers = nn.LayerList()
+
+        if self.use_postln_in_modulation:
+            self.ln = nn.LayerNorm(dim)
+
+        # Depth-wise convs; the kernel grows by focal_factor at every level.
+        for level in range(self.focal_level):
+            window = self.focal_factor * level + self.focal_window
+            depthwise = nn.Conv2D(
+                dim,
+                dim,
+                kernel_size=window,
+                stride=1,
+                padding=window // 2,
+                groups=dim,
+                bias_attr=False)
+            self.focal_layers.append(
+                nn.Sequential(depthwise, nn.GELU()))
+
+    def forward(self, x):
+        """ Forward function.
+        Args:
+            x: input features with shape of (B, H, W, C)
+        Returns:
+            Tensor in the same channels-last layout as x.
+        """
+        _, _, _, channels = x.shape
+        # Project, then move channels to axis 1 for the convolutions.
+        projected = self.f(x).transpose([0, 3, 1, 2])
+        gate_count = self.focal_level + 1
+        query, ctx, gates = paddle.split(
+            projected, (channels, channels, gate_count), 1)
+
+        # Accumulate the gated context of every focal level.
+        modulator = 0
+        for level in range(self.focal_level):
+            ctx = self.focal_layers[level](ctx)
+            modulator = modulator + ctx * gates[:, level:level + 1]
+        # Global context: spatial mean of the last level's output.
+        pooled = ctx.mean(2, keepdim=True).mean(3, keepdim=True)
+        global_gate = gates[:, self.focal_level:]
+        modulator = modulator + self.act(pooled) * global_gate
+        if self.normalize_modulator:
+            modulator = modulator / (self.focal_level + 1)
+
+        # Modulate the query and go back to channels-last for the linear layers.
+        out = (query * self.h(modulator)).transpose([0, 2, 3, 1])
+        if self.use_postln_in_modulation:
+            out = self.ln(out)
+        return self.proj_drop(self.proj(out))
+
+
+class FocalModulationBlock(nn.Layer):
+    """ Focal Modulation Block.
+    Args:
+        dim (int): Number of input channels.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+        focal_level (int): number of focal levels
+        focal_window (int): focal kernel size at level 1
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value for layer scale. Default: 1e-4
+    """
+
+    def __init__(self,
+                 dim,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 focal_level=2,
+                 focal_window=9,
+                 use_postln=False,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False,
+                 use_layerscale=False,
+                 layerscale_value=1e-4):
+        super().__init__()
+        self.dim = dim
+        self.mlp_ratio = mlp_ratio
+        self.focal_window = focal_window
+        self.focal_level = focal_level
+        self.use_postln = use_postln
+        self.use_layerscale = use_layerscale
+
+        self.norm1 = norm_layer(dim)
+        self.modulation = FocalModulation(
+            dim,
+            proj_drop=drop,
+            focal_level=self.focal_level,
+            focal_window=self.focal_window,
+            use_postln_in_modulation=use_postln_in_modulation,
+            normalize_modulator=normalize_modulator)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop)
+        # Size of the token grid; BasicLayer.forward sets both before each call.
+        self.H = None
+        self.W = None
+
+        # Residual branch scales: 1.0 unless layerscale builds them as parameters.
+        if self.use_layerscale:
+            self.gamma_1 = add_parameter(self,
+                                         layerscale_value * paddle.ones([dim]))
+            self.gamma_2 = add_parameter(self,
+                                         layerscale_value * paddle.ones([dim]))
+        else:
+            self.gamma_1 = 1.0
+            self.gamma_2 = 1.0
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+        """
+        _, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        residual = x
+        # Pre-LN applies norm1 before the modulation, post-LN after it.
+        if self.use_postln:
+            x = self.modulation(x.reshape([-1, H, W, C]))
+            x = self.norm1(x.reshape([-1, H * W, C]))
+        else:
+            x = self.modulation(self.norm1(x).reshape([-1, H, W, C]))
+            x = x.reshape([-1, H * W, C])
+        x = residual + self.drop_path(self.gamma_1 * x)
+
+        # FFN
+        if self.use_postln:
+            ffn_out = self.norm2(self.mlp(x))
+        else:
+            ffn_out = self.mlp(self.norm2(x))
+        return x + self.drop_path(self.gamma_2 * ffn_out)
+
+
+class BasicLayer(nn.Layer):
+    """ A basic focal modulation layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float | np.ndarray, optional): Stochastic depth rate; an ndarray is indexed per block. Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+        focal_level (int): Number of focal levels
+        focal_window (int): Focal window size at focal level 1
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value of layerscale
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_checkpoint (bool): Stored on the layer; not read anywhere in this class. Default: False.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 focal_level=2,
+                 focal_window=9,
+                 use_conv_embed=False,
+                 use_layerscale=False,
+                 layerscale_value=1e-4,
+                 use_postln=False,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False,
+                 use_checkpoint=False):
+        super().__init__()
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # build blocks; an ndarray supplies one drop-path rate per block
+        per_block = isinstance(drop_path, np.ndarray)
+        rates = [drop_path[i] if per_block else drop_path for i in range(depth)]
+        self.blocks = nn.LayerList([
+            FocalModulationBlock(
+                dim=dim,
+                mlp_ratio=mlp_ratio,
+                drop=drop,
+                drop_path=rate,
+                act_layer=nn.GELU,
+                norm_layer=norm_layer,
+                focal_level=focal_level,
+                focal_window=focal_window,
+                use_postln=use_postln,
+                use_postln_in_modulation=use_postln_in_modulation,
+                normalize_modulator=normalize_modulator,
+                use_layerscale=use_layerscale,
+                layerscale_value=layerscale_value) for rate in rates
+        ])
+
+        # patch merging layer: patch_size 2 and twice as many channels
+        self.downsample = None if downsample is None else downsample(
+            patch_size=2,
+            in_chans=dim,
+            embed_dim=2 * dim,
+            use_conv_embed=use_conv_embed,
+            norm_layer=norm_layer,
+            is_stem=False)
+
+    def forward(self, x, H, W):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns:
+            (x, H, W, x_down, Wh, Ww): this stage's output and size, then the
+            next stage's input and size (the same pair without downsample).
+        """
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            x = blk(x)
+
+        if self.downsample is None:
+            return x, H, W, x, H, W
+        # Tokens -> (B, C, H, W) map for the downsample layer, then back.
+        feat_map = x.transpose([0, 2, 1]).reshape(
+            [x.shape[0], x.shape[-1], H, W])
+        x_down = self.downsample(feat_map).flatten(2).transpose([0, 2, 1])
+        return x, H, W, x_down, (H + 1) // 2, (W + 1) // 2
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False
+        is_stem (bool): Is the stem block or not.
+    """
+
+    def __init__(self,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 norm_layer=None,
+                 use_conv_embed=False,
+                 is_stem=False):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        if use_conv_embed:
+            # Overlapped conv embedding: 7x7 / stride 4 for the stem, 3x3 / stride 2 otherwise.
+            if is_stem:
+                kernel_size = 7
+                padding = 2
+                stride = 4
+            else:
+                kernel_size = 3
+                padding = 1
+                stride = 2
+            self.proj = nn.Conv2D(
+                in_chans,
+                embed_dim,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding)
+        else:
+            self.proj = nn.Conv2D(
+                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """
+        Args:
+            x: input of shape (B, C, H, W); it is padded on the right and bottom
+                until W and H are multiples of the patch size.
+        """
+        _, _, H, W = x.shape
+        # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
+
+        x = self.proj(x)
+        if self.norm is not None:
+            _, _, Wh, Ww = x.shape
+            x = x.flatten(2).transpose([0, 2, 1])
+            x = self.norm(x)
+            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
+        return x
+
+
+@register
+@serializable
+class FocalNet(nn.Layer):
+    """ FocalNet backbone
+    Note: MODEL_cfg[arch] always overrides embed_dim, depths, drop_path_rate, focal_levels, focal_windows,
+    use_conv_embed, use_layerscale, use_postln, use_postln_in_modulation and normalize_modulator.
+    Args:
+        arch (str): Architecture of FocalNet, a key of MODEL_cfg
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each FocalNet Transformer stage.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop_rate (float): Dropout rate.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        focal_levels (Sequence[int]): Number of focal levels at four stages
+        focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value of layerscale
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_checkpoint (bool): Forwarded to every BasicLayer. Default: False.
+        pretrained (str): http(s) URL or local path of weights to load; None uses MODEL_cfg[arch].
+    """
+
+    def __init__(
+            self,
+            arch='focalnet_T_224_1k_srf',
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=-1,
+            patch_size=4,
+            in_chans=3,
+            embed_dim=96,
+            depths=[2, 2, 6, 2],
+            mlp_ratio=4.,
+            drop_rate=0.,
+            drop_path_rate=0.2,  # 0.5 better for large+ models
+            norm_layer=nn.LayerNorm,
+            patch_norm=True,
+            focal_levels=[2, 2, 2, 2],
+            focal_windows=[3, 3, 3, 3],
+            use_conv_embed=False,
+            use_layerscale=False,
+            layerscale_value=1e-4,
+            use_postln=False,
+            use_postln_in_modulation=False,
+            normalize_modulator=False,
+            use_checkpoint=False,
+            pretrained=None):
+        super(FocalNet, self).__init__()
+        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
+
+        cfg = MODEL_cfg[arch]
+        embed_dim = cfg['embed_dim']
+        depths = cfg['depths']
+        drop_path_rate = cfg['drop_path_rate']
+        focal_levels = cfg['focal_levels']
+        focal_windows = cfg['focal_windows']
+        use_conv_embed = cfg['use_conv_embed']
+        use_layerscale = cfg['use_layerscale']
+        use_postln = cfg['use_postln']
+        use_postln_in_modulation = cfg['use_postln_in_modulation']
+        normalize_modulator = cfg['normalize_modulator']
+        if pretrained is None:
+            pretrained = cfg['pretrained']
+
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.num_layers = len(depths)
+        self.patch_norm = patch_norm
+
+        # patch embedding stem
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None,
+            use_conv_embed=use_conv_embed,
+            is_stem=True)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth decay rule: one rate per block, rising linearly
+        dpr = np.linspace(0, drop_path_rate, sum(depths))
+
+        # build layers; stage i works on embed_dim * 2**i channels
+        self.num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.layers = nn.LayerList()
+        block_start = 0
+        for i_layer in range(self.num_layers):
+            block_end = block_start + depths[i_layer]
+            self.layers.append(
+                BasicLayer(
+                    dim=self.num_features[i_layer],
+                    depth=depths[i_layer],
+                    mlp_ratio=mlp_ratio,
+                    drop=drop_rate,
+                    drop_path=dpr[block_start:block_end],
+                    norm_layer=norm_layer,
+                    downsample=None if i_layer == self.num_layers - 1 else PatchEmbed,
+                    focal_level=focal_levels[i_layer],
+                    focal_window=focal_windows[i_layer],
+                    use_conv_embed=use_conv_embed,
+                    use_layerscale=use_layerscale,
+                    layerscale_value=layerscale_value,
+                    use_postln=use_postln,
+                    use_postln_in_modulation=use_postln_in_modulation,
+                    normalize_modulator=normalize_modulator,
+                    use_checkpoint=use_checkpoint))
+            block_start = block_end
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            self.add_sublayer(f'norm{i_layer}', norm_layer(self.num_features[i_layer]))
+
+        self.apply(self._init_weights)
+        self._freeze_stages()
+        if pretrained:
+            # Download only real http(s) URLs; anything else is a local path.
+            if pretrained.startswith(('http://', 'https://')):
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _freeze_stages(self):
+        """Put the frozen sub-layers in eval mode and stop their gradients."""
+        frozen = []
+        if self.frozen_stages >= 0:
+            frozen.append(self.patch_embed)
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            # frozen_stages = k freezes the first k - 1 entries of self.layers.
+            frozen.extend(self.layers[i] for i in range(self.frozen_stages - 1))
+        for layer in frozen:
+            layer.eval()
+            for param in layer.parameters():
+                param.stop_gradient = True
+
+    def _init_weights(self, m):
+        # Linear: trunc_normal_ weight, zeros_ bias; LayerNorm: ones_ / zeros_.
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        """Return the normalized (B, C, H, W) maps of the stages in out_indices."""
+        x = self.patch_embed(x['image'])
+        _, _, Wh, Ww = x.shape
+        x = self.pos_drop(x.flatten(2).transpose([0, 2, 1]))
+        outs = []
+        for i, layer in enumerate(self.layers):
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            if i in self.out_indices:
+                x_out = getattr(self, f'norm{i}')(x_out)
+                outs.append(
+                    x_out.reshape([-1, H, W, self.num_features[i]]).transpose(
+                        (0, 3, 1, 2)))
+        return outs
+
+    @property
+    def out_shape(self):
+        # Strides assume patch_size 4 and a 2x downsample between stages.
+        out_strides = [4, 8, 16, 32]
+        return [
+            ShapeSpec(channels=self.num_features[i], stride=out_strides[i])
+            for i in self.out_indices
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
@@ -0,0 +1,447 @@
+# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import KaimingNormal, Constant
+from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
+from paddle.regularizer import L2Decay
+from paddle import ParamAttr
+
+import copy
+
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+# Only the backbone class is exported from this module.
+__all__ = ['PPHGNetV2']
+
+# Shared initializer instances; they are applied to layer parameters in
+# PPHGNetV2._init_weights.
+kaiming_normal_ = KaimingNormal()
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+
+
+class LearnableAffineBlock(nn.Layer):
+    """Learnable scalar affine transform: ``scale * x + bias``.
+
+    Args:
+        scale_value: initial value of the one-element ``scale`` parameter.
+        bias_value: initial value of the one-element ``bias`` parameter.
+        lr_mult: learning-rate multiplier of the surrounding layer.
+        lab_lr: extra multiplier; both parameters use ``lr_mult * lab_lr``.
+    """
+
+    def __init__(self,
+                 scale_value=1.0,
+                 bias_value=0.0,
+                 lr_mult=1.0,
+                 lab_lr=0.01):
+        super().__init__()
+        effective_lr = lr_mult * lab_lr
+
+        scale = self.create_parameter(
+            shape=[1],
+            default_initializer=Constant(value=scale_value),
+            attr=ParamAttr(learning_rate=effective_lr))
+        self.scale = scale
+        self.add_parameter("scale", scale)
+
+        bias = self.create_parameter(
+            shape=[1],
+            default_initializer=Constant(value=bias_value),
+            attr=ParamAttr(learning_rate=effective_lr))
+        self.bias = bias
+        self.add_parameter("bias", bias)
+
+    def forward(self, x):
+        scaled = self.scale * x
+        return scaled + self.bias
+
+
+class ConvBNAct(nn.Layer):
+    """Conv2D (no bias) -> BatchNorm2D -> optional ReLU -> optional LearnableAffineBlock.
+
+    A string ``padding`` is handed to the convolution unchanged; any other
+    value is ignored and replaced by ``(kernel_size - 1) // 2``. The affine
+    block is created and applied only when both ``use_act`` and ``use_lab``
+    are true. ``lr_mult`` scales the learning rate of the conv weight and of
+    both BatchNorm parameters, which are also excluded from L2 decay.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 groups=1,
+                 use_act=True,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.use_act = use_act
+        self.use_lab = use_lab
+
+        if isinstance(padding, str):
+            conv_padding = padding
+        else:
+            conv_padding = (kernel_size - 1) // 2
+        self.conv = Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=conv_padding,
+            groups=groups,
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=False)
+
+        bn_weight_attr = ParamAttr(
+            regularizer=L2Decay(0.0), learning_rate=lr_mult)
+        bn_bias_attr = ParamAttr(
+            regularizer=L2Decay(0.0), learning_rate=lr_mult)
+        self.bn = BatchNorm2D(
+            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+        if not self.use_act:
+            return
+        self.act = ReLU()
+        if self.use_lab:
+            self.lab = LearnableAffineBlock(lr_mult=lr_mult)
+
+    def forward(self, x):
+        out = self.bn(self.conv(x))
+        if not self.use_act:
+            return out
+        out = self.act(out)
+        if self.use_lab:
+            out = self.lab(out)
+        return out
+
+
+class LightConvBNAct(nn.Layer):
+    """1x1 ConvBNAct without activation followed by a depthwise ConvBNAct.
+
+    The second conv uses ``groups=out_channels`` and the requested
+    ``kernel_size``. ``stride`` and ``groups`` are accepted for signature
+    parity with ``ConvBNAct`` but are not used by this block.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        shared = dict(
+            out_channels=out_channels, use_lab=use_lab, lr_mult=lr_mult)
+        # pointwise projection, no activation
+        self.conv1 = ConvBNAct(
+            in_channels=in_channels, kernel_size=1, use_act=False, **shared)
+        # depthwise convolution with activation
+        self.conv2 = ConvBNAct(
+            in_channels=out_channels,
+            kernel_size=kernel_size,
+            groups=out_channels,
+            use_act=True,
+            **shared)
+
+    def forward(self, x):
+        return self.conv2(self.conv1(x))
+
+
+class StemBlock(nn.Layer):
+    """Stem of PPHGNetV2: a strided conv, two parallel branches, then two more convs.
+
+    After ``stem1`` (3x3, stride 2) the features go through a conv branch
+    (``stem2a`` -> ``stem2b``, both 2x2 with "SAME" padding) and a max-pool
+    branch. The two results are concatenated on axis 1 and reduced by
+    ``stem3`` (3x3, stride 2) and ``stem4`` (1x1).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
+        half_channels = mid_channels // 2
+
+        self.stem1 = ConvBNAct(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=3,
+            stride=2,
+            **shared)
+        self.stem2a = ConvBNAct(
+            in_channels=mid_channels,
+            out_channels=half_channels,
+            kernel_size=2,
+            stride=1,
+            padding="SAME",
+            **shared)
+        self.stem2b = ConvBNAct(
+            in_channels=half_channels,
+            out_channels=mid_channels,
+            kernel_size=2,
+            stride=1,
+            padding="SAME",
+            **shared)
+        # stem3 consumes the concatenation of the pool and conv branches
+        self.stem3 = ConvBNAct(
+            in_channels=mid_channels * 2,
+            out_channels=mid_channels,
+            kernel_size=3,
+            stride=2,
+            **shared)
+        self.stem4 = ConvBNAct(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            **shared)
+        self.pool = nn.MaxPool2D(
+            kernel_size=2, stride=1, ceil_mode=True, padding="SAME")
+
+    def forward(self, x):
+        stem = self.stem1(x)
+        conv_branch = self.stem2b(self.stem2a(stem))
+        pool_branch = self.pool(stem)
+        merged = paddle.concat([pool_branch, conv_branch], 1)
+        return self.stem4(self.stem3(merged))
+
+
+class HG_Block(nn.Layer):
+    """Stack of conv layers whose outputs are concatenated and fused by two 1x1 convs.
+
+    Args:
+        in_channels: channels of the block input.
+        mid_channels: output channels of every stacked layer.
+        out_channels: channels produced by the aggregation convs.
+        kernel_size: kernel size passed to every stacked layer.
+        layer_num: number of stacked layers.
+        identity: when True, the block input is added to the block output.
+        light_block: stack ``LightConvBNAct`` (True) or ``ConvBNAct`` (False).
+        use_lab: forwarded to every conv unit.
+        lr_mult: forwarded to every conv unit.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 kernel_size=3,
+                 layer_num=6,
+                 identity=False,
+                 light_block=True,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.identity = identity
+
+        # Pick the layer class directly instead of eval()-ing its name.
+        block_cls = LightConvBNAct if light_block else ConvBNAct
+        self.layers = nn.LayerList()
+        for i in range(layer_num):
+            self.layers.append(
+                block_cls(
+                    in_channels=in_channels if i == 0 else mid_channels,
+                    out_channels=mid_channels,
+                    stride=1,
+                    kernel_size=kernel_size,
+                    use_lab=use_lab,
+                    lr_mult=lr_mult))
+
+        # feature aggregation: the input plus every layer output is concatenated
+        total_channels = in_channels + layer_num * mid_channels
+        self.aggregation_squeeze_conv = ConvBNAct(
+            in_channels=total_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            use_lab=use_lab,
+            lr_mult=lr_mult)
+        self.aggregation_excitation_conv = ConvBNAct(
+            in_channels=out_channels // 2,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            use_lab=use_lab,
+            lr_mult=lr_mult)
+
+    def forward(self, x):
+        identity = x
+        # keep the input and every intermediate output for the concatenation
+        output = [x]
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+        x = paddle.concat(output, axis=1)
+        x = self.aggregation_squeeze_conv(x)
+        x = self.aggregation_excitation_conv(x)
+        if self.identity:
+            x += identity
+        return x
+
+
+class HG_Stage(nn.Layer):
+    """Optional stride-2 depthwise downsample followed by ``block_num`` HG_Blocks.
+
+    The first block maps ``in_channels`` to ``out_channels`` without a
+    residual connection; every later block keeps ``out_channels`` and adds
+    its input to its output.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 block_num,
+                 layer_num=6,
+                 downsample=True,
+                 light_block=True,
+                 kernel_size=3,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        # Holds the flag itself when no downsampling is requested, otherwise
+        # the downsample layer; forward() relies on its truthiness.
+        self.downsample = downsample
+        if downsample:
+            self.downsample = ConvBNAct(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                kernel_size=3,
+                stride=2,
+                groups=in_channels,
+                use_act=False,
+                use_lab=use_lab,
+                lr_mult=lr_mult)
+
+        stage_blocks = [
+            HG_Block(
+                in_channels=out_channels if idx else in_channels,
+                mid_channels=mid_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                layer_num=layer_num,
+                identity=(idx != 0),
+                light_block=light_block,
+                use_lab=use_lab,
+                lr_mult=lr_mult) for idx in range(block_num)
+        ]
+        self.blocks = nn.Sequential(*stage_blocks)
+
+    def forward(self, x):
+        if self.downsample:
+            x = self.downsample(x)
+        return self.blocks(x)
+
+
+def _freeze_norm(m: nn.BatchNorm2D):
+    """Build a frozen BatchNorm2D with the same number of features as ``m``.
+
+    The returned layer is a new instance: only ``m._num_features`` is read
+    from ``m``. Its weight and bias are non-trainable with zero learning
+    rate and zero L2 decay, it uses global statistics, and every parameter
+    has ``stop_gradient`` set.
+    """
+
+    def _frozen_attr():
+        return ParamAttr(
+            learning_rate=0., regularizer=L2Decay(0.), trainable=False)
+
+    frozen_bn = nn.BatchNorm2D(
+        m._num_features,
+        weight_attr=_frozen_attr(),
+        bias_attr=_frozen_attr(),
+        use_global_stats=True)
+    for weight in frozen_bn.parameters():
+        weight.stop_gradient = True
+    return frozen_bn
+
+
+def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
+    """Recursively replace every BatchNorm2D under ``model`` with ``reset_func(layer)``.
+
+    Returns the replacement when ``model`` itself is a BatchNorm2D, otherwise
+    ``model`` with its changed children re-assigned in place.
+    """
+    if isinstance(model, nn.BatchNorm2D):
+        return reset_func(model)
+
+    for child_name, child in model.named_children():
+        replacement = reset_bn(child, reset_func)
+        # only re-assign children that were actually swapped
+        if replacement is not child:
+            setattr(model, child_name, replacement)
+    return model
+
+
+@register
+@serializable
+class PPHGNetV2(nn.Layer):
+    """
+    PPHGNetV2 backbone: a StemBlock followed by the HG_Stages of the chosen architecture.
+
+    Args:
+        arch: str. Key into ``arch_configs`` ('L' or 'X') selecting the stem
+            channels and the per-stage configuration.
+        use_lab: boolean. Whether to use LearnableAffineBlock in network.
+        lr_mult_list: list. Learning-rate multipliers; index 0 is used for the
+            stem and index ``i + 1`` for stage ``i``.
+        return_idx: list. Indices of the stages whose outputs are returned by
+            ``forward`` and described by ``out_shape``.
+        freeze_stem_only: boolean. When True only the stem parameters are
+            frozen; otherwise stages ``0 .. freeze_at`` are frozen as well.
+        freeze_at: int. Freezing is applied only when this is ``>= 0``.
+        freeze_norm: boolean. When True every BatchNorm2D sublayer is replaced
+            by a frozen one through ``reset_bn``.
+    """
+
+    # Per-architecture stem channels and stage layout.
+    arch_configs = {
+        'L': {
+            'stem_channels': [3, 32, 48],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [48, 48, 128, 1, False, False, 3, 6],
+                "stage2": [128, 96, 512, 1, True, False, 3, 6],
+                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
+                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
+            }
+        },
+        'X': {
+            'stem_channels': [3, 32, 64],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [64, 64, 128, 1, False, False, 3, 6],
+                "stage2": [128, 128, 512, 2, True, False, 3, 6],
+                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
+                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
+            }
+        }
+    }
+
+    # NOTE(review): the list defaults below are shared between calls; this
+    # class only reads them (indexing, membership tests, iteration).
+    def __init__(self,
+                 arch,
+                 use_lab=False,
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 return_idx=[1, 2, 3],
+                 freeze_stem_only=True,
+                 freeze_at=0,
+                 freeze_norm=True):
+        super().__init__()
+        self.use_lab = use_lab
+        self.return_idx = return_idx
+
+        # An unknown arch raises KeyError here.
+        stem_channels = self.arch_configs[arch]['stem_channels']
+        stage_config = self.arch_configs[arch]['stage_config']
+
+        # One stride / channel entry per stage, indexed by return_idx in out_shape.
+        self._out_strides = [4, 8, 16, 32]
+        self._out_channels = [stage_config[k][2] for k in stage_config]
+
+        # stem
+        self.stem = StemBlock(
+            in_channels=stem_channels[0],
+            mid_channels=stem_channels[1],
+            out_channels=stem_channels[2],
+            use_lab=use_lab,
+            lr_mult=lr_mult_list[0])
+
+        # stages
+        self.stages = nn.LayerList()
+        for i, k in enumerate(stage_config):
+            in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[
+                k]
+            # HG_Stage takes layer_num before downsample, so the config order
+            # is rearranged in this positional call.
+            self.stages.append(
+                HG_Stage(
+                    in_channels,
+                    mid_channels,
+                    out_channels,
+                    block_num,
+                    layer_num,
+                    downsample,
+                    light_block,
+                    kernel_size,
+                    use_lab,
+                    lr_mult=lr_mult_list[i + 1]))
+
+        # The stem is frozen whenever freeze_at >= 0; stages 0..freeze_at are
+        # frozen too unless freeze_stem_only is set.
+        if freeze_at >= 0:
+            self._freeze_parameters(self.stem)
+            if not freeze_stem_only:
+                for i in range(min(freeze_at + 1, len(self.stages))):
+                    self._freeze_parameters(self.stages[i])
+
+        # Swap every BatchNorm2D sublayer for a frozen replacement.
+        if freeze_norm:
+            reset_bn(self, reset_func=_freeze_norm)
+
+        self._init_weights()
+
+    def _freeze_parameters(self, m):
+        # Stop gradients for every parameter of layer m.
+        for p in m.parameters():
+            p.stop_gradient = True
+
+    def _init_weights(self):
+        # Kaiming-normal conv weights, unit/zero BatchNorm weight/bias,
+        # zero Linear bias.
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                kaiming_normal_(m.weight)
+            elif isinstance(m, (nn.BatchNorm2D)):
+                ones_(m.weight)
+                zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                zeros_(m.bias)
+
+    @property
+    def out_shape(self):
+        # One ShapeSpec (channels, stride) per returned stage.
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        # inputs is indexed with the key 'image'; the outputs of the stages
+        # listed in return_idx are returned in stage order.
+        x = inputs['image']
+        x = self.stem(x)
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
--- a/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
@@ -0,0 +1,271 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, Conv2D
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['LCNet']
+
+# Layer table of LCNet, grouped by stage name. Every row is
+# [kernel size, input channels, output channels, stride, use SE]; LCNet scales
+# the channel numbers by `scale` through make_divisible before building layers.
+NET_CONFIG = {
+    "blocks2":
+    #k, in_c, out_c, s, use_se
+    [[3, 16, 32, 1, False], ],
+    "blocks3": [
+        [3, 32, 64, 2, False],
+        [3, 64, 64, 1, False],
+    ],
+    "blocks4": [
+        [3, 64, 128, 2, False],
+        [3, 128, 128, 1, False],
+    ],
+    "blocks5": [
+        [3, 128, 256, 2, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+    ],
+    "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]
+}
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round ``v`` to the nearest multiple of ``divisor``, with two lower bounds.
+
+    The result is never below ``min_value`` (which defaults to ``divisor``),
+    and one extra ``divisor`` is added when rounding would otherwise fall
+    below 90% of ``v``.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, rounded)
+    if result < 0.9 * v:
+        return result + divisor
+    return result
+
+
+class ConvBNLayer(nn.Layer):
+    """Conv2D (no bias, Kaiming-normal init) -> BatchNorm2D -> activation.
+
+    Args:
+        num_channels: input channels of the convolution.
+        filter_size: kernel size; padding is ``(filter_size - 1) // 2``.
+        num_filters: output channels of the convolution.
+        stride: convolution stride.
+        num_groups: convolution groups.
+        act: activation name, either 'hard_swish' or 'relu6'.
+
+    Raises:
+        ValueError: if ``act`` is not one of the supported names.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 num_groups=1,
+                 act='hard_swish'):
+        super().__init__()
+
+        # Fail at construction: otherwise self.act is never set and the
+        # first forward() call dies with an AttributeError.
+        if act not in ('hard_swish', 'relu6'):
+            raise ValueError(
+                "Unsupported act '{}', expected 'hard_swish' or 'relu6'".
+                format(act))
+
+        self.conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=num_groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+            bias_attr=False)
+
+        # BatchNorm parameters are excluded from L2 decay.
+        self.bn = nn.BatchNorm2D(
+            num_filters,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        if act == 'hard_swish':
+            self.act = nn.Hardswish()
+        else:
+            self.act = nn.ReLU6()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.act(x)
+        return x
+
+
+class DepthwiseSeparable(nn.Layer):
+    """Depthwise ConvBNLayer, optional SEModule, then 1x1 pointwise ConvBNLayer.
+
+    The depthwise conv keeps ``num_channels`` channels (one group per
+    channel) and carries the stride; the pointwise conv maps to
+    ``num_filters`` channels.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 dw_size=3,
+                 use_se=False,
+                 act='hard_swish'):
+        super().__init__()
+        self.use_se = use_se
+
+        depthwise = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_channels,
+            filter_size=dw_size,
+            stride=stride,
+            num_groups=num_channels,
+            act=act)
+        self.dw_conv = depthwise
+
+        if use_se:
+            self.se = SEModule(num_channels)
+
+        pointwise = ConvBNLayer(
+            num_channels=num_channels,
+            filter_size=1,
+            num_filters=num_filters,
+            stride=1,
+            act=act)
+        self.pw_conv = pointwise
+
+    def forward(self, x):
+        out = self.dw_conv(x)
+        if self.use_se:
+            out = self.se(out)
+        return self.pw_conv(out)
+
+
+class SEModule(nn.Layer):
+    """Squeeze-and-excitation gate: the input is multiplied by a learned per-channel weight.
+
+    The gate is computed as adaptive average pool to 1x1 -> 1x1 conv to
+    ``channel // reduction`` -> ReLU -> 1x1 conv back to ``channel`` ->
+    Hardsigmoid.
+    """
+
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        squeezed = channel // reduction
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel,
+            out_channels=squeezed,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
+            in_channels=squeezed,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.hardsigmoid = nn.Hardsigmoid()
+
+    def forward(self, x):
+        gate = self.avg_pool(x)
+        gate = self.relu(self.conv1(gate))
+        gate = self.hardsigmoid(self.conv2(gate))
+        return paddle.multiply(x=x, y=gate)
+
+
+@register
+@serializable
+class LCNet(nn.Layer):
+    """LCNet backbone built from the NET_CONFIG layer table.
+
+    Args:
+        scale: channel multiplier; every channel count of NET_CONFIG is
+            scaled and rounded with ``make_divisible``.
+        feature_maps: which stage outputs to return. ``blocks3`` .. ``blocks6``
+            correspond to the values 2 .. 5.
+        act: activation name forwarded to every ConvBNLayer.
+    """
+
+    # Stages whose outputs can be returned, in forward order.
+    _OUTPUT_STAGES = ("blocks3", "blocks4", "blocks5", "blocks6")
+
+    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
+        super().__init__()
+        self.scale = scale
+        self.feature_maps = feature_maps
+
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            num_filters=make_divisible(16 * scale),
+            stride=2,
+            act=act)
+
+        # Sublayers are created in the order blocks2 .. blocks6.
+        self.blocks2 = self._make_blocks("blocks2", scale, act)
+        self.blocks3 = self._make_blocks("blocks3", scale, act)
+        self.blocks4 = self._make_blocks("blocks4", scale, act)
+        self.blocks5 = self._make_blocks("blocks5", scale, act)
+        self.blocks6 = self._make_blocks("blocks6", scale, act)
+
+        # Output channels of each returnable stage: out_c of its last row.
+        stage_channels = [
+            make_divisible(NET_CONFIG[stage][-1][2] * scale)
+            for stage in self._OUTPUT_STAGES
+        ]
+        self._out_channels = [
+            ch for idx, ch in enumerate(stage_channels)
+            if idx + 2 in feature_maps
+        ]
+
+    @staticmethod
+    def _make_blocks(stage, scale, act):
+        """Build the DepthwiseSeparable layers of one NET_CONFIG stage as a Sequential."""
+        return nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act) for k, in_c, out_c, s, se in NET_CONFIG[stage]
+        ])
+
+    def forward(self, inputs):
+        """Return the outputs of the stages selected by ``feature_maps``.
+
+        ``inputs`` is indexed with the key 'image'.
+        """
+        x = inputs['image']
+        x = self.conv1(x)
+        x = self.blocks2(x)
+
+        outs = []
+        stages = (self.blocks3, self.blocks4, self.blocks5, self.blocks6)
+        for idx, stage in enumerate(stages):
+            x = stage(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
@@ -0,0 +1,402 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNet']
+
+
+class ConvBNLayer(nn.Layer):
+    """Conv2D (no bias, Kaiming-normal init) -> BatchNorm2D -> optional ReLU / ReLU6.
+
+    The batch-norm layer is only created when ``norm_type`` is 'sync_bn' or
+    'bn'; with any other value ``forward`` has no ``_batch_norm`` to call.
+    ``act`` values other than "relu" and "relu6" apply no activation.
+    ``name`` is accepted but not used by this layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act='relu',
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.act = act
+
+        conv_weight_attr = ParamAttr(
+            learning_rate=conv_lr,
+            initializer=KaimingNormal(),
+            regularizer=L2Decay(conv_decay))
+        self._conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=conv_weight_attr,
+            bias_attr=False)
+
+        norm_weight_attr = ParamAttr(regularizer=L2Decay(norm_decay))
+        norm_bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
+        if norm_type in ['sync_bn', 'bn']:
+            self._batch_norm = nn.BatchNorm2D(
+                out_channels,
+                weight_attr=norm_weight_attr,
+                bias_attr=norm_bias_attr)
+
+    def forward(self, x):
+        out = self._batch_norm(self._conv(x))
+        if self.act == "relu":
+            return F.relu(out)
+        if self.act == "relu6":
+            return F.relu6(out)
+        return out
+
+
+class DepthwiseSeparable(nn.Layer):
+    """3x3 grouped ConvBNLayer followed by a 1x1 pointwise ConvBNLayer.
+
+    Args:
+        in_channels: input channels of the depthwise conv (used as given).
+        out_channels1: depthwise output channels before scaling.
+        out_channels2: pointwise output channels before scaling.
+        num_groups: depthwise groups before scaling.
+        stride: stride of the depthwise conv.
+        scale: multiplier applied (with ``int``) to ``out_channels1``,
+            ``out_channels2`` and ``num_groups``.
+        conv_lr, conv_decay, norm_decay, norm_type: forwarded to ConvBNLayer.
+        name: optional prefix for the sub-layer names; None is treated as
+            an empty prefix.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups,
+                 stride,
+                 scale,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+
+        # The default name=None used to raise TypeError on `name + "_dw"`.
+        prefix = "" if name is None else name
+
+        self._depthwise_conv = ConvBNLayer(
+            in_channels,
+            int(out_channels1 * scale),
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=prefix + "_dw")
+
+        self._pointwise_conv = ConvBNLayer(
+            int(out_channels1 * scale),
+            int(out_channels2 * scale),
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=prefix + "_sep")
+
+    def forward(self, x):
+        x = self._depthwise_conv(x)
+        x = self._pointwise_conv(x)
+        return x
+
+
+class ExtraBlock(nn.Layer):
+    """1x1 ConvBNLayer followed by a 3x3 strided ConvBNLayer, both with ReLU6.
+
+    Args:
+        in_channels: input channels of the 1x1 conv.
+        out_channels1: output channels of the 1x1 conv.
+        out_channels2: output channels of the 3x3 conv.
+        num_groups: groups used by both convs.
+        stride: stride of the 3x3 conv.
+        conv_lr, conv_decay, norm_decay, norm_type: forwarded to ConvBNLayer.
+        name: optional prefix for the sub-layer names; None is treated as
+            an empty prefix.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups=1,
+                 stride=2,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ExtraBlock, self).__init__()
+
+        # The default name=None used to raise TypeError on `name + "_extra1"`.
+        prefix = "" if name is None else name
+
+        self.pointwise_conv = ConvBNLayer(
+            in_channels,
+            int(out_channels1),
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=prefix + "_extra1")
+
+        self.normal_conv = ConvBNLayer(
+            int(out_channels1),
+            int(out_channels2),
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=prefix + "_extra2")
+
+    def forward(self, x):
+        x = self.pointwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 conv_decay=0.,
+                 scale=1,
+                 conv_learning_rate=1.0,
+                 feature_maps=[4, 6, 13],
+                 with_extra_blocks=False,
+                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
+                                      [64, 128]]):
+        super(MobileNet, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        self._out_channels = []
+
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=int(32 * scale),
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_lr=conv_learning_rate,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name="conv1")
+
+        self.dwsl = []
+        dws21 = self.add_sublayer(
+            "conv2_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(32 * scale),
+                out_channels1=32,
+                out_channels2=64,
+                num_groups=32,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_1"))
+        self.dwsl.append(dws21)
+        self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps)
+        dws22 = self.add_sublayer(
+            "conv2_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(64 * scale),
+                out_channels1=64,
+                out_channels2=128,
+                num_groups=64,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_2"))
+        self.dwsl.append(dws22)
+        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
+        # 1/4
+        dws31 = self.add_sublayer(
+            "conv3_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=128,
+                num_groups=128,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_1"))
+        self.dwsl.append(dws31)
+        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
+        dws32 = self.add_sublayer(
+            "conv3_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=256,
+                num_groups=128,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_2"))
+        self.dwsl.append(dws32)
+        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
+        # 1/8
+        dws41 = self.add_sublayer(
+            "conv4_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=256,
+                num_groups=256,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_1"))
+        self.dwsl.append(dws41)
+        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
+        dws42 = self.add_sublayer(
+            "conv4_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=512,
+                num_groups=256,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_2"))
+        self.dwsl.append(dws42)
+        self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
+        # 1/16
+        for i in range(5):
+            tmp = self.add_sublayer(
+                "conv5_" + str(i + 1),
+                sublayer=DepthwiseSeparable(
+                    in_channels=int(512 * scale),
+                    out_channels1=512,
+                    out_channels2=512,
+                    num_groups=512,
+                    stride=1,
+                    scale=scale,
+                    conv_lr=conv_learning_rate,
+                    conv_decay=conv_decay,
+                    norm_decay=norm_decay,
+                    norm_type=norm_type,
+                    name="conv5_" + str(i + 1)))
+            self.dwsl.append(tmp)
+            self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
+        dws56 = self.add_sublayer(
+            "conv5_6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(512 * scale),
+                out_channels1=512,
+                out_channels2=1024,
+                num_groups=512,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv5_6"))
+        self.dwsl.append(dws56)
+        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
+        # 1/32
+        dws6 = self.add_sublayer(
+            "conv6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(1024 * scale),
+                out_channels1=1024,
+                out_channels2=1024,
+                num_groups=1024,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv6"))
+        self.dwsl.append(dws6)
+        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_blocks = []
+            for i, block_filter in enumerate(self.extra_block_filters):
+                in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]
+                conv_extra = self.add_sublayer(
+                    "conv7_" + str(i + 1),
+                    sublayer=ExtraBlock(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        conv_lr=conv_learning_rate,
+                        conv_decay=conv_decay,
+                        norm_decay=norm_decay,
+                        norm_type=norm_type,
+                        name="conv7_" + str(i + 1)))
+                self.extra_blocks.append(conv_extra)
+                self._update_out_channels(
+                    block_filter[1],
+                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record `channel` when `feature_idx` is one of the selected feature maps.
+
+        Args:
+            channel: channel count appended to `self._out_channels`.
+            feature_idx: index of the block that was just registered.
+            feature_maps: collection of block indices exposed as outputs.
+        """
+        if feature_idx not in feature_maps:
+            return
+        self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Run the backbone and collect the selected feature maps.
+
+        Args:
+            inputs: mapping whose 'image' entry is fed to `self.conv1`.
+
+        Returns:
+            list: outputs of every block whose 1-based position (depthwise
+            blocks first, then extra blocks) is listed in `self.feature_maps`.
+        """
+        y = self.conv1(inputs['image'])
+        outs = []
+        for idx, block in enumerate(self.dwsl, start=1):
+            y = block(y)
+            if idx in self.feature_maps:
+                outs.append(y)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        # The extra blocks continue from the output of the last depthwise
+        # block (the first one is built for that block's channel count).
+        # Restarting from `outs[-1]` raised IndexError when nothing had been
+        # collected yet and fed the wrong tensor whenever the last depthwise
+        # block was not itself a selected feature map.
+        first_extra_idx = len(self.dwsl) + 1
+        for idx, block in enumerate(self.extra_blocks, start=first_extra_idx):
+            y = block(y)
+            if idx in self.feature_maps:
+                outs.append(y)
+        return outs
+
+    @property
+    def out_shape(self):
+        """One `ShapeSpec` per recorded output channel count, in recording order."""
+        specs = []
+        for num_channels in self._out_channels:
+            specs.append(ShapeSpec(channels=num_channels))
+        return specs
--- a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
@@ -0,0 +1,478 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNetV3']
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round `v` to a multiple of `divisor`, never going below `min_value`.
+
+    Args:
+        v: value to round (typically a scaled channel count).
+        divisor: the result is a multiple of this value unless `min_value`
+            wins the lower bound.
+        min_value: lower bound of the result; defaults to `divisor`.
+
+    Returns:
+        The rounded value, bumped up by one `divisor` step when rounding
+        would otherwise lose more than 10% of `v`.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, rounded)
+    if result < 0.9 * v:
+        result += divisor
+    return result
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D followed by BatchNorm2D and an optional activation.
+
+    Args:
+        in_c: input channels of the convolution.
+        out_c: output channels of the convolution and size of the norm layer.
+        filter_size: convolution kernel size.
+        stride: convolution stride.
+        padding: padding forwarded unchanged to `nn.Conv2D`.
+        num_groups: number of convolution groups.
+        act: None, "relu", "relu6" or "hard_swish".
+        lr_mult: learning-rate multiplier of the conv weight; also used for
+            the norm parameters unless `freeze_norm` is set.
+        conv_decay: L2 decay coefficient of the conv weight.
+        norm_type: 'bn' or 'sync_bn'; both create `nn.BatchNorm2D` here.
+        norm_decay: L2 decay coefficient of the norm scale and bias.
+        freeze_norm: when true the norm parameters get learning rate 0, are
+            created non-trainable, have `stop_gradient` set and the norm
+            layer is built with `use_global_stats=True`.
+        name: accepted for interface compatibility; not used by this layer.
+
+    Raises:
+        ValueError: if `norm_type` is neither 'bn' nor 'sync_bn'.
+    """
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act=None,
+                 lr_mult=1.,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=""):
+        super(ConvBNLayer, self).__init__()
+        # Fail early with a clear message: an unsupported norm_type used to
+        # leave `self.bn` undefined and surface as an AttributeError below.
+        if norm_type not in ['sync_bn', 'bn']:
+            raise ValueError(
+                "Unsupported norm_type '{}', expected 'bn' or 'sync_bn'.".
+                format(norm_type))
+
+        self.act = act
+        self.conv = nn.Conv2D(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+
+        # A frozen norm layer does not learn and uses its global statistics.
+        norm_lr = 0. if freeze_norm else lr_mult
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=not freeze_norm)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=not freeze_norm)
+        global_stats = True if freeze_norm else None
+        self.bn = nn.BatchNorm2D(
+            out_c,
+            weight_attr=param_attr,
+            bias_attr=bias_attr,
+            use_global_stats=global_stats)
+        if freeze_norm:
+            for param in self.bn.parameters():
+                param.stop_gradient = True
+
+    def forward(self, x):
+        """Apply convolution, normalization and the configured activation.
+
+        Raises:
+            NotImplementedError: if `act` is not one of the supported names.
+        """
+        x = self.bn(self.conv(x))
+        if self.act is None:
+            return x
+        if self.act == "relu":
+            return F.relu(x)
+        if self.act == "relu6":
+            return F.relu6(x)
+        if self.act == "hard_swish":
+            return F.hardswish(x)
+        raise NotImplementedError(
+            "The activation function is selected incorrectly.")
+
+
+class ResidualUnit(nn.Layer):
+    """1x1 expand conv -> grouped conv -> optional SE -> 1x1 linear conv.
+
+    The input is added back to the result when `stride == 1` and
+    `in_c == out_c`. With `return_list` the output of the expand conv is
+    returned together with the block output.
+    """
+
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None,
+                 return_list=False,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_c == out_c
+        self.use_se = use_se
+        self.return_list = return_list
+
+        # Settings shared by the three ConvBNLayer sub-layers.
+        shared = dict(
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm)
+
+        self.expand_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=act,
+            name=name + "_expand",
+            **shared)
+        self.bottleneck_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            num_groups=mid_c,
+            act=act,
+            name=name + "_depthwise",
+            **shared)
+        if self.use_se:
+            self.mid_se = SEModule(
+                mid_c, lr_mult, conv_decay, name=name + "_se")
+        # The projection back to `out_c` has no activation.
+        self.linear_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            name=name + "_linear",
+            **shared)
+
+    def forward(self, inputs):
+        """Return the block output, or `[expanded, output]` when `return_list` is set."""
+        expanded = self.expand_conv(inputs)
+        out = self.bottleneck_conv(expanded)
+        if self.use_se:
+            out = self.mid_se(out)
+        out = self.linear_conv(out)
+        if self.if_shortcut:
+            out = paddle.add(inputs, out)
+        return [expanded, out] if self.return_list else out
+
+
+class SEModule(nn.Layer):
+    """Channel gate: global average pool -> 1x1 conv -> relu -> 1x1 conv -> hardsigmoid.
+
+    The resulting gate is multiplied with the input. The first conv reduces
+    `channel` to `channel // reduction`; the second restores it. `name` is
+    accepted but not used.
+    """
+
+    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
+        super(SEModule, self).__init__()
+
+        def _attr():
+            # Every weight and bias gets its own ParamAttr with equal settings.
+            return ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay))
+
+        self.avg_pool = nn.AdaptiveAvgPool2D(1)
+        squeezed_c = int(channel // reduction)
+        self.conv1 = nn.Conv2D(
+            in_channels=channel,
+            out_channels=squeezed_c,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=_attr(),
+            bias_attr=_attr())
+        self.conv2 = nn.Conv2D(
+            in_channels=squeezed_c,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=_attr(),
+            bias_attr=_attr())
+
+    def forward(self, inputs):
+        """Scale `inputs` by the gate computed from its pooled values."""
+        gate = self.avg_pool(inputs)
+        gate = F.relu(self.conv1(gate))
+        gate = self.conv2(gate)
+        gate = F.hardsigmoid(gate, slope=0.2, offset=0.5)
+        return paddle.multiply(x=inputs, y=gate)
+
+
+class ExtraBlockDW(nn.Layer):
+    """Extra feature block made of three `ConvBNLayer`s, all with relu6.
+
+    Layout: 1x1 conv (`in_c` -> `ch_1`), 3x3 conv with `ch_1` groups and the
+    given `stride` (`ch_1` -> `ch_2`), then 1x1 conv (`ch_2` -> `ch_2`).
+    All three use 'SAME' padding.
+
+    Args:
+        in_c: input channels.
+        ch_1: channels after the first 1x1 conv.
+        ch_2: output channels.
+        stride: stride of the 3x3 grouped conv.
+        lr_mult: learning-rate multiplier passed to every sub-layer.
+        conv_decay: L2 decay coefficient of the conv weights.
+        norm_type: norm type passed to every sub-layer.
+        norm_decay: L2 decay coefficient of the norm parameters.
+        freeze_norm: whether the norm layers are frozen.
+        name: prefix of the sub-layer names; None is treated as "".
+    """
+
+    def __init__(self,
+                 in_c,
+                 ch_1,
+                 ch_2,
+                 stride,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None):
+        super(ExtraBlockDW, self).__init__()
+        # The default `name=None` used to raise TypeError on `name + "_extra1"`;
+        # fall back to an empty prefix so the default is actually usable.
+        prefix = "" if name is None else name
+
+        # Settings shared by all three sub-layers.
+        shared = dict(
+            padding='SAME',
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm)
+
+        self.pointwise_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=ch_1,
+            filter_size=1,
+            stride=1,
+            name=prefix + "_extra1",
+            **shared)
+        self.depthwise_conv = ConvBNLayer(
+            in_c=ch_1,
+            out_c=ch_2,
+            filter_size=3,
+            stride=stride,
+            num_groups=int(ch_1),
+            name=prefix + "_extra2_dw",
+            **shared)
+        self.normal_conv = ConvBNLayer(
+            in_c=ch_2,
+            out_c=ch_2,
+            filter_size=1,
+            stride=1,
+            name=prefix + "_extra2_sep",
+            **shared)
+
+    def forward(self, inputs):
+        """Apply the three convolutions in sequence."""
+        x = self.pointwise_conv(inputs)
+        x = self.depthwise_conv(x)
+        return self.normal_conv(x)
+
+
+@register
+@serializable
+class MobileNetV3(nn.Layer):
+    """MobileNetV3 backbone assembled from `ResidualUnit` blocks.
+
+    The stem `conv1` is followed by one `ResidualUnit` per row of the
+    "large" or "small" configuration table. The block at 0-based position
+    `i` is registered as sub-layer "conv{i + 2}" and is exposed as an output
+    when `i + 2` is listed in `feature_maps`. With `with_extra_blocks`, a 1x1
+    `ConvBNLayer` and one stride-2 `ExtraBlockDW` per entry of
+    `extra_block_filters` are appended and numbered consecutively.
+
+    Args:
+        scale: width multiplier; channel counts are `make_divisible(scale * c)`.
+        model_name: "large" or "small", selects the configuration table.
+        feature_maps: layer indices to output; a single integer is wrapped
+            into a list.
+        with_extra_blocks: whether to append the extra blocks. When set, a
+            selected residual unit contributes the output of its expand conv
+            instead of its final output.
+        extra_block_filters: `[ch_1, ch_2]` pair for every `ExtraBlockDW`.
+        lr_mult_list: learning-rate multipliers; `conv1` uses entry 0 and
+            block `i` uses entry `min(i // 3, len(lr_mult_list) - 1)`.
+        conv_decay: L2 decay coefficient of the conv weights.
+        multiplier: when not 1.0, rescales the output channels of the third
+            from last table row and the expansion and output channels of the
+            last two rows.
+        norm_type: norm type passed to every sub-layer.
+        norm_decay: L2 decay coefficient of the norm parameters.
+        freeze_norm: whether the norm layers are frozen.
+
+    Raises:
+        ValueError: if `norm_type` is 'sync_bn' while `freeze_norm` is True.
+        NotImplementedError: if `model_name` is neither "large" nor "small".
+    """
+    __shared__ = ['norm_type']
+
+    def __init__(
+            self,
+            scale=1.0,
+            model_name="large",
+            feature_maps=[6, 12, 15],
+            with_extra_blocks=False,
+            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
+            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+            conv_decay=0.0,
+            multiplier=1.0,
+            norm_type='bn',
+            norm_decay=0.0,
+            freeze_norm=False):
+        super(MobileNetV3, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        if norm_type == 'sync_bn' and freeze_norm:
+            raise ValueError(
+                "The norm_type should not be sync_bn when freeze_norm is True")
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        inplanes = 16
+        # Each row: kernel size, expansion channels, output channels,
+        # use SE, activation name, stride.
+        if model_name == "large":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, False, "relu", 1],
+                [3, 64, 24, False, "relu", 2],
+                [3, 72, 24, False, "relu", 1],
+                [5, 72, 40, True, "relu", 2],  # RCNN output
+                [5, 120, 40, True, "relu", 1],
+                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
+                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
+                [3, 200, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 480, 112, True, "hard_swish", 1],
+                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 672, 160, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 960, 160, True, "hard_swish", 1],
+                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        elif model_name == "small":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, True, "relu", 2],
+                [3, 72, 24, False, "relu", 2],  # RCNN output
+                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
+                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 120, 48, True, "hard_swish", 1],
+                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 576, 96, True, "hard_swish", 1],
+                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        else:
+            raise NotImplementedError(
+                "mode[{}_model] is not implemented!".format(model_name))
+
+        # Rescale only the tail of the table: the output channels of the
+        # third from last row and both channel entries of the last two rows.
+        if multiplier != 1.0:
+            self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
+            self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
+            self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
+            self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
+            self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
+
+        self.conv1 = ConvBNLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            act="hard_swish",
+            lr_mult=lr_mult_list[0],
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="conv1")
+
+        self._out_channels = []
+        self.block_list = []
+        # `i` counts the blocks registered so far; block `i` is named
+        # "conv{i + 2}" because "conv1" is the stem.
+        i = 0
+        inplanes = make_divisible(inplanes * scale)
+        for (k, exp, c, se, nl, s) in self.cfg:
+            # Every three consecutive blocks share one learning-rate entry.
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
+            return_list = self.with_extra_blocks and i + 2 in self.feature_maps
+
+            block = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=make_divisible(scale * c),
+                    filter_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=nl,
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    return_list=return_list,
+                    name="conv" + str(i + 2)))
+            self.block_list.append(block)
+            inplanes = make_divisible(scale * c)
+            i += 1
+            # `i` was just incremented, so `i + 1` is this block's index.
+            # The recorded width is the expansion width when the block
+            # outputs its expand conv, otherwise its output width.
+            self._update_out_channels(
+                make_divisible(scale * exp)
+                if return_list else inplanes, i + 1, feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_block_list = []
+            extra_out_c = make_divisible(scale * self.cfg[-1][1])
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # 1x1 conv widening the last block output to the expansion width
+            # of the last table row.
+            conv_extra = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ConvBNLayer(
+                    in_c=inplanes,
+                    out_c=extra_out_c,
+                    filter_size=1,
+                    stride=1,
+                    padding=0,
+                    num_groups=1,
+                    act="hard_swish",
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    name="conv" + str(i + 2)))
+            self.extra_block_list.append(conv_extra)
+            i += 1
+            self._update_out_channels(extra_out_c, i + 1, feature_maps)
+
+            # Each ExtraBlockDW takes the previous extra block's output width.
+            for j, block_filter in enumerate(self.extra_block_filters):
+                in_c = extra_out_c if j == 0 else self.extra_block_filters[j -
+                                                                           1][1]
+                conv_extra = self.add_sublayer(
+                    "conv" + str(i + 2),
+                    sublayer=ExtraBlockDW(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        stride=2,
+                        lr_mult=lr_mult,
+                        conv_decay=conv_decay,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name='conv' + str(i + 2)))
+                self.extra_block_list.append(conv_extra)
+                i += 1
+                self._update_out_channels(block_filter[1], i + 1, feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Append `channel` to `_out_channels` when `feature_idx` is in `feature_maps`."""
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Return the list of feature maps selected by `self.feature_maps`.
+
+        `inputs['image']` is fed to the stem. Blocks are matched against
+        `feature_maps` with the same `position + 2` numbering used for the
+        sub-layer names.
+        """
+        x = self.conv1(inputs['image'])
+        outs = []
+        for idx, block in enumerate(self.block_list):
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                # A block built with return_list yields [expanded, output]:
+                # the expanded tensor is collected and the output is
+                # passed on to the next block.
+                if isinstance(x, list):
+                    outs.append(x[0])
+                    x = x[1]
+                else:
+                    outs.append(x)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        # Extra blocks continue the numbering after the residual blocks.
+        for i, block in enumerate(self.extra_block_list):
+            idx = i + len(self.block_list)
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        """One `ShapeSpec` per recorded output channel count."""
+        return [ShapeSpec(channels=c) for c in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
+Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
+The copyright of microsoft/Swin-Transformer is as follows:
+MIT License [see LICENSE for details]
+"""
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Normal, Constant
+
+from ppdet.modeling.ops import get_act_fn
+from ppdet.modeling.layers import ConvNormLayer
+
+
+class MobileOneBlock(nn.Layer):
+    """Re-parameterizable block with a grouped-conv stage and a 1x1-conv stage.
+
+    Training-time structure (see `forward`):
+      * stage 1 sums `conv_num` `kernel_size` x `kernel_size` ConvNormLayer
+        branches with `groups=ch_in`, one 1x1 branch (`rbr_1x1`) and an
+        optional BatchNorm identity branch, then applies the activation;
+      * stage 2 sums `conv_num` 1x1 ConvNormLayer branches (`ch_in` ->
+        `ch_out`) and an optional BatchNorm identity branch, then applies
+        the activation.
+    Both identity branches exist only when `ch_in == ch_out` and
+    `stride == 1`. `convert_to_deploy` folds each stage into one plain
+    convolution (`conv1`, `conv2`).
+
+    Args:
+        ch_in: input channels.
+        ch_out: output channels.
+        stride: stride of the stage-1 branches.
+        kernel_size: kernel size of the stage-1 `depth_conv` branches.
+        conv_num: number of parallel branches in each stage.
+        norm_type, norm_decay, norm_groups, bias_on, lr_scale, freeze_norm,
+        initializer, skip_quant: forwarded unchanged to every ConvNormLayer.
+        act: None, str or dict are resolved through `get_act_fn`; any other
+            value is used as the activation as given.
+    """
+
+    def __init__(
+            self,
+            ch_in,
+            ch_out,
+            stride,
+            kernel_size,
+            conv_num=1,
+            norm_type='bn',
+            norm_decay=0.,
+            norm_groups=32,
+            bias_on=False,
+            lr_scale=1.,
+            freeze_norm=False,
+            initializer=Normal(
+                mean=0., std=0.01),
+            skip_quant=False,
+            act='relu', ):
+        super(MobileOneBlock, self).__init__()
+
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.kernel_size = kernel_size
+        self.stride = stride
+        # Padding used by the fused `conv1` created in convert_to_deploy.
+        self.padding = (kernel_size - 1) // 2
+        self.k = conv_num
+
+        self.depth_conv = nn.LayerList()
+        self.point_conv = nn.LayerList()
+        # One stage-1 and one stage-2 branch per iteration.
+        for _ in range(self.k):
+            self.depth_conv.append(
+                ConvNormLayer(
+                    ch_in,
+                    ch_in,
+                    kernel_size,
+                    stride=stride,
+                    groups=ch_in,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    norm_groups=norm_groups,
+                    bias_on=bias_on,
+                    lr_scale=lr_scale,
+                    freeze_norm=freeze_norm,
+                    initializer=initializer,
+                    skip_quant=skip_quant))
+            self.point_conv.append(
+                ConvNormLayer(
+                    ch_in,
+                    ch_out,
+                    1,
+                    stride=1,
+                    groups=1,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    norm_groups=norm_groups,
+                    bias_on=bias_on,
+                    lr_scale=lr_scale,
+                    freeze_norm=freeze_norm,
+                    initializer=initializer,
+                    skip_quant=skip_quant))
+        # Extra 1x1 branch of stage 1, with the same stride and groups.
+        self.rbr_1x1 = ConvNormLayer(
+            ch_in,
+            ch_in,
+            1,
+            stride=self.stride,
+            groups=ch_in,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            norm_groups=norm_groups,
+            bias_on=bias_on,
+            lr_scale=lr_scale,
+            freeze_norm=freeze_norm,
+            initializer=initializer,
+            skip_quant=skip_quant)
+        # Identity (BatchNorm-only) branches; None when the shapes of the
+        # block input and output cannot match.
+        self.rbr_identity_st1 = nn.BatchNorm2D(
+            num_features=ch_in,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(
+                0.0))) if ch_in == ch_out and self.stride == 1 else None
+        self.rbr_identity_st2 = nn.BatchNorm2D(
+            num_features=ch_out,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(
+                0.0))) if ch_in == ch_out and self.stride == 1 else None
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+
+    def forward(self, x):
+        """Run the fused path if `conv1`/`conv2` exist, else the multi-branch path."""
+        # `conv1` and `conv2` are only created by convert_to_deploy.
+        if hasattr(self, "conv1") and hasattr(self, "conv2"):
+            y = self.act(self.conv2(self.act(self.conv1(x))))
+        else:
+            # Stage 1: sum of all depth_conv branches, the 1x1 branch and
+            # the optional identity branch.
+            if self.rbr_identity_st1 is None:
+                id_out_st1 = 0
+            else:
+                id_out_st1 = self.rbr_identity_st1(x)
+
+            x1_1 = 0
+            for i in range(self.k):
+                x1_1 += self.depth_conv[i](x)
+
+            x1_2 = self.rbr_1x1(x)
+            x1 = self.act(x1_1 + x1_2 + id_out_st1)
+
+            # Stage 2: sum of all point_conv branches and the optional
+            # identity branch, applied to the stage-1 result.
+            if self.rbr_identity_st2 is None:
+                id_out_st2 = 0
+            else:
+                id_out_st2 = self.rbr_identity_st2(x1)
+
+            x2_1 = 0
+            for i in range(self.k):
+                x2_1 += self.point_conv[i](x1)
+            y = self.act(x2_1 + id_out_st2)
+
+        return y
+
+    def convert_to_deploy(self):
+        """Replace the training branches with two fused convolutions.
+
+        Creates `conv1` and `conv2` if they do not exist yet, loads the
+        kernels and biases from `get_equivalent_kernel_bias`, then deletes
+        the branch attributes so `forward` takes the fused path.
+        """
+        if not hasattr(self, 'conv1'):
+            self.conv1 = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_in,
+                kernel_size=self.kernel_size,
+                stride=self.stride,
+                padding=self.padding,
+                groups=self.ch_in,
+                bias_attr=ParamAttr(
+                    initializer=Constant(value=0.), learning_rate=1.))
+        if not hasattr(self, 'conv2'):
+            self.conv2 = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=1,
+                stride=1,
+                padding='SAME',
+                groups=1,
+                bias_attr=ParamAttr(
+                    initializer=Constant(value=0.), learning_rate=1.))
+
+        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
+        )
+        self.conv1.weight.set_value(conv1_kernel)
+        self.conv1.bias.set_value(conv1_bias)
+        self.conv2.weight.set_value(conv2_kernel)
+        self.conv2.bias.set_value(conv2_bias)
+        # The fused kernels were computed above, so the branches can go.
+        self.__delattr__('depth_conv')
+        self.__delattr__('point_conv')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity_st1'):
+            self.__delattr__('rbr_identity_st1')
+        if hasattr(self, 'rbr_identity_st2'):
+            self.__delattr__('rbr_identity_st2')
+
+    def get_equivalent_kernel_bias(self):
+        """Return `(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)`.
+
+        Each stage's kernel and bias is the sum of the norm-folded kernels
+        and biases of its branches; a missing identity branch contributes 0.
+        """
+        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
+        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
+            self.rbr_identity_st1, kernel_size=self.kernel_size)
+
+        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
+        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
+            self.rbr_identity_st2, kernel_size=1)
+
+        # The 1x1 branch kernel is padded up to the depth_conv kernel size
+        # before the kernels are added.
+        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            st1_kernel1x1) + st1_kernelid
+
+        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
+
+        conv2_kernel = st2_kernel1x1 + st2_kernelid
+        conv2_bias = st2_bias1x1 + st2_biasid
+
+        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        """Pad a 1x1 kernel by `(kernel_size - 1) // 2` on every side; None yields 0."""
+        if kernel1x1 is None:
+            return 0
+        else:
+            # Despite the method name the target size follows
+            # `self.kernel_size`, not a fixed 3.
+            padding_size = (self.kernel_size - 1) // 2
+            return nn.functional.pad(
+                kernel1x1,
+                [padding_size, padding_size, padding_size, padding_size])
+
+    def _fuse_bn_tensor(self, branch, kernel_size=3):
+        """Fold a branch's norm statistics into an equivalent kernel and bias.
+
+        Args:
+            branch: None, an `nn.LayerList` of conv+norm blocks, a single
+                `ConvNormLayer`, or an `nn.BatchNorm2D` identity branch.
+            kernel_size: spatial size of the identity kernel built for a
+                BatchNorm2D branch; not used for the other branch types.
+
+        Returns:
+            `(kernel, bias)`; `(0, 0)` for a None branch. For a LayerList the
+            folded kernels and biases of all blocks are summed.
+        """
+        if branch is None:
+            return 0, 0
+
+        if isinstance(branch, nn.LayerList):
+            fused_kernels = []
+            fused_bias = []
+            for block in branch:
+                kernel = block.conv.weight
+                running_mean = block.norm._mean
+                running_var = block.norm._variance
+                gamma = block.norm.weight
+                beta = block.norm.bias
+                eps = block.norm._epsilon
+
+                # Per-output-channel scale gamma / sqrt(var + eps).
+                std = (running_var + eps).sqrt()
+                t = (gamma / std).reshape((-1, 1, 1, 1))
+
+                fused_kernels.append(kernel * t)
+                fused_bias.append(beta - running_mean * gamma / std)
+
+            return sum(fused_kernels), sum(fused_bias)
+
+        elif isinstance(branch, ConvNormLayer):
+            kernel = branch.conv.weight
+            running_mean = branch.norm._mean
+            running_var = branch.norm._variance
+            gamma = branch.norm.weight
+            beta = branch.norm.bias
+            eps = branch.norm._epsilon
+        else:
+            assert isinstance(branch, nn.BatchNorm2D)
+            # Build a kernel that acts as identity. kernel_size == 1 (used
+            # for the stage-2 identity) gives a dense [ch_in, ch_in, 1, 1]
+            # kernel; larger sizes give a one-input-channel-per-filter
+            # kernel with a single 1 at the spatial center.
+            # NOTE(review): the stage-1 identity passes self.kernel_size, so
+            # a block built with kernel_size == 1 would get the dense shape
+            # here - confirm that configuration is never used.
+            input_dim = self.ch_in if kernel_size == 1 else 1
+            kernel_value = paddle.zeros(
+                shape=[self.ch_in, input_dim, kernel_size, kernel_size],
+                dtype='float32')
+            if kernel_size > 1:
+                for i in range(self.ch_in):
+                    kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
+                        kernel_size - 1) // 2] = 1
+            elif kernel_size == 1:
+                for i in range(self.ch_in):
+                    kernel_value[i, i % input_dim, 0, 0] = 1
+            else:
+                raise ValueError("Invalid kernel size recieved!")
+            kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
+            running_mean = branch._mean
+            running_var = branch._variance
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch._epsilon
+
+        # Shared folding step for the ConvNormLayer and BatchNorm2D cases.
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape((-1, 1, 1, 1))
+
+        return kernel * t, beta - running_mean * gamma / std
--- a/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
@@ -0,0 +1,69 @@
+class NameAdapter(object):
+    """Translate backbone layer names into the names used by pretrained weights.
+
+    The scheme is picked from the wrapped model's ``_model_type`` attribute,
+    which is looked up on every call: ``'SEResNeXt'`` has its own naming rules,
+    any other value (or a missing attribute) uses the ``res*`` / ``_branch*``
+    rules.
+    """
+
+    def __init__(self, model):
+        super(NameAdapter, self).__init__()
+        self.model = model
+
+    @property
+    def model_type(self):
+        """``model._model_type``, or ``''`` when the model has no such attribute."""
+        return getattr(self.model, '_model_type', '')
+
+    @property
+    def variant(self):
+        """``model.variant``, or ``''`` when the model has no such attribute."""
+        return getattr(self.model, 'variant', '')
+
+    def _is_se_resnext(self):
+        # Single place for the model-type test shared by the fix_* methods.
+        return self.model_type == 'SEResNeXt'
+
+    def fix_conv_norm_name(self, name):
+        """Return the batch-norm name that belongs to the conv layer ``name``."""
+        # the naming rule is same as pretrained weight
+        if self._is_se_resnext():
+            return name + "_bn"
+        # "conv1" -> "bn_conv1"; otherwise the first three characters become "bn"
+        return "bn_" + name if name == "conv1" else "bn" + name[3:]
+
+    def fix_shortcut_name(self, name):
+        """Return the shortcut layer name; only SEResNeXt rewrites it."""
+        return 'conv' + name + '_prj' if self._is_se_resnext() else name
+
+    def fix_bottleneck_name(self, name):
+        """Return ``(conv1, conv2, conv3, shortcut)`` names for a bottleneck block."""
+        if self._is_se_resnext():
+            convs = tuple('conv' + name + tail for tail in ('_x1', '_x2', '_x3'))
+            return convs + (name, )
+        tails = ("_branch2a", "_branch2b", "_branch2c", "_branch1")
+        return tuple(name + tail for tail in tails)
+
+    def fix_basicblock_name(self, name):
+        """Return ``(conv1, conv2, shortcut)`` names for a basic block."""
+        if self._is_se_resnext():
+            convs = tuple('conv' + name + tail for tail in ('_x1', '_x2'))
+            return convs + (name, )
+        tails = ("_branch2a", "_branch2b", "_branch1")
+        return tuple(name + tail for tail in tails)
+
+    def fix_layer_warp_name(self, stage_num, count, i):
+        """Return the name of block ``i`` (0-based) out of ``count`` in stage ``stage_num``."""
+        if count > 10 and stage_num == 4:
+            # long res4 stages are named "res4a", "res4b1", "res4b2", ...
+            tail = "a" if i == 0 else "b" + str(i)
+        else:
+            # otherwise blocks are lettered: "a", "b", "c", ...
+            tail = chr(ord("a") + i)
+        if self._is_se_resnext():
+            return str(stage_num + 2) + '_' + str(i + 1)
+        return 'res' + str(stage_num) + tail
+
+    def fix_c1_stage_name(self):
+        """Return the name of the stem convolution."""
+        if self.model_type == 'ResNeXt':
+            return "res_conv1"
+        return "conv1"
--- a/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
@@ -0,0 +1,611 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import math
+from numbers import Integral
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Uniform
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+from paddle.vision.ops import DeformConv2D
+from .name_adapter import NameAdapter
+from ..shape_spec import ShapeSpec
+
+__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
+
+# Number of residual blocks in each of the four stages (res2..res5), keyed by
+# network depth. ResNet.__init__ indexes this with `depth` and passes entry i
+# as the block count of stage i.
+ResNet_cfg = {
+    18: [2, 2, 2, 2],
+    34: [3, 4, 6, 3],
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    152: [3, 8, 36, 3],
+}
+
+
+class ConvNormLayer(nn.Layer):
+    """Convolution (plain or deformable v2), batch norm, then an optional activation.
+
+    Args:
+        ch_in (int): input channels.
+        ch_out (int): output channels.
+        filter_size (int): square kernel size; padding is ``(filter_size - 1) // 2``.
+        stride (int): convolution stride.
+        groups (int): convolution groups.
+        act (str|None): name of a function in ``paddle.nn.functional`` applied
+            after the norm; a falsy value disables the activation.
+        norm_type (str): 'bn' or 'sync_bn'; both build ``nn.BatchNorm2D``.
+        norm_decay (float): L2 decay coefficient for the norm weight and bias.
+        freeze_norm (bool): when true the norm parameters get learning rate 0,
+            are not trainable, have ``stop_gradient`` set, and the norm is
+            built with ``use_global_stats=True``.
+        lr (float): learning rate of the conv weight, and of the norm
+            parameters when they are not frozen.
+        dcn_v2 (bool): use ``DeformConv2D`` driven by an extra conv that
+            predicts offsets and a mask.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride,
+                 groups=1,
+                 act=None,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 lr=1.0,
+                 dcn_v2=False):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn']
+        self.norm_type = norm_type
+        self.act = act
+        self.dcn_v2 = dcn_v2
+
+        pad = (filter_size - 1) // 2
+        if self.dcn_v2:
+            kernel_area = filter_size**2
+            # per kernel position: 2 offset channels and 1 mask channel
+            self.offset_channel = 2 * kernel_area
+            self.mask_channel = kernel_area
+
+            # zero-initialised, so it predicts zero offsets at the start
+            self.conv_offset = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=3 * kernel_area,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=pad,
+                weight_attr=ParamAttr(initializer=Constant(0.)),
+                bias_attr=ParamAttr(initializer=Constant(0.)))
+            self.conv = DeformConv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=pad,
+                dilation=1,
+                groups=groups,
+                weight_attr=ParamAttr(learning_rate=lr),
+                bias_attr=False)
+        else:
+            self.conv = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=pad,
+                groups=groups,
+                weight_attr=ParamAttr(learning_rate=lr),
+                bias_attr=False)
+
+        norm_lr = 0. if freeze_norm else lr
+        trainable = not freeze_norm
+
+        def _norm_attr():
+            # weight and bias each get their own ParamAttr / L2Decay instance
+            return ParamAttr(
+                learning_rate=norm_lr,
+                regularizer=L2Decay(norm_decay),
+                trainable=trainable)
+
+        param_attr = _norm_attr()
+        bias_attr = _norm_attr()
+
+        if norm_type in ['sync_bn', 'bn']:
+            self.norm = nn.BatchNorm2D(
+                ch_out,
+                weight_attr=param_attr,
+                bias_attr=bias_attr,
+                use_global_stats=True if freeze_norm else None)
+        norm_params = self.norm.parameters()
+
+        if freeze_norm:
+            for frozen in norm_params:
+                frozen.stop_gradient = True
+
+    def forward(self, inputs):
+        """Apply conv -> norm -> optional activation to ``inputs``."""
+        if self.dcn_v2:
+            # split the predicted tensor into offsets and a sigmoid-gated mask
+            offset, mask = paddle.split(
+                self.conv_offset(inputs),
+                num_or_sections=[self.offset_channel, self.mask_channel],
+                axis=1)
+            out = self.conv(inputs, offset, mask=F.sigmoid(mask))
+        else:
+            out = self.conv(inputs)
+
+        if self.norm_type in ['bn', 'sync_bn']:
+            out = self.norm(out)
+        if not self.act:
+            return out
+        return getattr(F, self.act)(out)
+
+
+class SELayer(nn.Layer):
+    """Channel gate: pool -> Linear -> ReLU -> Linear -> sigmoid, multiplied onto the input.
+
+    Args:
+        ch (int): number of input (and output) channels.
+        reduction_ratio (int): the hidden width is ``ch // reduction_ratio``.
+    """
+
+    def __init__(self, ch, reduction_ratio=16):
+        super(SELayer, self).__init__()
+        self.pool = nn.AdaptiveAvgPool2D(1)
+        hidden = ch // reduction_ratio
+
+        def _uniform_attr(fan_in):
+            # weights are drawn uniformly from [-1/sqrt(fan_in), 1/sqrt(fan_in)]
+            bound = 1.0 / math.sqrt(fan_in)
+            return paddle.ParamAttr(initializer=Uniform(-bound, bound))
+
+        self.squeeze = nn.Linear(
+            ch, hidden, weight_attr=_uniform_attr(ch), bias_attr=True)
+        self.extract = nn.Linear(
+            hidden, ch, weight_attr=_uniform_attr(hidden), bias_attr=True)
+
+    def forward(self, inputs):
+        """Return ``inputs`` scaled by the per-channel gate computed from it."""
+        gate = paddle.squeeze(self.pool(inputs), axis=[2, 3])
+        gate = F.relu(self.squeeze(gate))
+        gate = F.sigmoid(self.extract(gate))
+        # restore the two trailing axes so the gate broadcasts over them
+        gate = paddle.unsqueeze(gate, axis=[2, 3])
+        return gate * inputs
+
+
+class BasicBlock(nn.Layer):
+    """Residual block made of two 3x3 ConvNormLayers plus a shortcut.
+
+    Args:
+        ch_in (int): input channels.
+        ch_out (int): output channels.
+        stride (int): stride of the first 3x3 conv (and of the projection shortcut).
+        shortcut (bool): True adds the input unchanged; False builds a
+            projection ``self.short`` for it.
+        variant (str): with 'd' and ``stride == 2`` the projection is
+            average-pool followed by a stride-1 1x1 conv.
+        groups (int), base_width (int): must be 1 and 64.
+        lr, norm_type, norm_decay, freeze_norm: forwarded to every ConvNormLayer.
+        dcn_v2 (bool): use a deformable conv for the second 3x3 conv.
+        std_senet (bool): append an SELayer before the residual addition.
+    """
+
+    expansion = 1
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BasicBlock, self).__init__()
+        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'
+
+        # settings shared by every ConvNormLayer in this block
+        shared = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # downsample with pooling, then project with a stride-1 1x1 conv
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out,
+                        filter_size=1,
+                        stride=1,
+                        **shared))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    filter_size=1,
+                    stride=stride,
+                    **shared)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=stride,
+            act='relu',
+            **shared)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=ch_out,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            act=None,
+            dcn_v2=dcn_v2,
+            **shared)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(ch_out)
+
+    def forward(self, inputs):
+        """Return ``relu(branch(inputs) + shortcut(inputs))``."""
+        residual = self.branch2b(self.branch2a(inputs))
+        if self.std_senet:
+            residual = self.se(residual)
+
+        identity = inputs if self.shortcut else self.short(inputs)
+
+        return F.relu(paddle.add(x=residual, y=identity))
+
+
+class BottleNeck(nn.Layer):
+    """1x1 -> 3x3 -> 1x1 residual block producing ``ch_out * expansion`` channels.
+
+    Args:
+        ch_in (int): input channels.
+        ch_out (int): base width; the block outputs ``ch_out * 4`` channels.
+        stride (int): block stride. Variant 'a' applies it in the first 1x1
+            conv, every other variant in the 3x3 conv.
+        shortcut (bool): True adds the input unchanged; False builds a
+            projection ``self.short`` for it.
+        variant (str): with 'd' and ``stride == 2`` the projection is
+            average-pool followed by a stride-1 1x1 conv.
+        groups (int): groups of the 3x3 conv.
+        base_width (int): the inner width is
+            ``int(ch_out * (base_width / 64.)) * groups``.
+        lr, norm_type, norm_decay, freeze_norm: forwarded to every ConvNormLayer.
+        dcn_v2 (bool): use a deformable conv for the 3x3 conv.
+        std_senet (bool): append an SELayer before the residual addition.
+    """
+
+    expansion = 4
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=4,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BottleNeck, self).__init__()
+        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)
+
+        # ResNeXt
+        width = int(ch_out * (base_width / 64.)) * groups
+        expanded = ch_out * self.expansion
+
+        # settings shared by every ConvNormLayer in this block
+        shared = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=width,
+            filter_size=1,
+            stride=stride1,
+            groups=1,
+            act='relu',
+            **shared)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=width,
+            ch_out=width,
+            filter_size=3,
+            stride=stride2,
+            groups=groups,
+            act='relu',
+            dcn_v2=dcn_v2,
+            **shared)
+
+        self.branch2c = ConvNormLayer(
+            ch_in=width,
+            ch_out=expanded,
+            filter_size=1,
+            stride=1,
+            groups=1,
+            **shared)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # downsample with pooling, then project with a stride-1 1x1 conv
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=expanded,
+                        filter_size=1,
+                        stride=1,
+                        **shared))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=expanded,
+                    filter_size=1,
+                    stride=stride,
+                    **shared)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(expanded)
+
+    def forward(self, inputs):
+        """Return ``relu(branch(inputs) + shortcut(inputs))``."""
+        residual = self.branch2c(self.branch2b(self.branch2a(inputs)))
+
+        if self.std_senet:
+            residual = self.se(residual)
+
+        identity = inputs if self.shortcut else self.short(inputs)
+
+        return F.relu(paddle.add(x=residual, y=identity))
+
+
+class Blocks(nn.Layer):
+    """One ResNet stage: ``count`` instances of ``block`` applied in sequence.
+
+    The first block uses a projection shortcut and, except in stage 2, stride 2;
+    the remaining blocks use identity shortcuts with stride 1 and take
+    ``ch_out * block.expansion`` input channels. Sublayer names come from
+    ``name_adapter.fix_layer_warp_name``; all other arguments are forwarded to
+    ``block``.
+    """
+
+    def __init__(self,
+                 block,
+                 ch_in,
+                 ch_out,
+                 count,
+                 name_adapter,
+                 stage_num,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(Blocks, self).__init__()
+
+        self.blocks = []
+        for i in range(count):
+            is_first = i == 0
+            layer_name = name_adapter.fix_layer_warp_name(stage_num, count, i)
+            unit = block(
+                ch_in=ch_in,
+                ch_out=ch_out,
+                stride=2 if is_first and stage_num != 2 else 1,
+                shortcut=not is_first,
+                variant=variant,
+                groups=groups,
+                base_width=base_width,
+                lr=lr,
+                norm_type=norm_type,
+                norm_decay=norm_decay,
+                freeze_norm=freeze_norm,
+                dcn_v2=dcn_v2,
+                std_senet=std_senet)
+            self.blocks.append(self.add_sublayer(layer_name, unit))
+            if is_first:
+                # later blocks consume the first block's expanded output
+                ch_in = ch_out * block.expansion
+
+    def forward(self, inputs):
+        """Feed ``inputs`` through every block in order and return the result."""
+        feat = inputs
+        for unit in self.blocks:
+            feat = unit(feat)
+        return feat
+
+
+@register
+@serializable
+class ResNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 depth=50,
+                 ch_in=64,
+                 variant='b',
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
+                 groups=1,
+                 base_width=64,
+                 norm_type='bn',
+                 norm_decay=0,
+                 freeze_norm=True,
+                 freeze_at=0,
+                 return_idx=[0, 1, 2, 3],
+                 dcn_v2_stages=[-1],
+                 num_stages=4,
+                 std_senet=False,
+                 freeze_stem_only=False):
+        """
+        Residual Network, see https://arxiv.org/abs/1512.03385
+
+        Args:
+            depth (int): ResNet depth, should be 18, 34, 50, 101, 152
+                (the keys of ``ResNet_cfg``).
+            ch_in (int): output channel of the stem, default 64
+            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently.
+                'c' and 'd' use a stem of three 3x3 convs, the others a
+                single 7x7 conv.
+            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
+                                 lower learning rate ratio is need for pretrained model
+                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
+                                 Must have length 4.
+            groups (int): group convolution cardinality
+            base_width (int): base width of each group convolution
+            norm_type (str): normalization type, 'bn' or 'sync_bn'
+                (the only values ConvNormLayer accepts)
+            norm_decay (float): weight decay for normalization layer weights
+            freeze_norm (bool): freeze normalization layers
+            freeze_at (int): when >= 0, the stem is frozen and, unless
+                ``freeze_stem_only`` is set, so are the stages with index
+                0..freeze_at. A negative value freezes nothing.
+            return_idx (list|int): index of the stages whose feature maps are returned
+            dcn_v2_stages (list|int): index of stages who select deformable conv v2
+            num_stages (int): total num of stages, between 1 and 4
+            std_senet (bool): whether use senet, default False.
+            freeze_stem_only (bool): with ``freeze_at >= 0``, freeze only the
+                stem and leave every residual stage trainable.
+        """
+        super(ResNet, self).__init__()
+        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
+        assert 1 <= num_stages <= 4
+        self.depth = depth
+        self.variant = variant
+        self.groups = groups
+        self.base_width = base_width
+        self.norm_type = norm_type
+        self.norm_decay = norm_decay
+        self.freeze_norm = freeze_norm
+        self.freeze_at = freeze_at
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+        assert max(return_idx) < num_stages, \
+            'the maximum return index must smaller than num_stages, ' \
+            'but received maximum return index is {} and num_stages ' \
+            'is {}'.format(max(return_idx), num_stages)
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        assert len(lr_mult_list) == 4, \
+            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
+
+        # Normalise and validate once (this check used to be duplicated).
+        if isinstance(dcn_v2_stages, Integral):
+            dcn_v2_stages = [dcn_v2_stages]
+        assert max(dcn_v2_stages) < num_stages
+        self.dcn_v2_stages = dcn_v2_stages
+
+        block_nums = ResNet_cfg[depth]
+        na = NameAdapter(self)
+
+        # Stem: each entry is [in_channels, out_channels, kernel, stride, name].
+        conv1_name = na.fix_c1_stage_name()
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, conv1_name]]
+        self.conv1 = nn.Sequential()
+        for (c_in, c_out, k, s, _name) in conv_def:
+            self.conv1.add_sublayer(
+                _name,
+                ConvNormLayer(
+                    ch_in=c_in,
+                    ch_out=c_out,
+                    filter_size=k,
+                    stride=s,
+                    groups=1,
+                    act='relu',
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=1.0))
+
+        self.ch_in = ch_in
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+
+        self._out_channels = [block.expansion * v for v in ch_out_list]
+        self._out_strides = [4, 8, 16, 32]
+
+        # Residual stages res2..res5; self.ch_in tracks the running channel
+        # count fed into the next stage.
+        self.res_layers = []
+        for i in range(num_stages):
+            lr_mult = lr_mult_list[i]
+            stage_num = i + 2
+            res_name = "res{}".format(stage_num)
+            res_layer = self.add_sublayer(
+                res_name,
+                Blocks(
+                    block,
+                    self.ch_in,
+                    ch_out_list[i],
+                    count=block_nums[i],
+                    name_adapter=na,
+                    stage_num=stage_num,
+                    variant=variant,
+                    groups=groups,
+                    base_width=base_width,
+                    lr=lr_mult,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    dcn_v2=(i in self.dcn_v2_stages),
+                    std_senet=std_senet))
+            self.res_layers.append(res_layer)
+            self.ch_in = self._out_channels[i]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            if not freeze_stem_only:
+                for i in range(min(freeze_at + 1, num_stages)):
+                    self._freeze_parameters(self.res_layers[i])
+
+    def _freeze_parameters(self, m):
+        """Set ``stop_gradient`` on every parameter of layer ``m``."""
+        for p in m.parameters():
+            p.stop_gradient = True
+
+    @property
+    def out_shape(self):
+        """ShapeSpec (channels, stride) of each stage listed in ``return_idx``."""
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        """Run the backbone on ``inputs['image']``.
+
+        Returns:
+            list: outputs of the stages whose index is in ``return_idx``,
+            in stage order.
+        """
+        x = inputs['image']
+        conv1 = self.conv1(x)
+        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
+
+
+@register
+class Res5Head(nn.Layer):
+    """Stage-5 residual blocks used as a head.
+
+    For ``depth >= 50`` it builds three BottleNeck blocks (1024 channels in,
+    2048 out); otherwise three BasicBlocks (256 in, 512 out).
+    """
+
+    def __init__(self, depth=50):
+        super(Res5Head, self).__init__()
+        feat_out = 512
+        feat_in = 256 if depth < 50 else 1024
+        na = NameAdapter(self)
+        block = BottleNeck if depth >= 50 else BasicBlock
+        self.res5 = Blocks(
+            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
+        # BottleNeck multiplies the output channels by its expansion of 4
+        self.feat_out = feat_out if depth < 50 else feat_out * 4
+
+    @property
+    def out_shape(self):
+        """Single ShapeSpec carrying the head's output channels and a stride of 16."""
+        spec = ShapeSpec(channels=self.feat_out, stride=16)
+        return [spec]
+
+    def forward(self, roi_feat, stage=0):
+        """Apply the res5 blocks to ``roi_feat``; ``stage`` is not used."""
+        return self.res5(roi_feat)
--- a/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
@@ -0,0 +1,250 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
+from paddle.nn.initializer import KaimingNormal
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+from ppdet.modeling.ops import channel_shuffle
+
+__all__ = ['ShuffleNetV2']
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D, then BatchNorm2D, then an optional activation.
+
+    ``act`` names a function in ``paddle.nn.functional``; the spelling
+    "hard_swish" is mapped to 'hardswish', and a falsy value disables the
+    activation.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+            bias_attr=False)
+
+        # the norm scale and shift use an L2 decay coefficient of 0
+        self._batch_norm = BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self.act = 'hardswish' if act == "hard_swish" else act
+
+    def forward(self, inputs):
+        """Return ``act(bn(conv(inputs)))``, skipping ``act`` when it is falsy."""
+        out = self._batch_norm(self._conv(inputs))
+        if not self.act:
+            return out
+        return getattr(F, self.act)(out)
+
+
+class InvertedResidual(nn.Layer):
+    """ShuffleNetV2 unit that transforms one half of the channels.
+
+    The input is split in two along axis 1. The second half goes through
+    1x1 conv -> depthwise 3x3 conv -> 1x1 conv, is concatenated with the
+    untouched first half, and the result is channel-shuffled with 2 groups.
+    """
+
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidual, self).__init__()
+        branch_ch = out_channels // 2
+        self._conv_pw = ConvBNLayer(
+            in_channels=in_channels // 2,
+            out_channels=branch_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # depthwise: one group per channel, no activation
+        self._conv_dw = ConvBNLayer(
+            in_channels=branch_ch,
+            out_channels=branch_ch,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=branch_ch,
+            act=None)
+        self._conv_linear = ConvBNLayer(
+            in_channels=branch_ch,
+            out_channels=branch_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        """Split, transform the second half, concatenate and shuffle."""
+        sections = [inputs.shape[1] // 2, inputs.shape[1] // 2]
+        kept, branch = paddle.split(inputs, num_or_sections=sections, axis=1)
+        branch = self._conv_linear(self._conv_dw(self._conv_pw(branch)))
+        merged = paddle.concat([kept, branch], axis=1)
+        return channel_shuffle(merged, 2)
+
+
+class InvertedResidualDS(nn.Layer):
+    """ShuffleNetV2 unit with two full-input branches.
+
+    Branch 1 is depthwise 3x3 conv -> 1x1 conv; branch 2 is 1x1 conv ->
+    depthwise 3x3 conv -> 1x1 conv. Each produces ``out_channels // 2``
+    channels; the two are concatenated and channel-shuffled with 2 groups.
+    """
+
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidualDS, self).__init__()
+        branch_ch = out_channels // 2
+
+        # branch1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None)
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=branch_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # branch2
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=branch_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=branch_ch,
+            out_channels=branch_ch,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=branch_ch,
+            act=None)
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=branch_ch,
+            out_channels=branch_ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        """Run both branches on ``inputs``, concatenate and shuffle."""
+        first = self._conv_linear_1(self._conv_dw_1(inputs))
+        second = self._conv_pw_2(inputs)
+        second = self._conv_linear_2(self._conv_dw_2(second))
+        merged = paddle.concat([first, second], axis=1)
+
+        return channel_shuffle(merged, 2)
+
+
+@register
+@serializable
+class ShuffleNetV2(nn.Layer):
+    """ShuffleNetV2 backbone: a stem conv, a max pool, then three stages of 4, 8 and 4 units.
+
+    Args:
+        scale (float): width setting; one of 0.25, 0.33, 0.5, 1.0, 1.5, 2.0.
+        act (str): activation name handed to every ConvBNLayer that uses one.
+        feature_maps (list|int): 1-based indices of the layers whose output is
+            returned. The stem conv is index 1, and the k-th unit (counted
+            across all stages from 1) is index k + 1.
+
+    Raises:
+        NotImplementedError: if ``scale`` is not one of the supported values.
+    """
+
+    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
+        super(ShuffleNetV2, self).__init__()
+        self.scale = scale
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        stage_repeats = [4, 8, 4]
+
+        # (scale, channel table); the table is indexed by stage number
+        channel_tables = (
+            (0.25, [-1, 24, 24, 48, 96, 512]),
+            (0.33, [-1, 24, 32, 64, 128, 512]),
+            (0.5, [-1, 24, 48, 96, 192, 1024]),
+            (1.0, [-1, 24, 116, 232, 464, 1024]),
+            (1.5, [-1, 24, 176, 352, 704, 1024]),
+            (2.0, [-1, 24, 244, 488, 976, 2048]), )
+        for known_scale, table in channel_tables:
+            if scale == known_scale:
+                stage_out_channels = table
+                break
+        else:
+            raise NotImplementedError("This scale size:[" + str(scale) +
+                                      "] is not implemented!")
+        self._out_channels = []
+        self._feature_idx = 0
+        # 1. conv1
+        self._conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act)
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self._feature_idx += 1
+
+        # 2. bottleneck sequences
+        self._block_list = []
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            stage_in = stage_out_channels[stage_id + 1]
+            stage_out = stage_out_channels[stage_id + 2]
+            for i in range(num_repeat):
+                if i == 0:
+                    # the first unit of a stage changes width and uses stride 2
+                    unit = InvertedResidualDS(
+                        in_channels=stage_in,
+                        out_channels=stage_out,
+                        stride=2,
+                        act=act)
+                else:
+                    unit = InvertedResidual(
+                        in_channels=stage_out,
+                        out_channels=stage_out,
+                        stride=1,
+                        act=act)
+                block = self.add_sublayer(
+                    name=str(stage_id + 2) + '_' + str(i + 1), sublayer=unit)
+                self._block_list.append(block)
+                self._feature_idx += 1
+                self._update_out_channels(stage_out, self._feature_idx,
+                                          self.feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record ``channel`` when ``feature_idx`` is one of the requested maps."""
+        if feature_idx not in feature_maps:
+            return
+        self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Run the backbone on ``inputs['image']`` and return the selected feature maps."""
+        feat = self._max_pool(self._conv1(inputs['image']))
+        outs = []
+        for pos, unit in enumerate(self._block_list):
+            feat = unit(feat)
+            # unit at list position pos has layer index pos + 2 (the stem is 1)
+            if pos + 2 in self.feature_maps:
+                outs.append(feat)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        """One ShapeSpec per recorded output channel count."""
+        return [ShapeSpec(channels=ch) for ch in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
@@ -0,0 +1,752 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+The copyright of microsoft/Swin-Transformer is as follows:
+MIT License [see LICENSE for details]
+"""
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+from .transformer_utils import DropPath, Identity
+from .transformer_utils import add_parameter, to_2tuple
+from .transformer_utils import ones_, zeros_, trunc_normal_
+
+__all__ = ['SwinTransformer']
+
+MODEL_cfg = {
+    # Per-arch presets; the 22k-to-1k fine-tuned weights are the default pretrained checkpoint and can be replaced via SwinTransformer.pretrained in the config.
+    'swin_T_224': dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_S_224': dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_B_224': dict(
+        pretrain_img_size=224,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_L_224': dict(
+        pretrain_img_size=224,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_B_384': dict(
+        pretrain_img_size=384,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
+    ),
+    'swin_L_384': dict(
+        pretrain_img_size=384,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
+    ),
+}
+
+
+class Mlp(nn.Layer):  # feed-forward block: Linear -> activation -> dropout -> Linear -> dropout
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features  # unset -> same width as the input
+        hidden_features = hidden_features or in_features  # unset -> same width as the input
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()  # act_layer is a class/factory, instantiated here
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)  # one dropout layer, reused after both linears
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """ Split a feature map into non-overlapping window_size x window_size windows.
+    Args:
+        x: (B, H, W, C); H and W are floor-divided by window_size, so callers pad them to multiples first
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.reshape(
+        [-1, H // window_size, window_size, W // window_size, window_size, C])
+    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [-1, window_size, window_size, C])  # the two window-grid axes are folded into the leading axis
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """ Merge windows back into a feature map (inverse of window_partition).
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    _, _, _, C = windows.shape
+    # The batch size is left to the -1 in the reshapes, so no explicit B is computed.
+    x = windows.reshape(
+        [-1, H // window_size, W // window_size, window_size, window_size, C])
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
+    return x
+
+
+class WindowAttention(nn.Layer):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5  # a falsy qk_scale (None or 0) selects the default
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = add_parameter(
+            self,
+            paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                          num_heads)))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(self.window_size[0])
+        coords_w = paddle.arange(self.window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+        coords_flatten_1 = coords_flatten.unsqueeze(axis=2)  # 2, Wh*Ww, 1
+        coords_flatten_2 = coords_flatten.unsqueeze(axis=1)  # 2, 1, Wh*Ww
+        relative_coords = coords_flatten_1 - coords_flatten_2  # 2, Wh*Ww, Wh*Ww (broadcast difference)
+        relative_coords = relative_coords.transpose(
+            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[
+            0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1  # row offset scaled so row/col pairs map to unique flat indices
+        self.relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)  # single projection producing q, k and v
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table)
+        self.softmax = nn.Softmax(axis=-1)
+
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: additive mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape
+        qkv = self.qkv(x).reshape(
+            [-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
+                [2, 0, 3, 1, 4])  # 3, B_, nH, N, C // nH
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))  # B_, nH, N, N
+
+        index = self.relative_position_index.flatten()
+
+        relative_position_bias = paddle.index_select(
+            self.relative_position_bias_table, index)  # one table row per (query, key) pair
+        relative_position_bias = relative_position_bias.reshape([
+            self.window_size[0] * self.window_size[1],
+            self.window_size[0] * self.window_size[1], -1
+        ])  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.transpose(
+            [2, 0, 1])  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)  # same bias for every window in the batch
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.reshape([-1, nW, self.num_heads, N, N
+                                 ]) + mask.unsqueeze(1).unsqueeze(0)  # mask becomes 1, nW, 1, N, N and broadcasts
+            attn = attn.reshape([-1, self.num_heads, N, N])
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        # x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
+        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])  # heads merged back into C
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Layer):
+    """ Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=7,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()  # no-op when the rate is 0
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+
+        self.H = None  # spatial size is injected by the caller (BasicLayer.forward) before each call
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Not parameters; the spatial resolution is read from self.H and self.W.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.reshape([-1, H, W, C])
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
+                  data_format='NHWC')  # pad_l/pad_t are always 0; pad_b and pad_r come from H and W above
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = paddle.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None  # the mask is only used by shifted blocks
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.reshape(
+            [x_windows.shape[0], self.window_size * self.window_size,
+             C])  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.reshape(
+            [x_windows.shape[0], self.window_size, self.window_size, C])
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp,
+                                   Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = paddle.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                axis=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :]  # crop the padding back off
+
+        x = x.reshape([-1, H * W, C])
+
+        # FFN
+        x = shortcut + self.drop_path(x)  # residual around the attention branch
+        x = x + self.drop_path(self.mlp(self.norm2(x)))  # residual around the MLP branch
+
+        return x
+
+
+class PatchMerging(nn.Layer):
+    r""" Patch Merging Layer: concatenates each 2x2 neighbourhood and projects 4*dim -> 2*dim.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+        self.norm = norm_layer(4 * dim)  # applied to the concatenated features, before reduction
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        x = x.reshape([-1, H, W, C])
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            # paddle F.pad default data_format is 'NCHW'
+            x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')
+            H += H % 2  # round odd sizes up to even
+            W += W % 2
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = paddle.concat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)  # B H/2*W/2 2*C
+
+        return x
+
+
+class BasicLayer(nn.Layer):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of input channels.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): Local window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | list[float] | tuple[float] | np.ndarray, optional): Stochastic depth rate, one value or one per block. Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+
+        # build blocks; even-indexed blocks are unshifted, odd-indexed blocks use the shifted window
+        self.blocks = nn.LayerList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i]
+                if isinstance(drop_path, (list, tuple, np.ndarray)) else drop_path,
+                norm_layer=norm_layer) for i in range(depth)
+        ])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns: (x, H, W, x_next, H_next, W_next); the *_next values equal the first three when there is no downsample.
+        """
+
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt  # label each of the 3x3 regions with its own id
+
+                cnt += 1
+
+        mask_windows = window_partition(
+            img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.reshape(
+            [-1, self.window_size * self.window_size])
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)  # non-zero where two positions have different ids
+        huns = -100.0 * paddle.ones_like(attn_mask)
+        attn_mask = huns * (attn_mask != 0).astype("float32")  # -100 for cross-region pairs, 0 otherwise
+
+        for blk in self.blocks:
+            blk.H, blk.W = H, W  # blocks read their spatial size from these attributes
+            x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2  # ceil division: the downsample layer is handed odd sizes too
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv2D(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)  # kernel == stride: non-overlapping patches
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        # TODO # export dynamic shape
+        B, C, H, W = x.shape
+        # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1])
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])  # amount that makes W a multiple of patch_size[1]
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])  # amount that makes H a multiple of patch_size[0]
+
+        x = self.proj(x)
+        if self.norm is not None:
+            _, _, Wh, Ww = x.shape
+            x = x.flatten(2).transpose([0, 2, 1])  # to (B, Wh*Ww, C) so the norm sees channels last
+            x = self.norm(x)
+            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])  # back to (B, embed_dim, Wh, Ww)
+
+        return x
+
+
+@register
+@serializable
+class SwinTransformer(nn.Layer):
+    """ Swin Transformer backbone
+    Args:
+        arch (str): Key of MODEL_cfg selecting the Swin Transformer variant. Default: 'swin_T_224'
+        pretrain_img_size (int | tuple(int)): Input image size. Overridden by MODEL_cfg[arch]
+        patch_size (int | tuple(int)): Patch size. Default: 4
+        in_chans (int): Number of input image channels. Default: 3
+        embed_dim (int): Patch embedding dimension. Overridden by MODEL_cfg[arch]
+        depths (tuple(int)): Depth of each Swin Transformer layer. Overridden by MODEL_cfg[arch]
+        num_heads (tuple(int)): Number of attention heads in different layers. Overridden by MODEL_cfg[arch]
+        window_size (int): Window size. Overridden by MODEL_cfg[arch]
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
+        drop_rate (float): Dropout rate. Default: 0
+        attn_drop_rate (float): Attention dropout rate. Default: 0
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True
+    """
+
+    def __init__(self,
+                 arch='swin_T_224',
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),  # indices of the stages whose features forward() returns
+                 frozen_stages=-1,  # see _freeze_stages; a negative value freezes nothing
+                 pretrained=None):  # URL or local path; None/empty falls back to MODEL_cfg[arch]['pretrained'] only when None
+        super(SwinTransformer, self).__init__()
+        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
+
+        # The preset always wins over the corresponding constructor arguments.
+        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
+        embed_dim = MODEL_cfg[arch]['embed_dim']
+        depths = MODEL_cfg[arch]['depths']
+        num_heads = MODEL_cfg[arch]['num_heads']
+        window_size = MODEL_cfg[arch]['window_size']
+        if pretrained is None:
+            pretrained = MODEL_cfg[arch]['pretrained']
+
+        self.num_layers = len(depths)
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1]
+            ]
+
+            self.absolute_pos_embed = add_parameter(
+                self,
+                paddle.zeros((1, embed_dim, patches_resolution[0],
+                              patches_resolution[1])))
+            trunc_normal_(self.absolute_pos_embed)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = np.linspace(0, drop_path_rate,
+                          sum(depths))  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.LayerList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging
+                if (i_layer < self.num_layers - 1) else None)
+            self.layers.append(layer)
+
+        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_sublayer(layer_name, layer)
+
+        self.apply(self._init_weights)
+        self._freeze_stages()
+        if pretrained:
+            if pretrained.startswith(('http://', 'https://')):  # URL; a scheme check avoids misreading local paths containing "http"
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:  #model in local path
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.stop_gradient = True
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):  # layers 0 .. frozen_stages - 2
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.stop_gradient = True
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if m.bias is not None:  # the enclosing branch already guarantees nn.Linear
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x['image'])  # x is indexed by the 'image' key
+        B, _, Wh, Ww = x.shape
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
+        else:
+            x = x.flatten(2).transpose([0, 2, 1])
+        x = self.pos_drop(x)
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)  # x_out/H/W: this stage's output; x/Wh/Ww: input to the next stage
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
+                    (0, 3, 1, 2))  # back to channels-first
+                outs.append(out)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        out_strides = [4, 8, 16, 32]  # hard-coded per-stage strides
+        return [
+            ShapeSpec(
+                channels=self.num_features[i], stride=out_strides[i])
+            for i in self.out_indices
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import ReLU, Swish, GELU
+import math
+
+from ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['TransEncoder']
+
+
+class BertEmbeddings(nn.Layer):
+    """Sum of word, position and token-type embeddings, then LayerNorm + dropout.
+
+    Args:
+        word_size (int): number of rows of the word embedding table.
+        position_embeddings_size (int): number of rows of the position table.
+        word_type_size (int): number of rows of the token-type table.
+        hidden_size (int): width of every embedding vector.
+        dropout_prob (float): dropout probability applied to the output.
+    """
+
+    def __init__(self, word_size, position_embeddings_size, word_type_size,
+                 hidden_size, dropout_prob):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(
+            word_size, hidden_size, padding_idx=0)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x, token_type_ids=None, position_ids=None):
+        """Embed the ids in ``x``.
+
+        Args:
+            x (Tensor): integer ids used to index the word embedding table;
+                axis 1 is taken as the sequence length.
+            token_type_ids (Tensor, optional): ids for the token-type table.
+                Defaults to all zeros with the shape of ``x``.
+            position_ids (Tensor, optional): ids for the position table.
+                Defaults to 0..seq_len-1 expanded to the shape of ``x``.
+
+        Returns:
+            Tensor: normalized, dropped-out sum of the three embeddings.
+        """
+        seq_len = paddle.shape(x)[1]
+        if position_ids is None:
+            position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
+        if token_type_ids is None:
+            # paddle.zeros defaults to a floating-point dtype, which cannot be
+            # used as an embedding index; request integer ids explicitly.
+            token_type_ids = paddle.zeros(paddle.shape(x), dtype="int64")
+
+        word_embs = self.word_embeddings(x)
+        position_embs = self.position_embeddings(position_ids)
+        token_type_embs = self.token_type_embeddings(token_type_ids)
+
+        embs_cmb = word_embs + position_embs + token_type_embs
+        embs_out = self.layernorm(embs_cmb)
+        embs_out = self.dropout(embs_out)
+        return embs_out
+
+
+class BertSelfAttention(nn.Layer):
+    """Multi-head scaled dot-product self-attention.
+
+    Args:
+        hidden_size (int): width of the input features; must be divisible by
+            ``num_attention_heads``.
+        num_attention_heads (int): number of attention heads.
+        attention_probs_dropout_prob (float): dropout probability applied to
+            the attention probabilities.
+        output_attentions (bool): when True, ``forward`` also returns the
+            attention probabilities.
+
+    Raises:
+        ValueError: if ``hidden_size`` is not a multiple of
+            ``num_attention_heads``.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 output_attentions=False):
+        super(BertSelfAttention, self).__init__()
+        if hidden_size % num_attention_heads != 0:
+            # str.format is required here: the message uses "{}" placeholders
+            # and contains a literal "%", which the "%" operator cannot parse.
+            raise ValueError(
+                "The hidden_size must be a multiple of the number of attention "
+                "heads, but got {} % {} != 0".format(hidden_size,
+                                                     num_attention_heads))
+
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = int(hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+        self.output_attentions = output_attentions
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Attend over the sequence axis of ``x``.
+
+        Args:
+            x (Tensor): input features; the last axis is projected by the
+                query/key/value layers and the two leading axes are kept.
+            attention_mask (Tensor): additive mask, summed onto the scaled
+                attention scores before the softmax.
+            head_mask (Tensor, optional): multiplicative mask applied to the
+                attention probabilities after dropout.
+
+        Returns:
+            tuple: ``(context,)``, or ``(context, attention_probs)`` when
+                ``output_attentions`` is set.
+        """
+        query = self.query(x)
+        key = self.key(x)
+        value = self.value(x)
+
+        query_dim1, query_dim2 = paddle.shape(query)[:-1]
+        new_shape = [
+            query_dim1, query_dim2, self.num_attention_heads,
+            self.attention_head_size
+        ]
+        # Split the feature axis into heads; the key is laid out already
+        # transposed so that query @ key yields the score matrix directly.
+        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
+        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+
+        attention = paddle.matmul(query,
+                                  key) / math.sqrt(self.attention_head_size)
+        attention = attention + attention_mask
+        attention_value = F.softmax(attention, axis=-1)
+        attention_value = self.dropout(attention_value)
+
+        if head_mask is not None:
+            attention_value = attention_value * head_mask
+
+        # Move the head axis back next to the per-head features and merge them.
+        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
+                                                                        3))
+        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
+        new_context_shape = [
+            ctx_dim1,
+            ctx_dim2,
+            self.all_head_size,
+        ]
+        context = context.reshape(new_context_shape)
+
+        if self.output_attentions:
+            return (context, attention_value)
+        else:
+            return (context, )
+
+
+class BertAttention(nn.Layer):
+    """Self-attention followed by a linear projection, dropout and a
+    residual LayerNorm.
+
+    Args:
+        hidden_size (int): width of the input and output features.
+        num_attention_heads (int): number of attention heads.
+        attention_probs_dropout_prob (float): dropout probability inside the
+            self-attention.
+        fc_dropout_prob (float): dropout probability after the projection.
+        output_attentions (bool): forwarded to the self-attention layer.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 output_attentions=False):
+        super(BertAttention, self).__init__()
+        self.bert_selfattention = BertSelfAttention(
+            hidden_size, num_attention_heads, attention_probs_dropout_prob,
+            output_attentions)
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Return ``(features,)`` or ``(features, attention_probs)``.
+
+        The second element is present only when the self-attention layer
+        returned attention probabilities.
+        """
+        attn_outputs = self.bert_selfattention(x, attention_mask, head_mask)
+        projected = self.dropout(self.fc(attn_outputs[0]))
+        # residual connection around the attention sub-layer
+        normed = self.layernorm(projected + x)
+        if len(attn_outputs) != 2:
+            return (normed, )
+        return (normed, attn_outputs[1])
+
+
+class BertFeedForward(nn.Layer):
+    """Two-layer feed-forward sub-layer with dropout and a residual LayerNorm.
+
+    Args:
+        hidden_size (int): width of the input and output features.
+        intermediate_size (int): width of the hidden projection.
+        num_attention_heads (int): unused here; kept for signature parity
+            with the other sub-layers.
+        attention_probs_dropout_prob (float): unused here; kept for signature
+            parity.
+        fc_dropout_prob (float): dropout probability after the second
+            projection.
+        act_fn (str): name of the activation, resolved in this module's
+            namespace (for example ``'ReLU'``, ``'Swish'``, ``'GELU'`` or the
+            module-level ``'gelu'`` function).
+        output_attentions (bool): unused here; kept for signature parity.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertFeedForward, self).__init__()
+        self.fc1 = nn.Linear(hidden_size, intermediate_size)
+        # NOTE(review): eval() on a configuration string executes arbitrary
+        # code; act_fn must only ever come from trusted configs.
+        activation = eval(act_fn)
+        if isinstance(activation, type):
+            # Names such as 'ReLU' resolve to a layer class. Calling the class
+            # on a tensor would construct a layer instead of applying it, so
+            # instantiate it once here.
+            activation = activation()
+        self.act_fn = activation
+        self.fc2 = nn.Linear(intermediate_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x):
+        """Apply fc1 -> activation -> fc2 -> dropout, then LayerNorm(out + x)."""
+        features = self.fc1(x)
+        features = self.act_fn(features)
+        features = self.fc2(features)
+        features = self.dropout(features)
+        features = self.layernorm(features + x)
+        return features
+
+
+class BertLayer(nn.Layer):
+    """One transformer layer: attention sub-layer followed by feed-forward.
+
+    Args:
+        hidden_size (int): width of the input and output features.
+        intermediate_size (int): hidden width of the feed-forward sub-layer.
+        num_attention_heads (int): number of attention heads.
+        attention_probs_dropout_prob (float): dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): dropout probability after the linear
+            projections of both sub-layers.
+        act_fn (str): activation name forwarded to ``BertFeedForward``.
+        output_attentions (bool): when True, ``forward`` also returns the
+            attention probabilities.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertLayer, self).__init__()
+        # BertAttention takes fc_dropout_prob as its fourth positional
+        # argument; it must be passed explicitly so that output_attentions
+        # does not land in the dropout slot.
+        self.attention = BertAttention(hidden_size, num_attention_heads,
+                                       attention_probs_dropout_prob,
+                                       fc_dropout_prob, output_attentions)
+        self.feed_forward = BertFeedForward(
+            hidden_size, intermediate_size, num_attention_heads,
+            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+            output_attentions)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Return ``(features,)`` or ``(features, attention_probs)``."""
+        attention_feats = self.attention(x, attention_mask, head_mask)
+        features = self.feed_forward(attention_feats[0])
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        else:
+            return (features, )
+
+
+class BertEncoder(nn.Layer):
+    """Stack of ``num_hidden_layers`` identical ``BertLayer`` modules.
+
+    Args:
+        num_hidden_layers (int): number of stacked layers.
+        hidden_size (int): width of the features.
+        intermediate_size (int): hidden width of each feed-forward sub-layer.
+        num_attention_heads (int): number of attention heads per layer.
+        attention_probs_dropout_prob (float): dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): dropout probability after linear projections.
+        act_fn (str): activation name forwarded to every layer.
+        output_attentions (bool): also return per-layer attention
+            probabilities.
+        output_hidden_feats (bool): also return the features entering the
+            stack and leaving every layer.
+    """
+
+    def __init__(self,
+                 num_hidden_layers,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(BertEncoder, self).__init__()
+        self.output_attentions = output_attentions
+        self.output_hidden_feats = output_hidden_feats
+        self.layers = nn.LayerList([
+            BertLayer(hidden_size, intermediate_size, num_attention_heads,
+                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+                      output_attentions) for _ in range(num_hidden_layers)
+        ])
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Run every layer in order.
+
+        Args:
+            x (Tensor): input features.
+            attention_mask (Tensor): additive attention mask shared by all
+                layers.
+            head_mask (sequence, optional): one entry per layer, indexed by
+                layer position; ``None`` disables head masking.
+
+        Returns:
+            tuple: ``(last_features,)``, extended with the tuple of hidden
+                features when ``output_hidden_feats`` is set and with the
+                tuple of attention probabilities when ``output_attentions``
+                is set, in that order.
+        """
+        # Starts with the input; each layer's output is appended below, so the
+        # tuple ends with the final features and contains no duplicates.
+        all_features = (x, )
+        all_attentions = ()
+
+        for i, layer in enumerate(self.layers):
+            mask = head_mask[i] if head_mask is not None else None
+            layer_out = layer(x, attention_mask, mask)
+
+            x = layer_out[0]
+            if self.output_hidden_feats:
+                all_features = all_features + (x, )
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_out[1], )
+
+        outputs = (x, )
+        if self.output_hidden_feats:
+            outputs += (all_features, )
+        if self.output_attentions:
+            outputs += (all_attentions, )
+        return outputs
+
+
+class BertPooler(nn.Layer):
+    """Pool a sequence by projecting its first token through Linear + Tanh.
+
+    Args:
+        hidden_size (int): width of the input and pooled features.
+    """
+
+    def __init__(self, hidden_size):
+        super(BertPooler, self).__init__()
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.act = nn.Tanh()
+
+    def forward(self, x):
+        """Return ``tanh(fc(x[:, 0]))``."""
+        return self.act(self.fc(x[:, 0]))
+
+
+class METROEncoder(nn.Layer):
+    """Transformer encoder over a sequence of feature vectors.
+
+    The input features are projected to ``hidden_size``, summed with a
+    learnable position embedding, passed through a ``BertEncoder`` and mapped
+    to ``output_feature_dim``; a linear projection of the raw input is added
+    to the result as a residual.
+
+    Args:
+        vocab_size (int): size of the (otherwise unused) word embedding table
+            held in ``self.embedding``.
+        num_hidden_layers (int): number of encoder layers.
+        features_dims (int): width of the input features.
+        position_embeddings_size (int): number of learnable positions.
+        hidden_size (int): width of the encoder features.
+        intermediate_size (int): hidden width of the feed-forward sub-layers.
+        output_feature_dim (int): width of the output features.
+        num_attention_heads (int): number of attention heads.
+        attention_probs_dropout_prob (float): dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): dropout probability after linear projections.
+        act_fn (str): activation name forwarded to the encoder.
+        output_attentions (bool): ask the encoder for attention probabilities.
+        output_hidden_feats (bool): ask the encoder for hidden features.
+        use_img_layernorm (bool): apply a LayerNorm to the summed embeddings
+            before dropout.
+    """
+
+    def __init__(self,
+                 vocab_size,
+                 num_hidden_layers,
+                 features_dims,
+                 position_embeddings_size,
+                 hidden_size,
+                 intermediate_size,
+                 output_feature_dim,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False,
+                 use_img_layernorm=False):
+        super(METROEncoder, self).__init__()
+        self.img_dims = features_dims
+        self.num_hidden_layers = num_hidden_layers
+        self.use_img_layernorm = use_img_layernorm
+        self.output_attentions = output_attentions
+        # forward() reads this flag, so it has to live on the instance.
+        self.output_hidden_feats = output_hidden_feats
+        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
+                                        hidden_size, fc_dropout_prob)
+        self.encoder = BertEncoder(
+            num_hidden_layers, hidden_size, intermediate_size,
+            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
+            act_fn, output_attentions, output_hidden_feats)
+        self.pooler = BertPooler(hidden_size)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.img_embedding = nn.Linear(
+            features_dims, hidden_size, bias_attr=True)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
+        self.residual = nn.Linear(features_dims, output_feature_dim)
+        if use_img_layernorm:
+            # forward() applies self.layernorm when this option is on, so the
+            # layer must exist. Created last so the construction order of the
+            # pre-existing layers is unchanged.
+            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+
+        self.apply(self.init_weights)
+
+    def init_weights(self, module):
+        """Initialize the weights of one sub-layer.
+
+        Linear and Embedding weights are drawn from N(0, 0.02); LayerNorm is
+        reset to weight 1 / bias 0; Linear biases are zeroed.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.set_value(
+                paddle.normal(
+                    mean=0.0, std=0.02, shape=module.weight.shape))
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+            module.weight.set_value(
+                paddle.full(
+                    shape=module.weight.shape, fill_value=1.0))
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+
+    def forward(self, x):
+        """Encode a batch of feature sequences.
+
+        Args:
+            x (Tensor): input features; the first two axes are read as batch
+                size and sequence length, the last axis is projected by
+                ``img_embedding`` and ``residual``.
+
+        Returns:
+            Tensor or tuple: the predicted features, or
+                ``(pred, encoder_outputs[1], encoder_outputs[-1])`` when both
+                ``output_attentions`` and ``output_hidden_feats`` are set.
+        """
+        batchsize, seq_len = paddle.shape(x)[:2]
+        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
+        position_ids = paddle.arange(
+            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
+
+        # All positions are attended to: the mask is all ones, so the additive
+        # form computed below is all zeros.
+        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
+        head_mask = [None] * self.num_hidden_layers
+
+        position_embs = self.position_embeddings(position_ids)
+        attention_mask = (1.0 - attention_mask) * -10000.0
+
+        img_features = self.img_embedding(x)
+
+        # We empirically observe that adding an additional learnable position embedding leads to more stable training
+        embeddings = position_embs + img_features
+        if self.use_img_layernorm:
+            embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        encoder_outputs = self.encoder(
+            embeddings, attention_mask, head_mask=head_mask)
+
+        pred_score = self.cls_head(encoder_outputs[0])
+        res_img_feats = self.residual(x)
+        pred_score = pred_score + res_img_feats
+
+        if self.output_attentions and self.output_hidden_feats:
+            return pred_score, encoder_outputs[1], encoder_outputs[-1]
+        else:
+            return pred_score
+
+
+def gelu(x):
+    """Exact (erf-based) GELU activation.
+
+    Reference: https://arxiv.org/abs/1606.08415
+
+    Args:
+        x (Tensor): input tensor.
+
+    Returns:
+        Tensor: ``x * 0.5 * (1 + erf(x / sqrt(2)))``, element-wise.
+    """
+    half_x = x * 0.5
+    erf_term = paddle.erf(x / math.sqrt(2.0))
+    return half_x * (1.0 + erf_term)
+
+
+@register
+class TransEncoder(nn.Layer):
+    """Chain of ``METROEncoder`` stages applied one after another.
+
+    Stage ``i`` maps ``input_feat_dim[i]`` features to
+    ``input_feat_dim[i + 1]`` features, and the last stage maps to 3, using
+    ``hidden_feat_dim[i]`` as its internal width.
+
+    Args:
+        vocab_size (int): forwarded to every stage.
+        num_hidden_layers (int): encoder layers per stage.
+        num_attention_heads (int): attention heads per layer; must divide
+            every entry of ``hidden_feat_dim``.
+        position_embeddings_size (int): number of learnable positions.
+        intermediate_size (int): hidden width of the feed-forward sub-layers.
+        input_feat_dim (list): input width of each stage.
+        hidden_feat_dim (list): hidden width of each stage.
+        attention_probs_dropout_prob (float): dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): dropout probability after linear projections.
+        act_fn (str): activation name forwarded to every stage.
+        output_attentions (bool): forwarded to every stage.
+        output_hidden_feats (bool): forwarded to every stage.
+    """
+
+    def __init__(self,
+                 vocab_size=30522,
+                 num_hidden_layers=4,
+                 num_attention_heads=4,
+                 position_embeddings_size=512,
+                 intermediate_size=3072,
+                 input_feat_dim=[2048, 512, 128],
+                 hidden_feat_dim=[1024, 256, 128],
+                 attention_probs_dropout_prob=0.1,
+                 fc_dropout_prob=0.1,
+                 act_fn='gelu',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(TransEncoder, self).__init__()
+        # each stage feeds the next one; the final stage emits 3 values
+        output_feat_dim = input_feat_dim[1:] + [3]
+        stages = []
+        for i, output_feature_dim in enumerate(output_feat_dim):
+            features_dims = input_feat_dim[i]
+            hidden_size = hidden_feat_dim[i]
+
+            assert hidden_size % num_attention_heads == 0
+            stages.append(
+                METROEncoder(vocab_size, num_hidden_layers, features_dims,
+                             position_embeddings_size, hidden_size,
+                             intermediate_size, output_feature_dim,
+                             num_attention_heads,
+                             attention_probs_dropout_prob, fc_dropout_prob,
+                             act_fn, output_attentions, output_hidden_feats))
+        self.trans_encoder = paddle.nn.Sequential(*stages)
+
+    def forward(self, x):
+        """Feed ``x`` through all stages and return the last stage's output."""
+        return self.trans_encoder(x)
--- a/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn.initializer import TruncatedNormal, Constant, Assign
+
+# Common initializations
+# Shared initializer instances, created once at import time so that other
+# modules can import them by name.
+# ones_: constant initializer with value 1.
+ones_ = Constant(value=1.)
+# zeros_: constant initializer with value 0.
+zeros_ = Constant(value=0.)
+# trunc_normal_: truncated-normal initializer with standard deviation 0.02.
+trunc_normal_ = TruncatedNormal(std=.02)
+
+
+# Common Layers
+def drop_path(x, drop_prob=0., training=False):
+    """
+        Stochastic depth: randomly zero whole samples of ``x`` (when applied in
+        the main path of residual blocks) and rescale the survivors.
+        The name 'Drop Connect' is sometimes used for this, but that is a
+        different form of dropout from a separate paper.
+        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+
+        Args:
+            x (Tensor): input; axis 0 is the per-sample axis.
+            drop_prob (float): probability of dropping a sample.
+            training (bool): the input is returned unchanged when False.
+        Returns:
+            Tensor: ``x`` unchanged when disabled, otherwise ``x / keep_prob``
+            multiplied by a 0/1 mask with one entry per sample.
+    """
+    if not training or drop_prob == 0.:
+        return x
+    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    # one mask value per sample, broadcast over every remaining axis
+    batch_dim = paddle.shape(x)[0]
+    mask_shape = (batch_dim, ) + (1, ) * (x.ndim - 1)
+    noise = keep_prob + paddle.rand(mask_shape, dtype=x.dtype)
+    keep_mask = paddle.floor(noise)  # binarize
+    return x.divide(keep_prob) * keep_mask
+
+
+class DropPath(nn.Layer):
+    """Layer wrapper around ``drop_path`` (stochastic depth).
+
+    Args:
+        drop_prob (float, optional): probability of dropping a sample.
+            ``None`` (the default) disables dropping.
+    """
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        """Apply ``drop_path`` using this layer's training flag."""
+        # The default drop_prob is None; drop_path computes 1 - drop_prob in
+        # training mode, so map None to "no drop" before delegating.
+        drop_prob = 0. if self.drop_prob is None else self.drop_prob
+        return drop_path(x, drop_prob, self.training)
+
+
+class Identity(nn.Layer):
+    """Pass-through layer: ``forward`` returns its argument unchanged."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        """Return ``input`` as is."""
+        return input
+
+
+# common funcs
+
+
+def to_2tuple(x):
+    """Return ``x`` unchanged if it is a list or tuple, else ``(x, x)``.
+
+    Note that an existing list or tuple is returned as is, whatever its
+    length or type.
+    """
+    return x if isinstance(x, (list, tuple)) else (x, x)
+
+
+def add_parameter(layer, datas, name=None):
+    """Create a parameter on ``layer`` initialized from ``datas``.
+
+    Args:
+        layer: object providing ``create_parameter`` and ``add_parameter``.
+        datas: initial values; its ``shape`` becomes the parameter shape.
+        name (str, optional): when truthy, the parameter is also registered
+            on ``layer`` under this name.
+
+    Returns:
+        The newly created parameter.
+    """
+    param = layer.create_parameter(
+        shape=datas.shape, default_initializer=Assign(datas))
+    if not name:
+        return param
+    layer.add_parameter(name, param)
+    return param
+
+
+def window_partition(x, window_size):
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+        (num_h, num_w): number of windows along the height and width
+    """
+    B, H, W, C = paddle.shape(x)
+
+    # Amount needed to round H and W up to a multiple of window_size
+    # (0 when already aligned).
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    # The tensor is moved to channel-first for F.pad and back afterwards.
+    # NOTE(review): the pad list is presumably [left, right, top, bottom],
+    # i.e. padding is added on the right/bottom only; confirm against F.pad.
+    x = F.pad(x.transpose([0, 3, 1, 2]),
+              paddle.to_tensor(
+                  [0, int(pad_w), 0, int(pad_h)],
+                  dtype='int32')).transpose([0, 2, 3, 1])
+    Hp, Wp = H + pad_h, W + pad_w
+
+    num_h, num_w = Hp // window_size, Wp // window_size
+
+    # Split both spatial axes into (window index, offset inside window), then
+    # bring the two window-index axes together and fold them into the batch.
+    x = x.reshape([B, num_h, window_size, num_w, window_size, C])
+    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [-1, window_size, window_size, C])
+    return windows, (Hp, Wp), (num_h, num_w)
+
+
+def window_unpartition(x, pad_hw, num_hw, hw):
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        num_hw (Tuple): number of windows along height and width (num_h, num_w).
+        hw (Tuple): original height and width (H, W) before padding.
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    Hp, Wp = pad_hw
+    num_h, num_w = num_hw
+    H, W = hw
+    B, window_size, _, C = paddle.shape(x)
+    # The leading axis holds batch * windows; recover the real batch size.
+    B = B // (num_h * num_w)
+    x = x.reshape([B, num_h, num_w, window_size, window_size, C])
+    # Interleave window index and in-window offset again, then merge each
+    # pair back into the padded spatial axes.
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])
+
+    # Crop away the padding.
+    return x[:, :H, :W, :]
--- a/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
@@ -0,0 +1,652 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+from paddle.nn.initializer import Constant
+
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+
+from .transformer_utils import zeros_, DropPath, Identity
+
+
+class Mlp(nn.Layer):
+    """Two-layer perceptron with an activation and dropout after each layer.
+
+    Args:
+        in_features (int): width of the input.
+        hidden_features (int, optional): width of the hidden layer; falls
+            back to ``in_features`` when falsy.
+        out_features (int, optional): width of the output; falls back to
+            ``in_features`` when falsy.
+        act_layer (callable): factory of the activation layer, called with no
+            arguments.
+        drop (float): dropout probability used after both linear layers.
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        hidden_width = hidden_features or in_features
+        out_width = out_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_width)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_width, out_width)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        """Apply fc1 -> act -> drop -> fc2 -> drop."""
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class Attention(nn.Layer):
+    """Multi-head self-attention with an optional learned relative position bias.
+
+    Args:
+        dim (int): width of the input and output features.
+        num_heads (int): number of attention heads.
+        qkv_bias (bool): when True, learnable biases are created for the query
+            and value projections; the key projection gets a zero bias.
+        qk_scale (float, optional): score scale; falls back to
+            ``head_dim ** -0.5`` when falsy.
+        attn_drop (float): dropout probability on the attention probabilities.
+        proj_drop (float): dropout probability after the output projection.
+        window_size (sequence of 2 ints, optional): when given, a relative
+            position bias table and its lookup index are created for a window
+            of this size plus one extra token.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 window_size=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        # The fused projection has no bias of its own; forward() assembles one
+        # from q_bias / v_bias when they exist.
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)
+
+        if qkv_bias:
+            self.q_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+            self.v_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        if window_size:
+            self.window_size = window_size
+            # The 3 extra entries are reserved for the extra token, see the
+            # "cls" comment below.
+            self.num_relative_distance = (2 * window_size[0] - 1) * (
+                2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = self.create_parameter(
+                shape=(self.num_relative_distance, num_heads),
+                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH
+            # cls to token & token 2 cls & cls to cls
+
+            # get pair-wise relative position index for each token inside the window
+            coords_h = paddle.arange(window_size[0])
+            coords_w = paddle.arange(window_size[1])
+            coords = paddle.stack(paddle.meshgrid(
+                [coords_h, coords_w]))  # 2, Wh, Ww
+            coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+            coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
+            coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
+            relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
+            )
+
+            #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Wh
+            relative_coords = relative_coords.transpose(
+                (1, 2, 0))  #.contiguous()  # Wh*Ww, Wh*Ww, 2
+            relative_coords[:, :, 0] += window_size[
+                0] - 1  # shift to start from 0
+            relative_coords[:, :, 1] += window_size[1] - 1
+            # Scale the row offset so that (row, col) pairs map to unique ids
+            # once the two components are summed.
+            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+            relative_position_index = \
+                paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
+            relative_position_index[1:, 1:] = relative_coords.sum(
+                -1)  # Wh*Ww, Wh*Ww
+            # Row 0 / column 0 belong to the extra token and use the three
+            # reserved table entries.
+            relative_position_index[0, 0:] = self.num_relative_distance - 3
+            relative_position_index[0:, 0] = self.num_relative_distance - 2
+            relative_position_index[0, 0] = self.num_relative_distance - 1
+
+            self.register_buffer("relative_position_index",
+                                 relative_position_index)
+            # trunc_normal_(self.relative_position_bias_table, std=.0)
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        """Attend over axis 1 of ``x``.
+
+        Args:
+            x (Tensor): input whose axes 1 and 2 are read as token count N and
+                feature width C.
+            rel_pos_bias (Tensor, optional): extra bias added to the attention
+                scores before the softmax.
+
+        Returns:
+            Tensor: projected attention output reshaped to (-1, N, C).
+        """
+        x_shape = paddle.shape(x)
+        N, C = x_shape[1], x_shape[2]
+
+        qkv_bias = None
+        if self.q_bias is not None:
+            # Bias layout follows the fused projection: query, key (zeros),
+            # value.
+            qkv_bias = paddle.concat(
+                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
+        qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
+
+        # Split into (3, batch, heads, N, head_dim) so q, k, v can be indexed
+        # on the leading axis.
+        qkv = qkv.reshape((-1, N, 3, self.num_heads,
+                           C // self.num_heads)).transpose((2, 0, 3, 1, 4))
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
+
+        if self.relative_position_bias_table is not None:
+            # Gather one table row per (query, key) pair.
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index.reshape([-1])].reshape([
+                    self.window_size[0] * self.window_size[1] + 1,
+                    self.window_size[0] * self.window_size[1] + 1, -1
+                ])  # Wh*Ww,Wh*Ww,nH
+            relative_position_bias = relative_position_bias.transpose(
+                (2, 0, 1))  #.contiguous()  # nH, Wh*Ww, Wh*Ww
+            attn = attn + relative_position_bias.unsqueeze(0)
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = nn.functional.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+
+        # Merge the heads back into the feature axis.
+        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Layer):
+    """Pre-norm transformer block: attention and MLP, each inside a residual.
+
+    Args:
+        dim (int): width of the features.
+        num_heads (int): number of attention heads.
+        mlp_ratio (float): hidden MLP width as a multiple of ``dim``.
+        qkv_bias (bool): forwarded to ``Attention``.
+        qk_scale (float, optional): forwarded to ``Attention``.
+        drop (float): dropout probability for the attention projection and
+            the MLP.
+        attn_drop (float): dropout probability on the attention probabilities.
+        drop_path (float): stochastic-depth probability; values <= 0 install
+            an ``Identity`` instead of ``DropPath``.
+        window_size (sequence, optional): forwarded to ``Attention``.
+        init_values (float, optional): when not None, two learnable per-channel
+            scales (``gamma_1``, ``gamma_2``) are created with this initial
+            value and multiplied onto the branch outputs.
+        act_layer (callable): activation factory for the MLP.
+        norm_layer (str): expression evaluated to obtain the class of the
+            second normalization layer. The first one is always
+            ``nn.LayerNorm`` with epsilon 1e-6.
+        epsilon (float): epsilon of the second normalization layer.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 window_size=None,
+                 init_values=None,
+                 act_layer=nn.GELU,
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-5):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = Identity()
+        norm_cls = eval(norm_layer)
+        self.norm2 = norm_cls(dim, epsilon=epsilon)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop)
+        if init_values is None:
+            self.gamma_1, self.gamma_2 = None, None
+        else:
+            self.gamma_1 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+
+    def forward(self, x, rel_pos_bias=None):
+        """Apply the attention branch, then the MLP branch, with residuals."""
+        scaled = self.gamma_1 is not None
+
+        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+        if scaled:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+
+        mlp_out = self.mlp(self.norm2(x))
+        if scaled:
+            mlp_out = self.gamma_2 * mlp_out
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+
+    Projects an image to patch features with a single convolution whose
+    kernel size and stride both equal ``patch_size``.
+
+    Args:
+        img_size (sequence of 2 ints): nominal image size; only used to derive
+            the patch-count attributes.
+        patch_size (int): side length of a patch.
+        in_chans (int): number of input channels.
+        embed_dim (int): number of output channels.
+    """
+
+    def __init__(self,
+                 img_size=[224, 224],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768):
+        super().__init__()
+        grid = (img_size[0] // patch_size, img_size[1] // patch_size)
+        # attribute names are kept as is: "_w" comes from img_size[0] and
+        # "_h" from img_size[1]
+        self.num_patches_w, self.num_patches_h = grid
+        self.patch_shape = grid
+        self.num_patches = grid[0] * grid[1]
+        self.img_size = img_size
+        self.patch_size = patch_size
+
+        self.proj = nn.Conv2D(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    @property
+    def num_patches_in_h(self):
+        """Patch count derived from ``img_size[1]``."""
+        return self.img_size[1] // self.patch_size
+
+    @property
+    def num_patches_in_w(self):
+        """Patch count derived from ``img_size[0]``."""
+        return self.img_size[0] // self.patch_size
+
+    def forward(self, x, mask=None):
+        """Return ``self.proj(x)``; ``mask`` is accepted but not used."""
+        # unpacking four values requires ``x`` to be 4-D
+        _batch, _chans, _height, _width = x.shape
+        return self.proj(x)
+
+
+class RelativePositionBias(nn.Layer):
+    """Learned relative position bias for a window plus one extra token.
+
+    Mirrors the bias construction inside ``Attention`` so the table can be
+    held by a standalone layer.
+
+    Args:
+        window_size (sequence of 2 ints): window height and width (Wh, Ww).
+        num_heads (int): number of attention heads (one bias per head).
+    """
+
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        # The 3 extra entries are reserved for the extra token, see below.
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = self.create_parameter(
+            shape=(self.num_relative_distance, num_heads),
+            default_initializer=zeros_)
+        # cls to token & token 2 cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(window_size[0])
+        coords_w = paddle.arange(window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = coords.flatten(1)  # 2, Wh*Ww
+
+        # Same unsqueeze-based pairwise difference as in ``Attention``.
+        relative_coords = paddle.unsqueeze(
+            coords_flatten, 2) - paddle.unsqueeze(coords_flatten,
+                                                  1)  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.transpose(
+            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        # Scale the row offset so that (row, col) pairs map to unique ids once
+        # the two components are summed.
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = \
+            paddle.zeros(shape=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(
+            -1)  # Wh*Ww, Wh*Ww
+        # Row 0 / column 0 belong to the extra token and use the three
+        # reserved table entries.
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def forward(self):
+        """Return the bias gathered per (query, key) pair, heads first.
+
+        Returns:
+            Tensor: shape (nH, Wh*Ww + 1, Wh*Ww + 1).
+        """
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
+                 self.window_size[0] * self.window_size[1] + 1,
+                 self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww+1, Wh*Ww+1, nH
+        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww+1, Wh*Ww+1
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, token=False):
+    """Build a fixed sinusoidal position encoding table.
+
+    Entry ``[pos, j]`` is ``sin`` (even ``j``) or ``cos`` (odd ``j``) of
+    ``pos / 10000 ** (2 * (j // 2) / d_hid)``.
+
+    Args:
+        n_position (int): number of positions (rows).
+        d_hid (int): encoding width (columns).
+        token (bool): when True, one extra all-zero row is appended after the
+            last position.
+
+    Returns:
+        Tensor: float32 table with a leading axis of size 1, i.e.
+            (1, n_position [+ 1], d_hid).
+    """
+
+    def get_position_angle_vec(position):
+        return [
+            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+            for hid_j in range(d_hid)
+        ]
+
+    sinusoid_table = np.array(
+        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    if token:
+        # numpy names this argument ``axis``; ``dim`` raises a TypeError
+        sinusoid_table = np.concatenate(
+            [sinusoid_table, np.zeros([1, d_hid])], axis=0)
+
+    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
+
+
+@register
+@serializable
+class VisionTransformer(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=[672, 1092],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 init_values=None,
+                 use_rel_pos_bias=False,
+                 use_shared_rel_pos_bias=False,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 out_indices=[3, 5, 7, 11],
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=True,
+                 with_fpn=True,
+                 num_fpn_levels=4,
+                 use_checkpoint=False,
+                 **args):
+        """Build patch embedding, position embedding, blocks and optional FPN.
+
+        Args:
+            img_size, patch_size, in_chans, embed_dim: forwarded to
+                ``PatchEmbed``; ``embed_dim`` is also the width of every block.
+            depth (int): number of ``Block`` layers.
+            num_heads, mlp_ratio, qkv_bias, qk_scale, norm_layer,
+                init_values, epsilon: forwarded unchanged to every ``Block``.
+            drop_rate (float): dropout after the position embedding, also
+                passed to every block as ``drop``.
+            attn_drop_rate (float): passed to every block as ``attn_drop``.
+            drop_path_rate (float): upper end of the per-block drop-path
+                schedule (linearly spaced from 0 over ``depth`` blocks).
+            use_rel_pos_bias (bool): when True each block receives
+                ``patch_embed.patch_shape`` as ``window_size``, else None.
+            use_shared_rel_pos_bias (bool): when True a single
+                ``RelativePositionBias`` is created and passed to all blocks.
+            final_norm (bool): stored on the instance; it is not passed to
+                ``init_fpn`` here.
+            pretrained (str|None): URL or local path read by ``init_weight``.
+            out_indices (list[int]): indices of blocks whose output is
+                collected in ``forward``; at most 4.
+            use_abs_pos_emb (bool): create a learnable ``pos_embed``; takes
+                precedence over ``use_sincos_pos_emb``.
+            use_sincos_pos_emb (bool): create a fixed 2-D sin-cos
+                ``pos_embed`` (``stop_gradient`` set to True).
+            with_fpn (bool): build ``fpn1..fpn4`` via ``init_fpn``.
+            num_fpn_levels (int): number of FPN outputs; at most 4.
+            use_checkpoint (bool): seeds paddle with 0 here; ``forward`` uses
+                recompute for the blocks while training.
+            **args: accepted and ignored.
+        """
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        self.use_sincos_pos_emb = use_sincos_pos_emb
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
+
+        if use_checkpoint:
+            paddle.seed(0)
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+
+        self.pos_w = self.patch_embed.num_patches_in_w
+        self.pos_h = self.patch_embed.num_patches_in_h
+
+        # cls token starts as all zeros
+        self.cls_token = self.create_parameter(
+            shape=(1, 1, embed_dim),
+            default_initializer=paddle.nn.initializer.Constant(value=0.))
+
+        if use_abs_pos_emb:
+            # learnable embedding: one slot per patch plus one for the cls token
+            self.pos_embed = self.create_parameter(
+                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+                default_initializer=paddle.nn.initializer.TruncatedNormal(
+                    std=.02))
+        elif use_sincos_pos_emb:
+            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
+
+            # the plain-tensor assignment is immediately replaced by a
+            # parameter holding the same values, frozen via stop_gradient
+            self.pos_embed = pos_embed
+            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
+            self.pos_embed.set_value(pos_embed.numpy())
+            self.pos_embed.stop_gradient = True
+
+        else:
+            self.pos_embed = None
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+
+        # per-block drop-path rates, linearly increasing from 0
+        dpr = np.linspace(0, drop_path_rate, depth)
+
+        self.blocks = nn.LayerList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                init_values=init_values,
+                window_size=self.patch_embed.patch_shape
+                if use_rel_pos_bias else None,
+                epsilon=epsilon) for i in range(depth)
+        ])
+
+        # NOTE: weights are loaded before init_fpn() runs, so the FPN layers
+        # do not exist yet when set_state_dict is called.
+        self.pretrained = pretrained
+        self.init_weight()
+
+        assert len(out_indices) <= 4, ''
+        self.out_indices = out_indices
+        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
+        # with FPN: the last num_fpn_levels of strides 4/8/16/32;
+        # without: every collected feature keeps the patch stride
+        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
+        ]
+
+        self.norm = Identity()
+
+        if self.with_fpn:
+            assert num_fpn_levels <= 4, ''
+            self.init_fpn(
+                embed_dim=embed_dim,
+                patch_size=patch_size, )
+
+    def init_weight(self):
+        """Load ``self.pretrained`` weights, resizing ``pos_embed`` if needed.
+
+        Does nothing when ``self.pretrained`` is empty. A value containing
+        ``'http'`` is treated as a URL and downloaded; anything else is used
+        as a local path. When the checkpoint's ``pos_embed`` has a different
+        shape from the model's, it is bicubically resized to
+        ``(self.pos_h, self.pos_w)`` before the state dict is applied.
+        """
+        pretrained = self.pretrained
+        if not pretrained:
+            return
+
+        if 'http' in pretrained:  # URL
+            path = paddle.utils.download.get_weights_path_from_url(pretrained)
+        else:  # model in local path
+            path = pretrained
+
+        load_state_dict = paddle.load(path)
+        pos_embed_name = "pos_embed"
+
+        # The model has no pos_embed when both use_abs_pos_emb and
+        # use_sincos_pos_emb are off; skip the shape check in that case
+        # instead of failing on ``None.shape``.
+        if pos_embed_name in load_state_dict and self.pos_embed is not None:
+            load_pos_embed = paddle.to_tensor(
+                load_state_dict[pos_embed_name], dtype="float32")
+            if self.pos_embed.shape != load_pos_embed.shape:
+                # assumes the checkpoint grid is square (plus one cls slot)
+                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
+                load_state_dict[pos_embed_name] = self.resize_pos_embed(
+                    load_pos_embed, (pos_size, pos_size),
+                    (self.pos_h, self.pos_w))
+
+                print("Load pos_embed and resize it from {} to {} .".format(
+                    load_pos_embed.shape, self.pos_embed.shape))
+
+        self.set_state_dict(load_state_dict)
+        print("Load load_state_dict....")
+
+    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
+        """Create ``fpn1..fpn4`` and ``self.norm``.
+
+        The four heads rescale a feature map by x4, x2, x1, x0.5 for
+        ``patch_size == 16`` and by x2, x1, x0.5, x0.25 for
+        ``patch_size == 8``.
+
+        Args:
+            embed_dim (int): channel count of every FPN layer.
+            patch_size (int): 8 or 16.
+            out_with_norm (bool): use ``LayerNorm`` for ``self.norm`` instead
+                of ``Identity``.
+
+        Raises:
+            ValueError: if ``patch_size`` is neither 8 nor 16.
+        """
+        if patch_size == 16:
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+                nn.BatchNorm2D(embed_dim),
+                nn.GELU(),
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn3 = Identity()
+
+            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
+        elif patch_size == 8:
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = Identity()
+
+            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
+
+            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
+        else:
+            # Without this, fpn1..fpn4 stay undefined and forward() fails
+            # later with an AttributeError that hides the real cause.
+            raise ValueError(
+                'init_fpn only supports patch_size 8 or 16, got {}'.format(
+                    patch_size))
+
+        if not out_with_norm:
+            self.norm = Identity()
+        else:
+            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
+
+    def interpolate_pos_encoding(self, x, w, h):
+        """Return ``self.pos_embed`` resized to the token grid of ``x``.
+
+        Args:
+            x (Tensor): token sequence including the cls token; only
+                ``x.shape[1]`` and ``x.shape[-1]`` are read.
+            w (int): input extent, in pixels, along the first grid axis.
+            h (int): input extent, in pixels, along the second grid axis.
+                NOTE: ``forward`` calls this with (image height, image width).
+
+        Returns:
+            Tensor: the stored embedding when the grid already matches,
+            otherwise the cls embedding followed by the bicubically
+            interpolated patch embeddings.
+        """
+        patch_embed = self.patch_embed
+        num_tokens = x.shape[1] - 1
+        num_stored = self.pos_embed.shape[1] - 1
+        w0 = w // patch_embed.patch_size
+        h0 = h // patch_embed.patch_size
+
+        same_grid = (num_tokens == num_stored and
+                     w0 == patch_embed.num_patches_w and
+                     h0 == patch_embed.num_patches_h)
+        if same_grid:
+            return self.pos_embed
+
+        cls_embed = self.pos_embed[:, 0]
+        grid_embed = self.pos_embed[:, 1:]
+        dim = x.shape[-1]
+
+        # Lay the patch embeddings out as a channels-first 2-D map and
+        # resize it to the exact target size (no scale_factor rounding).
+        grid_embed = grid_embed.reshape(
+            [1, patch_embed.num_patches_w, patch_embed.num_patches_h, dim])
+        grid_embed = grid_embed.transpose((0, 3, 1, 2))
+        grid_embed = nn.functional.interpolate(
+            grid_embed, (w0, h0), mode='bicubic')
+
+        assert int(w0) == grid_embed.shape[-2] and int(
+            h0) == grid_embed.shape[-1]
+
+        # back to a flat token sequence, then put the cls slot in front
+        grid_embed = grid_embed.transpose((0, 2, 3, 1))
+        grid_embed = grid_embed.reshape([1, -1, dim])
+        return paddle.concat((cls_embed.unsqueeze(0), grid_embed), axis=1)
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Bicubically resample a position embedding onto a new patch grid.
+        Args:
+            pos_embed (Tensor): embedding whose first slot along axis 1 is
+                the cls token, followed by old_hw[0] * old_hw[1] patch slots
+            old_hw (list[int]): height and width of the current grid
+            new_hw (list[int]): height and width of the target grid
+        Returns:
+            Tensor: cls slot (unchanged) followed by the resized patch slots
+        """
+        cls_part, grid_part = pos_embed[:, :1, :], pos_embed[:, 1:, :]
+
+        # tokens-last -> channels-first 2-D map
+        grid_part = grid_part.transpose([0, 2, 1]).reshape(
+            [1, -1, old_hw[0], old_hw[1]])
+        grid_part = F.interpolate(
+            grid_part, new_hw, mode='bicubic', align_corners=False)
+        # 2-D map -> flat token sequence
+        grid_part = grid_part.flatten(2).transpose([0, 2, 1])
+
+        return paddle.concat([cls_part, grid_part], axis=1)
+
+    def build_2d_sincos_position_embedding(
+            self,
+            embed_dim=768,
+            temperature=10000., ):
+        """Build a fixed 2-D sin-cos position embedding with a zero cls slot.
+
+        Args:
+            embed_dim (int): embedding width; must be divisible by 4
+                (sin/cos for each of the two grid axes).
+            temperature (float): base of the geometric frequency schedule.
+
+        Returns:
+            Tensor: float32 tensor of shape (1, h * w + 1, embed_dim), where
+            (h, w) is ``self.patch_embed.patch_shape``.
+        """
+        h, w = self.patch_embed.patch_shape
+        grid_w, grid_h = paddle.meshgrid(
+            paddle.arange(
+                w, dtype=paddle.float32),
+            paddle.arange(
+                h, dtype=paddle.float32))
+        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+
+        pos_dim = embed_dim // 4
+        exponent = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        inv_freq = (1. / (temperature**exponent)).unsqueeze(0)
+
+        # outer product: one row per grid cell, one column per frequency
+        out_w = paddle.matmul(grid_w.flatten().unsqueeze(-1), inv_freq)
+        out_h = paddle.matmul(grid_h.flatten().unsqueeze(-1), inv_freq)
+
+        parts = [
+            paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
+            paddle.cos(out_h)
+        ]
+        pos_emb = paddle.concat(parts, axis=1).unsqueeze(0)
+
+        # all-zero slot for the cls token goes first
+        cls_slot = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
+        return paddle.concat([cls_slot, pos_emb], axis=1)
+
+    def forward(self, x):
+        """Run the backbone and return the collected feature maps.
+
+        Args:
+            x (Tensor|dict): image batch, or a dict holding it under
+                ``'image'``.
+
+        Returns:
+            list[Tensor]: with FPN, one map per FPN head; otherwise one map
+            per entry of ``self.out_indices`` that was reached.
+        """
+        if isinstance(x, dict):
+            x = x['image']
+        _, _, h, w = x.shape
+
+        x = self.patch_embed(x)
+        B, D, Hp, Wp = x.shape  # b * c * h * w
+
+        cls_shape = (B, self.cls_token.shape[-2], self.cls_token.shape[-1])
+        cls_tokens = self.cls_token.expand(cls_shape)
+        patch_tokens = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
+        x = paddle.concat([cls_tokens, patch_tokens], axis=1)
+
+        if self.pos_embed is not None:
+            x = x + self.interpolate_pos_encoding(x, h, w)
+        x = self.pos_drop(x)
+
+        rel_pos_bias = None
+        if self.rel_pos_bias is not None:
+            rel_pos_bias = self.rel_pos_bias()
+
+        feats = []
+        for idx, blk in enumerate(self.blocks):
+            if self.use_checkpoint and self.training:
+                # recompute activations in backward instead of storing them
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, rel_pos_bias, preserve_rng_state=True)
+            else:
+                x = blk(x, rel_pos_bias)
+
+            if idx not in self.out_indices:
+                continue
+            # drop the cls token and restore the channels-first 2-D layout
+            normed = self.norm(x[:, 1:, :])
+            channels_first = paddle.transpose(normed, perm=[0, 2, 1])
+            feats.append(
+                paddle.reshape(
+                    channels_first, shape=[B, D, Hp, Wp]))
+
+        if not self.with_fpn:
+            return feats
+
+        fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
+            -self.num_fpn_levels:]
+        assert len(fpns) == len(feats) or len(feats) == 1, ''
+        # either one feature per head, or the single last feature for all
+        one_per_head = len(feats) == len(fpns)
+        return [
+            m(feats[i] if one_per_head else feats[-1])
+            for i, m in enumerate(fpns)
+        ]
+
+    @property
+    def num_layers(self):
+        """Number of entries in ``self.blocks``."""
+        block_count = len(self.blocks)
+        return block_count
+
+    @property
+    def no_weight_decay(self):
+        """Set of parameter names: ``pos_embed`` and ``cls_token``."""
+        # presumably read by the optimizer setup to skip weight decay;
+        # the consumer is not visible in this file
+        return set(('pos_embed', 'cls_token'))
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` per (out_channels, out_strides) pair."""
+        specs = []
+        for channels, stride in zip(self.out_channels, self.out_strides):
+            specs.append(ShapeSpec(channels=channels, stride=stride))
+        return specs
--- a/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
@@ -0,0 +1,749 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import math
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant, TruncatedNormal
+
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+
+from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
+                                window_unpartition)
+from ..initializer import linear_init_
+
+__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
+
+
+class Mlp(nn.Layer):
+    """Two-layer feed-forward network: fc1 -> activation -> dropout -> fc2
+    -> dropout.
+
+    Args:
+        in_features (int): input width.
+        hidden_features (int|None): hidden width; falls back to
+            ``in_features``.
+        out_features (int|None): output width; falls back to
+            ``in_features``.
+        act_layer (str): expression evaluated to obtain the activation class.
+        drop (float): dropout probability used after both linear layers.
+        lr_factor (float): learning-rate multiplier for all weights/biases.
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer='nn.GELU',
+                 drop=0.,
+                 lr_factor=1.0):
+        super().__init__()
+        out_dim = out_features or in_features
+        hidden_dim = hidden_features or in_features
+
+        def scaled_attr():
+            # a fresh ParamAttr per parameter, all with the same lr multiplier
+            return ParamAttr(learning_rate=lr_factor)
+
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_dim,
+            weight_attr=scaled_attr(),
+            bias_attr=scaled_attr())
+        # act_layer is a config string such as 'nn.GELU'
+        self.act = eval(act_layer)()
+        self.fc2 = nn.Linear(
+            hidden_dim,
+            out_dim,
+            weight_attr=scaled_attr(),
+            bias_attr=scaled_attr())
+        self.drop = nn.Dropout(drop)
+
+        self._init_weights()
+
+    def _init_weights(self):
+        for layer in (self.fc1, self.fc2):
+            linear_init_(layer)
+
+    def forward(self, x):
+        hidden = self.fc1(x)
+        hidden = self.act(hidden)
+        hidden = self.drop(hidden)
+        out = self.fc2(hidden)
+        return self.drop(out)
+
+
+class Attention(nn.Layer):
+    """Multi-head self-attention over a (B, H, W, C) feature map, with
+    optional decomposed relative position embeddings.
+
+    Args:
+        dim (int): channel count C of the input.
+        num_heads (int): number of attention heads; head width is
+            ``dim // num_heads``.
+        qkv_bias (bool): create separate learnable q and v biases (k gets a
+            zero bias) that are used instead of the ``qkv`` layer's own bias.
+        attn_bias (bool): give the fused ``qkv`` linear layer a bias.
+        attn_drop (float): dropout on the attention weights.
+        proj_drop (float): accepted for interface compatibility; unused.
+        use_rel_pos (bool): add decomposed relative position terms.
+        rel_pos_zero_init (bool): keep the relative position tables at zero;
+            when False they are drawn from a truncated normal (std 0.02).
+        window_size (int|None): side used to size the relative position
+            tables; falls back to ``input_size[0]`` when None.
+        input_size (list[int]|None): (H, W) of the token grid; required when
+            ``window_size`` is None.
+        qk_scale (float|None): overrides the default ``head_dim ** -0.5``.
+        lr_factor (float): learning-rate multiplier for this layer's params.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 use_rel_pos=False,
+                 rel_pos_zero_init=True,
+                 window_size=None,
+                 input_size=None,
+                 qk_scale=None,
+                 lr_factor=1.0):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = qk_scale or self.head_dim**-0.5
+        self.use_rel_pos = use_rel_pos
+        self.input_size = input_size
+        self.rel_pos_zero_init = rel_pos_zero_init
+        self.window_size = window_size
+        self.lr_factor = lr_factor
+
+        self.qkv = nn.Linear(
+            dim,
+            dim * 3,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor)
+            if attn_bias else False)
+        if qkv_bias:
+            self.q_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+            self.v_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        self.proj = nn.Linear(
+            dim,
+            dim,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+        self.attn_drop = nn.Dropout(attn_drop)
+        if window_size is None:
+            # global attention: size the rel-pos tables from the full grid
+            self.window_size = self.input_size[0]
+
+        self._init_weights()
+
+    def _init_weights(self):
+        """Initialise the linear layers and create the rel-pos tables."""
+        linear_init_(self.qkv)
+        linear_init_(self.proj)
+
+        if self.use_rel_pos:
+            self.rel_pos_h = self.create_parameter(
+                [2 * self.window_size - 1, self.head_dim],
+                attr=ParamAttr(learning_rate=self.lr_factor),
+                default_initializer=Constant(value=0.))
+            self.rel_pos_w = self.create_parameter(
+                [2 * self.window_size - 1, self.head_dim],
+                attr=ParamAttr(learning_rate=self.lr_factor),
+                default_initializer=Constant(value=0.))
+
+            if not self.rel_pos_zero_init:
+                # The initializer has to be *applied* to each parameter;
+                # constructing it with the parameter as an argument (as
+                # before) left both tables at zero.
+                trunc_normal = TruncatedNormal(std=0.02)
+                trunc_normal(self.rel_pos_h)
+                trunc_normal(self.rel_pos_w)
+
+    def get_rel_pos(self, seq_size, rel_pos):
+        """Return the (seq_size, seq_size, head_dim) relative embeddings.
+
+        ``rel_pos`` is linearly interpolated along its first axis when its
+        length differs from ``2 * seq_size - 1``.
+        """
+        max_rel_dist = int(2 * seq_size - 1)
+        # Interpolate rel pos if needed.
+        if rel_pos.shape[0] != max_rel_dist:
+            # Interpolate rel pos.
+            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
+            rel_pos = rel_pos.transpose([0, 2, 1])
+            rel_pos_resized = F.interpolate(
+                rel_pos,
+                size=(max_rel_dist, ),
+                mode="linear",
+                data_format='NCW')
+            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
+            rel_pos_resized = rel_pos_resized.transpose([1, 0])
+        else:
+            rel_pos_resized = rel_pos
+
+        # pairwise offsets i - j, shifted into [0, 2 * seq_size - 2]
+        coords = paddle.arange(seq_size, dtype='float32')
+        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
+        relative_coords += (seq_size - 1)
+        relative_coords = relative_coords.astype('int64').flatten()
+
+        return paddle.index_select(rel_pos_resized, relative_coords).reshape(
+            [seq_size, seq_size, self.head_dim])
+
+    def add_decomposed_rel_pos(self, attn, q, h, w):
+        """
+        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+        Args:
+            attn (Tensor): attention map.
+            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+            h (int): height of the token grid.
+            w (int): width of the token grid.
+        Returns:
+            attn (Tensor): attention map with added relative positional embeddings.
+        """
+        Rh = self.get_rel_pos(h, self.rel_pos_h)
+        Rw = self.get_rel_pos(w, self.rel_pos_w)
+
+        B, _, dim = q.shape
+        r_q = q.reshape([B, h, w, dim])
+        # bhwc, hch->bhwh1
+        # bwhc, wcw->bhw1w
+        rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
+        rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)
+
+        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
+        return attn.reshape([B, h * w, h * w])
+
+    def forward(self, x):
+        """Apply self-attention to ``x`` of shape (B, H, W, C)."""
+        B, H, W, C = paddle.shape(x)
+
+        if self.q_bias is not None:
+            # k gets no bias: zeros sit between the q and v biases
+            qkv_bias = paddle.concat(
+                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
+            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
+        else:
+            qkv = self.qkv(x)
+
+        # Split into q/k/v and fold heads into the batch axis. This must
+        # happen on both paths; the q_bias path used to skip it, so
+        # qkv[0..2] indexed the batch axis instead of q/k/v.
+        qkv = qkv.reshape(
+            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
+                [2, 0, 3, 1, 4]).reshape(
+                    [3, B * self.num_heads, H * W, self.head_dim])
+
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale
+
+        if self.use_rel_pos:
+            attn = self.add_decomposed_rel_pos(attn, q, H, W)
+
+        attn = F.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+        # un-fold the heads and restore the (B, H, W, C) layout
+        x = attn.matmul(v).reshape(
+            [B, self.num_heads, H * W, self.head_dim]).transpose(
+                [0, 2, 1, 3]).reshape([B, H, W, C])
+        x = self.proj(x)
+        return x
+
+
+class Block(nn.Layer):
+    """Pre-norm transformer block: attention and MLP, each wrapped in a
+    residual connection with optional drop-path and per-channel scaling.
+
+    When ``window_size`` is not None the normalised input is partitioned
+    into windows before attention and merged back afterwards.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 qk_scale=None,
+                 init_values=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 use_rel_pos=True,
+                 rel_pos_zero_init=True,
+                 window_size=None,
+                 input_size=None,
+                 act_layer='nn.GELU',
+                 norm_layer='nn.LayerNorm',
+                 lr_factor=1.0,
+                 epsilon=1e-5):
+        super().__init__()
+        self.window_size = window_size
+
+        # norm_layer is a config string such as 'nn.LayerNorm'
+        norm_cls = eval(norm_layer)
+
+        def build_norm():
+            # scale/shift use the per-layer lr multiplier and no L2 decay
+            return norm_cls(
+                dim,
+                weight_attr=ParamAttr(
+                    learning_rate=lr_factor, regularizer=L2Decay(0.0)),
+                bias_attr=ParamAttr(
+                    learning_rate=lr_factor, regularizer=L2Decay(0.0)),
+                epsilon=epsilon)
+
+        self.norm1 = build_norm()
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_bias=attn_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            window_size=window_size,
+            input_size=input_size,
+            lr_factor=lr_factor)
+
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = Identity()
+        self.norm2 = build_norm()
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop,
+                       lr_factor=lr_factor)
+
+        if init_values is None:
+            self.gamma_1, self.gamma_2 = None, None
+        else:
+            # learnable per-channel scales for the two residual branches
+            self.gamma_1 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+
+    def forward(self, x):
+        windowed = self.window_size is not None
+
+        # attention branch
+        branch = self.norm1(x)
+        if windowed:
+            branch, pad_hw, num_hw = window_partition(branch, self.window_size)
+        branch = self.attn(branch)
+        if self.gamma_1 is not None:
+            branch = self.gamma_1 * branch
+        if windowed:
+            branch = window_unpartition(branch, pad_hw, num_hw,
+                                        (x.shape[1], x.shape[2]))
+        x = x + self.drop_path(branch)
+
+        # MLP branch
+        mlp_out = self.mlp(self.norm2(x))
+        if self.gamma_2 is not None:
+            mlp_out = self.gamma_2 * mlp_out
+        return x + self.drop_path(mlp_out)
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding: a strided convolution whose kernel and
+    stride both equal ``patch_size``.
+    """
+
+    def __init__(self,
+                 img_size=(224, 224),
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 lr_factor=0.01):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+
+        weight_attr = ParamAttr(learning_rate=lr_factor)
+        bias_attr = ParamAttr(learning_rate=lr_factor)
+        self.proj = nn.Conv2D(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+    @property
+    def num_patches_in_h(self):
+        # NOTE(review): reads img_size[1] -- confirm the intended axis order
+        extent = self.img_size[1]
+        return extent // self.patch_size
+
+    @property
+    def num_patches_in_w(self):
+        # NOTE(review): reads img_size[0] -- confirm the intended axis order
+        extent = self.img_size[0]
+        return extent // self.patch_size
+
+    def forward(self, x):
+        return self.proj(x)
+
+
+@register
+@serializable
+class VisionTransformer2D(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=(1024, 1024),
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 qk_scale=None,
+                 init_values=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 act_layer='nn.GELU',
+                 norm_layer='nn.LayerNorm',
+                 lr_decay_rate=1.0,
+                 global_attn_indexes=(2, 5, 8, 11),
+                 use_abs_pos=False,
+                 use_rel_pos=False,
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=False,
+                 rel_pos_zero_init=True,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 window_size=None,
+                 out_indices=(11, ),
+                 with_fpn=False,
+                 use_checkpoint=False,
+                 *args,
+                 **kwargs):
+        """Build patch embedding, position embedding, blocks and optional FPN.
+
+        Args:
+            img_size (tuple[int]): indexed as (height, width) when computing
+                ``patch_h`` / ``patch_w``; also forwarded to ``PatchEmbed``.
+            patch_size, in_chans, embed_dim: forwarded to ``PatchEmbed``;
+                ``embed_dim`` is also the width of every block.
+            depth (int): number of ``Block`` layers.
+            num_heads, mlp_ratio, qkv_bias, attn_bias, qk_scale, init_values,
+                act_layer, norm_layer, use_rel_pos, rel_pos_zero_init,
+                epsilon: forwarded unchanged to every ``Block``.
+            drop_rate, attn_drop_rate: passed to every block as ``drop`` /
+                ``attn_drop``.
+            drop_path_rate (float): upper end of the per-block drop-path
+                schedule (linearly spaced from 0 over ``depth`` blocks).
+            lr_decay_rate (float): base of the layer-wise learning-rate
+                factor, see ``get_vit_lr_decay_rate``.
+            global_attn_indexes (tuple[int]): blocks at these indices get
+                ``window_size=None``; all others get ``window_size``.
+            use_abs_pos (bool): stored on the instance; not otherwise used in
+                this constructor.
+            use_abs_pos_emb (bool): create a learnable ``pos_embed``; takes
+                precedence over ``use_sincos_pos_emb``.
+            use_sincos_pos_emb (bool): create a fixed sin-cos ``pos_embed``
+                (``stop_gradient`` set to True).
+            final_norm (bool): forwarded to ``init_fpn`` as ``out_with_norm``.
+            pretrained (str|None): URL or local path read by ``init_weight``.
+            window_size (int|None): window size for non-global blocks.
+            out_indices (tuple[int]): at most 4 entries; its length sets the
+                number of output channels/strides.
+            with_fpn (bool): build the FPN heads via ``init_fpn``.
+            use_checkpoint (bool): stored; also seeds paddle with 0 here.
+            *args, **kwargs: accepted and ignored.
+        """
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.depth = depth
+        self.global_attn_indexes = global_attn_indexes
+        self.epsilon = epsilon
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+
+        # token grid size
+        self.patch_h = img_size[0] // patch_size
+        self.patch_w = img_size[1] // patch_size
+        self.num_patches = self.patch_h * self.patch_w
+        self.use_abs_pos = use_abs_pos
+        self.use_abs_pos_emb = use_abs_pos_emb
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+
+        # per-block drop-path rates, linearly increasing from 0
+        dpr = np.linspace(0, drop_path_rate, depth)
+        if use_checkpoint:
+            paddle.seed(0)
+
+        if use_abs_pos_emb:
+            # pos_w / pos_h exist only on this branch; init_weight reads
+            # them only when use_abs_pos_emb is set
+            self.pos_w = self.patch_embed.num_patches_in_w
+            self.pos_h = self.patch_embed.num_patches_in_h
+            self.pos_embed = self.create_parameter(
+                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+                default_initializer=paddle.nn.initializer.TruncatedNormal(
+                    std=.02))
+        elif use_sincos_pos_emb:
+            pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
+                                                              self.patch_w)
+
+            # the plain-tensor assignment is immediately replaced by a
+            # parameter holding the same values, frozen via stop_gradient
+            self.pos_embed = pos_embed
+            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
+            self.pos_embed.set_value(pos_embed.numpy())
+            self.pos_embed.stop_gradient = True
+        else:
+            self.pos_embed = None
+
+        self.blocks = nn.LayerList([
+            Block(
+                embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                attn_bias=attn_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=None
+                if i in self.global_attn_indexes else window_size,
+                input_size=[self.patch_h, self.patch_w],
+                act_layer=act_layer,
+                lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
+                norm_layer=norm_layer,
+                init_values=init_values,
+                epsilon=epsilon) for i in range(depth)
+        ])
+
+        assert len(out_indices) <= 4, 'out_indices out of bound'
+        self.out_indices = out_indices
+        # NOTE: weights are loaded before init_fpn() runs, so the FPN layers
+        # do not exist yet when set_state_dict is called.
+        self.pretrained = pretrained
+        self.init_weight()
+
+        self.out_channels = [embed_dim for _ in range(len(out_indices))]
+        # with FPN: the last len(out_indices) of strides 4/8/16/32;
+        # without: every output keeps the patch stride
+        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
+        ]
+        self.norm = Identity()
+        if self.with_fpn:
+            self.init_fpn(
+                embed_dim=embed_dim,
+                patch_size=patch_size,
+                out_with_norm=final_norm)
+
+    def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
+        """Return ``lr_decay_rate ** (self.depth - layer_id)``."""
+        layers_above = self.depth - layer_id
+        return lr_decay_rate**layers_above
+
+    def init_weight(self):
+        """Load ``self.pretrained`` weights, resizing ``pos_embed`` if needed.
+
+        Does nothing when ``self.pretrained`` is empty. A value containing
+        ``'http'`` is downloaded; anything else is used as a local path. The
+        checkpoint's ``pos_embed`` is resized only when ``use_abs_pos_emb``
+        is set and its shape differs from the model's.
+        """
+        weights = self.pretrained
+        if not weights:
+            return
+
+        path = (paddle.utils.download.get_weights_path_from_url(weights)
+                if 'http' in weights else weights)
+
+        load_state_dict = paddle.load(path)
+        model_state_dict = self.state_dict()
+        key = "pos_embed"
+
+        if key in load_state_dict.keys() and self.use_abs_pos_emb:
+            loaded = paddle.to_tensor(load_state_dict[key], dtype="float32")
+            if self.pos_embed.shape != loaded.shape:
+                # assumes the checkpoint grid is square (plus one cls slot)
+                side = int(math.sqrt(loaded.shape[1] - 1))
+                resized = self.resize_pos_embed(loaded, (side, side),
+                                                (self.pos_h, self.pos_w))
+                model_state_dict[key] = resized
+                load_state_dict[key] = resized
+
+                print("Load pos_embed and resize it from {} to {} .".format(
+                    loaded.shape, self.pos_embed.shape))
+
+        self.set_state_dict(load_state_dict)
+        print("Load load_state_dict....")
+
+    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
+        """Create ``fpn1..fpn4`` (for patch size 16 or 8) and ``self.norm``.
+
+        For any other ``patch_size`` no FPN heads are created; only
+        ``self.norm`` is set.
+        """
+
+        def upsample_x2():
+            # 2x2 transposed conv with stride 2: doubles the spatial size
+            return nn.Conv2DTranspose(
+                embed_dim, embed_dim, kernel_size=2, stride=2)
+
+        if patch_size == 16:
+            # x4, x2, x1, x0.5
+            self.fpn1 = nn.Sequential(upsample_x2(),
+                                      nn.BatchNorm2D(embed_dim),
+                                      nn.GELU(), upsample_x2())
+            self.fpn2 = nn.Sequential(upsample_x2())
+            self.fpn3 = Identity()
+            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
+        elif patch_size == 8:
+            # x2, x1, x0.5, x0.25
+            self.fpn1 = nn.Sequential(upsample_x2())
+            self.fpn2 = Identity()
+            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2))
+            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4))
+
+        if out_with_norm:
+            self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)
+        else:
+            self.norm = Identity()
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Interpolate a position-embedding table to a new grid size.
+
+        The first token along axis 1 is split off and re-attached unchanged
+        (presumably the class-token embedding); the remaining tokens are
+        laid out as an ``old_hw`` grid, resized to ``new_hw`` with bicubic
+        interpolation and flattened back.
+
+        Args:
+            pos_embed (Tensor): the pos_embed weight
+            old_hw (list[int]): the height and width of old pos_embed
+            new_hw (list[int]): the height and width of new pos_embed
+        Returns:
+            Tensor: the resized pos_embed weight
+        """
+        leading_token = pos_embed[:, :1, :]
+
+        grid_embed = pos_embed[:, 1:, :].transpose([0, 2, 1])
+        grid_embed = grid_embed.reshape([1, -1, old_hw[0], old_hw[1]])
+        grid_embed = F.interpolate(
+            grid_embed, new_hw, mode='bicubic', align_corners=False)
+        grid_embed = grid_embed.flatten(2).transpose([0, 2, 1])
+
+        return paddle.concat([leading_token, grid_embed], axis=1)
+
+    def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
+        """
+        Build a fixed 2D sine/cosine position embedding for an ``h`` x ``w`` grid.
+
+        Args:
+            h: number of grid rows.
+            w: number of grid columns.
+            temperature (float): base of the frequency progression.
+        Returns:
+            Tensor reshaped to ``[1, h, w, self.embed_dim]``; the channel
+            axis is the concatenation of sin(y), cos(y), sin(x), cos(x).
+        """
+        row_ids = paddle.arange(h, dtype=paddle.float32)
+        col_ids = paddle.arange(w, dtype=paddle.float32)
+        grid_y, grid_x = paddle.meshgrid(row_ids, col_ids)
+        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+
+        # Each of the four sin/cos groups takes a quarter of the channels.
+        pos_dim = self.embed_dim // 4
+        exponents = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        omega = (1. / (temperature**exponents)).unsqueeze(0)
+
+        out_x = grid_x.reshape([-1, 1]).matmul(omega)
+        out_y = grid_y.reshape([-1, 1]).matmul(omega)
+
+        parts = [
+            paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
+            paddle.cos(out_x)
+        ]
+        pos_emb = paddle.concat(parts, axis=1)
+
+        return pos_emb.reshape([1, h, w, self.embed_dim])
+
+    def forward(self, inputs):
+        """
+        Run the backbone on ``inputs['image']`` and return the selected features.
+
+        The patch embedding is moved to channels-last, optional position
+        embeddings are added, and every block is applied in turn. The output
+        of each block whose index is in ``self.out_indices`` is transposed
+        back to channels-first, passed through ``self.norm`` and collected.
+        With ``self.with_fpn`` the i-th collected feature is additionally
+        passed through the i-th of ``fpn1`` .. ``fpn4``.
+
+        Returns:
+            list: the collected feature tensors.
+        """
+        x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])
+        _, Hp, Wp, _ = paddle.shape(x)
+
+        if self.use_abs_pos:
+            x = x + self.get_2d_sincos_position_embedding(Hp, Wp)
+
+        if self.use_abs_pos_emb:
+            # NOTE(review): resize_pos_embed, as defined in this class,
+            # returns a [1, 1 + H*W, C] tensor while x is [B, Hp, Wp, C] -
+            # confirm this branch is exercised / broadcasts as intended.
+            resized_pos_embed = self.resize_pos_embed(
+                self.pos_embed, (self.pos_h, self.pos_w), (Hp, Wp))
+            x = x + resized_pos_embed
+
+        feats = []
+        recompute_kwargs = {"preserve_rng_state": True}
+        for idx, blk in enumerate(self.blocks):
+            if self.use_checkpoint and self.training:
+                # Recompute is only used while training.
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, **recompute_kwargs)
+            else:
+                x = blk(x)
+            if idx in self.out_indices:
+                feats.append(self.norm(x.transpose([0, 3, 1, 2])))
+
+        if self.with_fpn:
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
+            for i, feat in enumerate(feats):
+                feats[i] = fpns[i](feat)
+        return feats
+
+    @property
+    def num_layers(self):
+        """Number of entries in ``self.blocks``."""
+        block_count = len(self.blocks)
+        return block_count
+
+    @property
+    def no_weight_decay(self):
+        """Return the name set ``{'pos_embed', 'cls_token'}``.
+
+        Judging by the property name these are parameters to exclude from
+        weight decay; how the set is consumed is not visible here.
+        """
+        excluded_names = {'pos_embed', 'cls_token'}
+        return excluded_names
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` per (``out_channels``, ``out_strides``) pair."""
+        specs = []
+        for channels, stride in zip(self.out_channels, self.out_strides):
+            specs.append(ShapeSpec(channels=channels, stride=stride))
+        return specs
+
+
+class LayerNorm(nn.Layer):
+    """
+    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
+    variance normalization over the channel dimension for inputs that have shape
+    (batch_size, channels, height, width).
+    Note that the modified LayerNorm is only used in ResBlock and SimpleFeaturePyramid.
+
+    In ViT, we use the nn.LayerNorm
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6):
+        """
+        Args:
+            normalized_shape (int): number of channels (size of axis 1).
+            eps (float): value added to the variance before the square root.
+        """
+        super().__init__()
+        # Without an explicit initializer create_parameter falls back to the
+        # generic non-bias default (a random initializer), which is wrong
+        # for a normalization scale/shift: start as the identity transform.
+        self.weight = self.create_parameter(
+            [normalized_shape],
+            default_initializer=nn.initializer.Constant(1.0))
+        self.bias = self.create_parameter(
+            [normalized_shape],
+            default_initializer=nn.initializer.Constant(0.0))
+        self.eps = eps
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        """Normalize ``x`` over axis 1, then apply the per-channel affine."""
+        u = x.mean(1, keepdim=True)
+        centered = x - u
+        # Biased variance over the channel axis.
+        s = centered.pow(2).mean(1, keepdim=True)
+        x = centered / paddle.sqrt(s + self.eps)
+        # [C] -> [C, 1, 1] so the affine broadcasts over height and width.
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+
+
+@register
+@serializable
+class SimpleFeaturePyramid(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 spatial_scales,
+                 num_levels=4,
+                 use_bias=False):
+        """
+        Build a multi-level feature pyramid from a single input feature map.
+
+        Args:
+            in_channels (list[int]): input channels of each level which can be
+                derived from the output shape of backbone by from_config.
+                Only the first entry is used.
+            out_channels (int): output channel of each level.
+            spatial_scales (list[float]): list of scaling factors to upsample or downsample
+                the input features for creating pyramid features which can be derived from
+                the output shape of backbone by from_config. Only the first
+                entry is used (to name the stages).
+            num_levels (int): number of levels of output features (4 or 5).
+            use_bias (bool): whether use bias or not.
+
+        Raises:
+            NotImplementedError: if ``num_levels`` is neither 4 nor 5.
+        """
+        super(SimpleFeaturePyramid, self).__init__()
+
+        self.in_channels = in_channels[0]
+        self.out_channels = out_channels
+        self.num_levels = num_levels
+
+        self.stages = []
+        # One scale fewer than levels: the last level is produced by
+        # top_block from the coarsest stage output.
+        if num_levels == 4:
+            scale_factors = [2.0, 1.0, 0.5]
+        elif num_levels == 5:
+            scale_factors = [4.0, 2.0, 1.0, 0.5]
+        else:
+            raise NotImplementedError(
+                f"num_levels={num_levels} is not supported yet.")
+
+        dim = self.in_channels
+        for scale in scale_factors:
+            out_dim = dim
+            if scale == 4.0:
+                layers = [
+                    nn.Conv2DTranspose(
+                        dim, dim // 2, kernel_size=2, stride=2),
+                    # Channel-wise LayerNorm defined in this file: the
+                    # tensor is (N, C, H, W) here, and nn.LayerNorm would
+                    # normalize the trailing (width) axis instead of the
+                    # dim // 2 channels.
+                    LayerNorm(dim // 2),
+                    nn.GELU(),
+                    nn.Conv2DTranspose(
+                        dim // 2, dim // 4, kernel_size=2, stride=2),
+                ]
+                out_dim = dim // 4
+            elif scale == 2.0:
+                layers = [
+                    nn.Conv2DTranspose(
+                        dim, dim // 2, kernel_size=2, stride=2)
+                ]
+                out_dim = dim // 2
+            elif scale == 1.0:
+                layers = []
+            elif scale == 0.5:
+                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]
+
+            # Shared tail of every stage: 1x1 projection to out_channels,
+            # then a 3x3 conv, each followed by the channel-wise LayerNorm.
+            layers.extend([
+                nn.Conv2D(
+                    out_dim,
+                    out_channels,
+                    kernel_size=1,
+                    bias_attr=use_bias),
+                LayerNorm(out_channels),
+                nn.Conv2D(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    padding=1,
+                    bias_attr=use_bias),
+                LayerNorm(out_channels),
+            ])
+            layers = nn.Sequential(*layers)
+
+            # Stage name is -log2 of the resulting spatial scale.
+            stage = -int(math.log2(spatial_scales[0] * scale))
+            self.add_sublayer(f"simfp_{stage}", layers)
+            self.stages.append(layers)
+
+        # top block output feature maps.
+        self.top_block = nn.Sequential(
+            nn.MaxPool2D(
+                kernel_size=1, stride=2, padding=0))
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        """Derive ``in_channels`` and ``spatial_scales`` from ``input_shape``."""
+        return {
+            'in_channels': [i.channels for i in input_shape],
+            'spatial_scales': [1.0 / i.stride for i in input_shape],
+        }
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` with ``out_channels`` channels per output level."""
+        return [
+            ShapeSpec(channels=self.out_channels)
+            for _ in range(self.num_levels)
+        ]
+
+    def forward(self, feats):
+        """
+        Args:
+            feats: sequence of feature tensors; only ``feats[0]`` is used.
+        Returns:
+            list: one tensor per stage plus the ``top_block`` output,
+            ``num_levels`` entries in total.
+        """
+        features = feats[0]
+        results = [stage(features) for stage in self.stages]
+
+        # The extra coarsest level is derived from the last stage output.
+        results.append(self.top_block(results[-1]))
+        assert self.num_levels == len(results)
+
+        return results
--- a/rtdetr_paddle/ppdet/modeling/bbox_utils.py
+++ b/rtdetr_paddle/ppdet/modeling/bbox_utils.py
@@ -0,0 +1,607 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import numpy as np
+
+
+def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
+    """Encode target boxes as weighted (dx, dy, dw, dh) deltas w.r.t. source boxes.
+
+    Both inputs are indexed as ``[:, 0..3]`` = x1, y1, x2, y2. The result
+    stacks dx, dy, dw, dh along axis 1.
+    """
+
+    def size_and_center(boxes):
+        # width, height, center-x, center-y of corner-format boxes
+        width = boxes[:, 2] - boxes[:, 0]
+        height = boxes[:, 3] - boxes[:, 1]
+        center_x = boxes[:, 0] + 0.5 * width
+        center_y = boxes[:, 1] + 0.5 * height
+        return width, height, center_x, center_y
+
+    src_w, src_h, src_ctr_x, src_ctr_y = size_and_center(src_boxes)
+    tgt_w, tgt_h, tgt_ctr_x, tgt_ctr_y = size_and_center(tgt_boxes)
+
+    wx, wy, ww, wh = weights
+    # Center offsets are relative to the source size; sizes use a log ratio.
+    dx = wx * (tgt_ctr_x - src_ctr_x) / src_w
+    dy = wy * (tgt_ctr_y - src_ctr_y) / src_h
+    dw = ww * paddle.log(tgt_w / src_w)
+    dh = wh * paddle.log(tgt_h / src_h)
+
+    return paddle.stack((dx, dy, dw, dh), axis=1)
+
+
+def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
+    """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.
+
+    ``deltas`` is read in strides of four columns (dx, dy, dw, dh), each
+    divided by the matching entry of ``weights``. When ``max_shape`` is
+    given, x coordinates are clipped to ``[0, max_shape[1]]`` and y
+    coordinates to ``[0, max_shape[0]]``.
+
+    Note: return tensor shape [n,1,4]
+        If you want to add a reshape, please add after the calling code instead of here.
+    """
+    max_log_scale = math.log(1000.0 / 16)
+
+    box_w = boxes[:, 2] - boxes[:, 0]
+    box_h = boxes[:, 3] - boxes[:, 1]
+    box_cx = boxes[:, 0] + 0.5 * box_w
+    box_cy = boxes[:, 1] + 0.5 * box_h
+
+    wx, wy, ww, wh = weights
+    dx = deltas[:, 0::4] / wx
+    dy = deltas[:, 1::4] / wy
+    # Prevent sending too large values into paddle.exp()
+    dw = paddle.clip(deltas[:, 2::4] / ww, max=max_log_scale)
+    dh = paddle.clip(deltas[:, 3::4] / wh, max=max_log_scale)
+
+    w_col = box_w.unsqueeze(1)
+    h_col = box_h.unsqueeze(1)
+    pred_ctr_x = dx * w_col + box_cx.unsqueeze(1)
+    pred_ctr_y = dy * h_col + box_cy.unsqueeze(1)
+    half_w = 0.5 * (paddle.exp(dw) * w_col)
+    half_h = 0.5 * (paddle.exp(dh) * h_col)
+
+    pred_boxes = paddle.stack(
+        [
+            pred_ctr_x - half_w,
+            pred_ctr_y - half_h,
+            pred_ctr_x + half_w,
+            pred_ctr_y + half_h,
+        ],
+        axis=-1)
+
+    if max_shape is not None:
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
+    return pred_boxes
+
+
+def bbox2delta_v2(src_boxes,
+                  tgt_boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0]):
+    """Encode bboxes to deltas.
+    Modified from bbox2delta(): instead of multiplying by weights, the
+    stacked (dx, dy, dw, dh) deltas are normalized as
+    ``(deltas - delta_mean) / delta_std``.
+    """
+
+    def size_and_center(boxes):
+        # width, height, center-x, center-y of corner-format boxes
+        width = boxes[:, 2] - boxes[:, 0]
+        height = boxes[:, 3] - boxes[:, 1]
+        center_x = boxes[:, 0] + 0.5 * width
+        center_y = boxes[:, 1] + 0.5 * height
+        return width, height, center_x, center_y
+
+    src_w, src_h, src_ctr_x, src_ctr_y = size_and_center(src_boxes)
+    tgt_w, tgt_h, tgt_ctr_x, tgt_ctr_y = size_and_center(tgt_boxes)
+
+    dx = (tgt_ctr_x - src_ctr_x) / src_w
+    dy = (tgt_ctr_y - src_ctr_y) / src_h
+    dw = paddle.log(tgt_w / src_w)
+    dh = paddle.log(tgt_h / src_h)
+
+    raw_deltas = paddle.stack((dx, dy, dw, dh), axis=1)
+    mean = paddle.to_tensor(delta_mean)
+    std = paddle.to_tensor(delta_std)
+    return (raw_deltas - mean) / std
+
+
+def delta2bbox_v2(deltas,
+                  boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0],
+                  max_shape=None,
+                  ctr_clip=32.0):
+    """Decode deltas to bboxes.
+    Modified from delta2bbox(): deltas are de-normalized with
+    ``deltas * delta_std + delta_mean`` instead of being divided by weights.
+    Used in YOLOFHead.
+
+    With ``ctr_clip`` set, the center shifts are clipped to
+    ``[-ctr_clip, ctr_clip]`` and dw/dh only get an upper bound; with
+    ``ctr_clip=None`` dw/dh are clipped on both sides and the center shifts
+    are left untouched.
+
+    Note: return tensor shape [n,1,4]
+        If you want to add a reshape, please add after the calling code instead of here.
+    """
+    max_log_scale = math.log(1000.0 / 16)
+
+    box_w = boxes[:, 2] - boxes[:, 0]
+    box_h = boxes[:, 3] - boxes[:, 1]
+    box_cx = boxes[:, 0] + 0.5 * box_w
+    box_cy = boxes[:, 1] + 0.5 * box_h
+    w_col = box_w.unsqueeze(1)
+    h_col = box_h.unsqueeze(1)
+
+    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
+    # Center shifts are scaled by the box size before clipping.
+    dx = deltas[:, 0::4] * w_col
+    dy = deltas[:, 1::4] * h_col
+    dw = deltas[:, 2::4]
+    dh = deltas[:, 3::4]
+
+    # Prevent sending too large values into paddle.exp()
+    if ctr_clip is None:
+        dw = dw.clip(min=-max_log_scale, max=max_log_scale)
+        dh = dh.clip(min=-max_log_scale, max=max_log_scale)
+    else:
+        dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)
+        dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)
+        dw = paddle.clip(dw, max=max_log_scale)
+        dh = paddle.clip(dh, max=max_log_scale)
+
+    pred_ctr_x = dx + box_cx.unsqueeze(1)
+    pred_ctr_y = dy + box_cy.unsqueeze(1)
+    half_w = 0.5 * (paddle.exp(dw) * w_col)
+    half_h = 0.5 * (paddle.exp(dh) * h_col)
+
+    pred_boxes = paddle.stack(
+        [
+            pred_ctr_x - half_w,
+            pred_ctr_y - half_h,
+            pred_ctr_x + half_w,
+            pred_ctr_y + half_h,
+        ],
+        axis=-1)
+
+    if max_shape is not None:
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
+    return pred_boxes
+
+
+def expand_bbox(bboxes, scale):
+    """Scale boxes about their centers.
+
+    Args:
+        bboxes: array indexed as ``[:, 0..3]`` = x1, y1, x2, y2.
+        scale: factor applied to each box's half width and half height.
+    Returns:
+        float32 ndarray of ``bboxes.shape`` holding the expanded boxes.
+    """
+    center_x = (bboxes[:, 2] + bboxes[:, 0]) * .5
+    center_y = (bboxes[:, 3] + bboxes[:, 1]) * .5
+    half_w = (bboxes[:, 2] - bboxes[:, 0]) * .5
+    half_h = (bboxes[:, 3] - bboxes[:, 1]) * .5
+
+    half_w *= scale
+    half_h *= scale
+
+    expanded = np.zeros(bboxes.shape, dtype=np.float32)
+    expanded[:, 0] = center_x - half_w
+    expanded[:, 1] = center_y - half_h
+    expanded[:, 2] = center_x + half_w
+    expanded[:, 3] = center_y + half_h
+
+    return expanded
+
+
+def clip_bbox(boxes, im_shape):
+    """Clamp x coordinates to ``[0, im_shape[1]]`` and y coordinates to ``[0, im_shape[0]]``.
+
+    ``boxes`` is indexed as ``[:, 0..3]`` = x1, y1, x2, y2; the clipped
+    columns are stacked back along axis 1.
+    """
+    h, w = im_shape[0], im_shape[1]
+    # Upper bound per column, in x1, y1, x2, y2 order.
+    upper_bounds = (w, h, w, h)
+    clipped = [
+        boxes[:, col].clip(0, bound) for col, bound in enumerate(upper_bounds)
+    ]
+    return paddle.stack(clipped, axis=1)
+
+
+def nonempty_bbox(boxes, min_size=0, return_mask=False):
+    """Select boxes whose width and height both exceed ``min_size``.
+
+    Returns the boolean mask when ``return_mask`` is True, otherwise the
+    flattened indices of the boxes that pass.
+    """
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    mask = paddle.logical_and(heights > min_size, widths > min_size)
+    return mask if return_mask else paddle.nonzero(mask).flatten()
+
+
+def bbox_area(boxes):
+    """Return width * height for boxes indexed as ``[:, 0..3]`` = x1, y1, x2, y2."""
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    return widths * heights
+
+
+def bbox_overlaps(boxes1, boxes2):
+    """
+    Calculate pairwise IoU between boxes1 and boxes2
+
+    Args:
+        boxes1 (Tensor): boxes with shape [M, 4]
+        boxes2 (Tensor): boxes with shape [N, 4]
+
+    Return:
+        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N];
+            all zeros when either input has no boxes.
+    """
+    num_rows = boxes1.shape[0]
+    num_cols = boxes2.shape[0]
+    if num_rows * num_cols == 0:
+        return paddle.zeros([num_rows, num_cols], dtype='float32')
+
+    area1 = bbox_area(boxes1)
+    area2 = bbox_area(boxes2)
+
+    # [M, 1, 4] against [N, 4] broadcasts to every (row, col) pair.
+    boxes1_col = paddle.unsqueeze(boxes1, 1)
+    xy_max = paddle.minimum(boxes1_col[:, :, 2:], boxes2[:, 2:])
+    xy_min = paddle.maximum(boxes1_col[:, :, :2], boxes2[:, :2])
+    width_height = (xy_max - xy_min).clip(min=0)
+    inter = width_height.prod(axis=2)
+
+    union = paddle.unsqueeze(area1, 1) + area2 - inter
+    # Pairs without intersection get exactly 0.
+    overlaps = paddle.where(inter > 0, inter / union,
+                            paddle.zeros_like(inter))
+    return overlaps
+
+
+def batch_bbox_overlaps(bboxes1,
+                        bboxes2,
+                        mode='iou',
+                        is_aligned=False,
+                        eps=1e-6):
+    """Calculate overlap between two set of bboxes.
+    If ``is_aligned `` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+
+    NOTE: the computation below indexes the inputs as 2-D ``[m, 4]`` /
+    ``[n, 4]`` tensors (``bboxes[:, 0]`` .. ``bboxes[:, 3]``); leading batch
+    dimensions are only honoured on the empty-input early return.
+
+    Args:
+        bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
+        bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
+            If ``is_aligned `` is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union), "iof" (intersection over
+            foreground) or "giou".
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): lower bound applied to the denominators for
+            numerical stability. Default 1e-6.
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,).
+        For "iou"/"iof" the overlap ratio; for "giou" the value
+        ``1 - giou``. If either input has no boxes, a tensor filled with 1.
+    """
+    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
+    # Either the boxes are empty or the length of boxes's last dimension is 4
+    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
+    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    # Tensor.shape may be a list; normalise it so it can be concatenated
+    # with the row/col counts below (list + tuple raises TypeError).
+    batch_shape = list(bboxes1.shape[:-2])
+
+    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
+    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        if is_aligned:
+            return paddle.full(batch_shape + [rows], 1)
+        else:
+            return paddle.full(batch_shape + [rows, cols], 1)
+
+    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
+    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
+
+    if is_aligned:
+        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
+        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
+
+        wh = (rb - lt).clip(min=0)  # [rows, 2]
+        overlap = wh[:, 0] * wh[:, 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
+            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
+    else:
+        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
+                            bboxes2[:, :2])  # [rows, cols, 2]
+        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
+                            bboxes2[:, 2:])  # [rows, cols, 2]
+
+        wh = (rb - lt).clip(min=0)  # [rows, cols, 2]
+        overlap = wh[:, :, 0] * wh[:, :, 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1.reshape([rows,1]) \
+                    + area2.reshape([1,cols]) - overlap
+        else:
+            union = area1[:, None]
+        if mode == 'giou':
+            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
+                                         bboxes2[:, :2])
+            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
+                                         bboxes2[:, 2:])
+
+    eps = paddle.to_tensor([eps])
+    union = paddle.maximum(union, eps)
+    ious = overlap / union
+    if mode in ['iou', 'iof']:
+        return ious
+    # calculate gious
+    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+    # Index the last axis with an ellipsis so that both the aligned
+    # ([rows, 2]) and the pairwise ([rows, cols, 2]) layouts work.
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+    enclose_area = paddle.maximum(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return 1 - gious
+
+
+def xywh2xyxy(box):
+    """Convert a (center x, center y, width, height) box to [x1, y1, x2, y2]."""
+    center_x, center_y, width, height = box
+    left = center_x - width * 0.5
+    top = center_y - height * 0.5
+    right = center_x + width * 0.5
+    bottom = center_y + height * 0.5
+    return [left, top, right, bottom]
+
+
+def make_grid(h, w, dtype):
+    """Return an ``h`` x ``w`` grid of (x, y) index pairs cast to ``dtype``.
+
+    The x and y index maps are stacked along axis 2, x first.
+    """
+    row_ids = paddle.arange(h)
+    col_ids = paddle.arange(w)
+    yv, xv = paddle.meshgrid([row_ids, col_ids])
+    grid = paddle.stack((xv, yv), 2)
+    return grid.cast(dtype=dtype)
+
+
+def decode_yolo(box, anchor, downsample_ratio):
+    """decode yolo box
+
+    Args:
+        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
+        anchor (list): anchor with the shape [na, 2]
+        downsample_ratio (int): downsample ratio of the feature map
+
+    Return:
+        box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]
+    """
+    x, y, w, h = box
+    na, grid_h, grid_w = x.shape[1:4]
+
+    grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
+    grid_x = grid[:, :, :, :, 0:1]
+    grid_y = grid[:, :, :, :, 1:2]
+    # Cell offset plus cell index, divided by the grid size.
+    x1 = (x + grid_x) / grid_w
+    y1 = (y + grid_y) / grid_h
+
+    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
+    anchor = anchor.reshape((1, na, 1, 1, 2))
+    anchor_w = anchor[:, :, :, :, 0:1]
+    anchor_h = anchor[:, :, :, :, 1:2]
+    # exp(size) * anchor, divided by downsample_ratio * grid size.
+    w1 = paddle.exp(w) * anchor_w / (downsample_ratio * grid_w)
+    h1 = paddle.exp(h) * anchor_h / (downsample_ratio * grid_h)
+
+    return [x1, y1, w1, h1]
+
+
+def batch_iou_similarity(box1, box2, eps=1e-9):
+    """Calculate iou of box1 and box2 in batch
+
+    Args:
+        box1 (Tensor): box with the shape [N, M1, 4]
+        box2 (Tensor): box with the shape [N, M2, 4]
+        eps (float): added to the union to avoid division by zero
+
+    Return:
+        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
+    """
+    # Insert singleton axes so every box1/box2 pair broadcasts.
+    pred = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
+    gt = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
+
+    pred_min, pred_max = pred[:, :, :, 0:2], pred[:, :, :, 2:4]
+    gt_min, gt_max = gt[:, :, :, 0:2], gt[:, :, :, 2:4]
+
+    inter_min = paddle.maximum(pred_min, gt_min)
+    inter_max = paddle.minimum(pred_max, gt_max)
+    overlap = (inter_max - inter_min).clip(0).prod(-1)
+
+    pred_area = (pred_max - pred_min).clip(0).prod(-1)
+    gt_area = (gt_max - gt_min).clip(0).prod(-1)
+    union = pred_area + gt_area - overlap + eps
+    return overlap / union
+
+
+def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
+    """calculate the iou of box1 and box2
+
+    Args:
+        box1 (list): four tensors unpacked as [x1, y1, x2, y2] (corner
+            format); per the original documentation each has the shape
+            [b, na, h, w, 1]
+        box2 (list): same layout as box1
+        giou (bool): whether use giou or not, default False
+        diou (bool): whether use diou or not, default False
+        ciou (bool): whether use ciou or not, default False
+        eps (float): epsilon to avoid divide by zero
+
+    Return:
+        iou (Tensor): iou of box1 and box2 (or the selected variant;
+            ``giou`` takes precedence, then ``diou``, then ``ciou``)
+    """
+    px1, py1, px2, py2 = box1
+    gx1, gy1, gx2, gy2 = box2
+
+    inter_x1 = paddle.maximum(px1, gx1)
+    inter_y1 = paddle.maximum(py1, gy1)
+    inter_x2 = paddle.minimum(px2, gx2)
+    inter_y2 = paddle.minimum(py2, gy2)
+    overlap = ((inter_x2 - inter_x1).clip(0)) * ((inter_y2 - inter_y1).clip(0))
+
+    area1 = ((px2 - px1) * (py2 - py1)).clip(0)
+    area2 = ((gx2 - gx1) * (gy2 - gy1)).clip(0)
+
+    union = area1 + area2 - overlap + eps
+    iou = overlap / union
+
+    if not (giou or ciou or diou):
+        return iou
+
+    # convex w, h
+    cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
+    ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
+    if giou:
+        c_area = cw * ch + eps
+        return iou - (c_area - union) / c_area
+
+    # convex diagonal squared
+    c2 = cw**2 + ch**2 + eps
+    # center distance
+    rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
+    if diou:
+        return iou - rho2 / c2
+
+    # ciou: additional aspect-ratio term
+    w1, h1 = px2 - px1, py2 - py1 + eps
+    w2, h2 = gx2 - gx1, gy2 - gy1 + eps
+    delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
+    v = (4 / math.pi**2) * paddle.pow(delta, 2)
+    alpha = v / (1 + eps - iou + v)
+    # alpha is excluded from gradient computation.
+    alpha.stop_gradient = True
+    return iou - (rho2 / c2 + v * alpha)
+
+
+def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
+    """
+    Calculate the pairwise iou of box1 and box2 with numpy.
+
+    Args:
+        box1 (ndarray): [N, 4]
+        box2 (ndarray): [M, 4], usually N != M
+        x1y1x2y2 (bool): whether in x1y1x2y2 style (otherwise cxcywh), default True
+        eps (float): epsilon to avoid divide by zero
+    Return:
+        iou (ndarray): iou of box1 and box2, [N, M]
+    """
+
+    def corners(box):
+        # Return x1, y1, x2, y2 columns for either input format.
+        if x1y1x2y2:
+            return box[:, 0], box[:, 1], box[:, 2], box[:, 3]
+        half_w, half_h = box[:, 2] / 2, box[:, 3] / 2
+        return (box[:, 0] - half_w, box[:, 1] - half_h, box[:, 0] + half_w,
+                box[:, 1] + half_h)
+
+    b1_x1, b1_y1, b1_x2, b1_y2 = corners(box1)
+    b2_x1, b2_y1, b2_x2, b2_y2 = corners(box2)
+
+    # Intersection rectangle for every (box1, box2) pair, kept in float32.
+    inter_x1 = np.maximum(b1_x1[:, None], b2_x1[None, :]).astype(np.float32)
+    inter_y1 = np.maximum(b1_y1[:, None], b2_y1[None, :]).astype(np.float32)
+    inter_x2 = np.minimum(b1_x2[:, None], b2_x2[None, :]).astype(np.float32)
+    inter_y2 = np.minimum(b1_y2[:, None], b2_y2[None, :]).astype(np.float32)
+
+    # Intersection area
+    inter_area = np.maximum(inter_x2 - inter_x1, 0) * np.maximum(
+        inter_y2 - inter_y1, 0)
+
+    # Union area: [N, 1] + [1, M] broadcasts to [N, M]
+    b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1)
+    b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1)
+
+    return inter_area / (b1_area + b2_area - inter_area + eps)
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+    """Encode bounding boxes as distances from points to the four box sides.
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        bbox (Tensor): Shape (n, 4), "xyxy" format
+        max_dis (float): Upper bound of the distance.
+        eps (float): a small value to ensure target < max_dis, instead <=
+    Returns:
+        Tensor: (left, top, right, bottom) distances stacked on the last
+        axis, clipped to ``[0, max_dis - eps]`` when ``max_dis`` is given.
+    """
+    point_x, point_y = points[:, 0], points[:, 1]
+    sides = [
+        point_x - bbox[:, 0],  # left
+        point_y - bbox[:, 1],  # top
+        bbox[:, 2] - point_x,  # right
+        bbox[:, 3] - point_y,  # bottom
+    ]
+    if max_dis is not None:
+        sides = [side.clip(min=0, max=max_dis - eps) for side in sides]
+    return paddle.stack(sides, -1)
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+        Args:
+            points (Tensor): Shape (n, 2), [x, y].
+            distance (Tensor): Distance from the given point to 4
+                boundaries (left, top, right, bottom).
+            max_shape (tuple): Shape of the image, indexed as (h, w); x is
+                clipped to ``[0, max_shape[1]]`` and y to ``[0, max_shape[0]]``.
+        Returns:
+            Tensor: Decoded bboxes, (x1, y1, x2, y2) on the last axis.
+        """
+    point_x, point_y = points[:, 0], points[:, 1]
+    corners = [
+        point_x - distance[:, 0],  # x1
+        point_y - distance[:, 1],  # y1
+        point_x + distance[:, 2],  # x2
+        point_y + distance[:, 3],  # y2
+    ]
+    if max_shape is not None:
+        # Upper bound per corner: width for x, height for y.
+        bounds = (max_shape[1], max_shape[0], max_shape[1], max_shape[0])
+        corners = [
+            corner.clip(min=0, max=bound)
+            for corner, bound in zip(corners, bounds)
+        ]
+    return paddle.stack(corners, -1)
+
+
+def bbox_center(boxes):
+    """Get bbox centers from boxes.
+    Args:
+        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
+    Returns:
+        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
+    """
+    x_min, y_min = boxes[..., 0], boxes[..., 1]
+    x_max, y_max = boxes[..., 2], boxes[..., 3]
+    center_x = (x_min + x_max) / 2
+    center_y = (y_min + y_max) / 2
+    return paddle.stack([center_x, center_y], axis=-1)
+
+
+def batch_distance2bbox(points, distance, max_shapes=None):
+    """Decode distance prediction to bounding box for batch.
+    Args:
+        points (Tensor): [B, ..., 2], "xy" format
+        distance (Tensor): [B, ..., 4], "ltrb" format
+        max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.
+    Returns:
+        Tensor: Decoded bboxes, "x1y1x2y2" format, limited to
+        ``[0, (w, h, w, h)]`` when ``max_shapes`` is given.
+    """
+    left_top, right_bottom = paddle.split(distance, 2, -1)
+    # while tensor add parameters, parameters should be better placed on the second place
+    top_left_xy = -left_top + points
+    bottom_right_xy = right_bottom + points
+    out_bbox = paddle.concat([top_left_xy, bottom_right_xy], -1)
+    if max_shapes is None:
+        return out_bbox
+
+    # (h, w) -> (w, h, w, h) so the bound lines up with x1, y1, x2, y2.
+    max_shapes = max_shapes.flip(-1).tile([1, 2])
+    # Insert axes after the batch axis until the ranks match.
+    missing_dims = out_bbox.ndim - max_shapes.ndim
+    for _ in range(missing_dims):
+        max_shapes.unsqueeze_(1)
+    out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
+    out_bbox = paddle.where(out_bbox > 0, out_bbox,
+                            paddle.zeros_like(out_bbox))
+    return out_bbox
+
+
+def iou_similarity(box1, box2, eps=1e-10):
+    """Calculate iou of box1 and box2
+
+    Args:
+        box1 (Tensor): box with the shape [M1, 4]
+        box2 (Tensor): box with the shape [M2, 4]
+        eps (float): added to the union to avoid division by zero
+
+    Return:
+        iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
+    """
+    # Insert singleton axes so every box1/box2 pair broadcasts.
+    pred = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]
+    gt = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]
+
+    pred_min, pred_max = pred[:, :, 0:2], pred[:, :, 2:4]
+    gt_min, gt_max = gt[:, :, 0:2], gt[:, :, 2:4]
+
+    inter_min = paddle.maximum(pred_min, gt_min)
+    inter_max = paddle.minimum(pred_max, gt_max)
+    overlap = (inter_max - inter_min).clip(0).prod(-1)
+
+    pred_area = (pred_max - pred_min).clip(0).prod(-1)
+    gt_area = (gt_max - gt_min).clip(0).prod(-1)
+    union = pred_area + gt_area - overlap + eps
+    return overlap / union
--- a/rtdetr_paddle/ppdet/modeling/cls_utils.py
+++ b/rtdetr_paddle/ppdet/modeling/cls_utils.py
@@ -0,0 +1,40 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _get_class_default_kwargs(cls, *args, **kwargs):
+    """
+    Get default arguments of a class in dict format, if args and
+    kwargs is specified, it will replace default arguments
+
+    Args:
+        cls (type): class whose ``__init__`` is a plain Python function;
+            every parameter after ``self`` must declare a default value.
+        *args: positional overrides, applied to the parameters in order.
+        **kwargs: keyword overrides; keys that are not ``__init__``
+            parameters are added to the result as-is.
+
+    Returns:
+        dict: parameter name -> default value, with the overrides applied.
+
+    Raises:
+        AssertionError: if the first parameter is not ``self`` or if some
+            parameter has no default value.
+    """
+    init_code = cls.__init__.__code__
+    keys = init_code.co_varnames[:init_code.co_argcount]
+    assert keys[0] == 'self'
+    keys = keys[1:]
+
+    # ``__defaults__`` is None (not an empty tuple) when ``__init__`` declares
+    # no default values, e.g. ``def __init__(self)``.
+    values = list(cls.__init__.__defaults__ or ())
+    assert len(values) == len(keys)
+
+    for i, arg in enumerate(args):
+        values[i] = arg
+
+    default_kwargs = dict(zip(keys, values))
+    default_kwargs.update(kwargs)
+
+    return default_kwargs
--- a/rtdetr_paddle/ppdet/modeling/heads/init.py
+++ b/rtdetr_paddle/ppdet/modeling/heads/init.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .detr_head import *
+
--- a/rtdetr_paddle/ppdet/modeling/heads/detr_head.py
+++ b/rtdetr_paddle/ppdet/modeling/heads/detr_head.py
@@ -0,0 +1,534 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from ..initializer import linear_init_, constant_
+from ..transformers.utils import inverse_sigmoid
+
+import pycocotools.mask as mask_util
+
+__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead']
+
+
+class MLP(nn.Layer):
+    """Stack of linear layers with ReLU between them (none after the last).
+
+    This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/detr.py
+
+    Args:
+        input_dim (int): input width of the first linear layer
+        hidden_dim (int): width of every intermediate layer
+        output_dim (int): output width of the last linear layer
+        num_layers (int): total number of linear layers
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        hidden = [hidden_dim] * (num_layers - 1)
+        in_dims = [input_dim] + hidden
+        out_dims = hidden + [output_dim]
+        self.layers = nn.LayerList(
+            [nn.Linear(n_in, n_out) for n_in, n_out in zip(in_dims, out_dims)])
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for linear in self.layers:
+            linear_init_(linear)
+
+    def forward(self, x):
+        last = self.num_layers - 1
+        for idx, layer in enumerate(self.layers):
+            x = layer(x)
+            if idx < last:
+                x = F.relu(x)
+        return x
+
+
+class MultiHeadAttentionMap(nn.Layer):
+    """This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
+
+        This is a 2D attention module, which only returns the attention softmax (no multiplication by value)
+
+    Args:
+        query_dim (int): input width of the query projection and input
+            channels of the key projection
+        hidden_dim (int): output width of both projections
+        num_heads (int): number of attention heads
+        dropout (float): dropout rate applied to the attention weights
+        bias (bool): whether the two projections carry a bias
+    """
+
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
+                 bias=True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+
+        xavier = paddle.nn.initializer.XavierUniform()
+        weight_attr = paddle.ParamAttr(initializer=xavier)
+        if bias:
+            bias_attr = paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Constant())
+        else:
+            bias_attr = False
+
+        # queries use a linear projection, keys a 1x1 convolution
+        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
+        self.k_proj = nn.Conv2D(
+            query_dim,
+            hidden_dim,
+            1,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+        # 1 / sqrt(per-head width)
+        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5
+
+    def forward(self, q, k, mask=None):
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        bs, num_queries = q.shape[0], q.shape[1]
+        heads = self.num_heads
+        head_dim = self.hidden_dim // heads
+        h, w = k.shape[-2], k.shape[-1]
+
+        # equivalent to
+        # paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
+        qh = q.reshape([bs, num_queries, heads, head_dim])
+        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, head_dim])
+        kh = k.reshape([bs, heads, head_dim, h, w])
+        kh = kh.reshape([-1, head_dim, h * w])
+        weights = paddle.bmm(qh * self.normalize_fact, kh)
+        weights = weights.reshape([bs, heads, num_queries, h, w])
+        weights = weights.transpose([0, 2, 1, 3, 4])
+
+        if mask is not None:
+            weights += mask
+        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
+        # the softmax runs over the flattened trailing axes of each query
+        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
+        return self.dropout(weights)
+
+
+class MaskHeadFPNConv(nn.Layer):
+    """This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
+
+        Simple convolutional head, using group norm.
+        Upsampling is done using a FPN approach
+
+    Args:
+        input_dim (int): channels of the tensor produced by concatenating
+            the input features with the attention maps in ``forward``
+        fpn_dims (list[int]): channels of each FPN feature given to ``forward``
+        context_dim (int): base width; the i-th intermediate stage outputs
+            ``context_dim // 2**i`` channels (i = 1..4)
+        num_groups (int): number of groups of every GroupNorm layer
+    """
+
+    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
+        super().__init__()
+
+        inter_dims = [input_dim,
+                      ] + [context_dim // (2**i) for i in range(1, 5)]
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.KaimingUniform())
+        bias_attr = paddle.framework.ParamAttr(
+            initializer=paddle.nn.initializer.Constant())
+
+        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
+                                       weight_attr, bias_attr)
+        self.conv_inter = nn.LayerList()
+        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
+            self.conv_inter.append(
+                self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,
+                                  bias_attr))
+
+        # single-channel mask logits
+        self.conv_out = nn.Conv2D(
+            inter_dims[-1],
+            1,
+            3,
+            padding=1,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+        # 1x1 convs mapping each FPN feature to the width of the stage it is
+        # added to
+        self.adapter = nn.LayerList()
+        for i in range(len(fpn_dims)):
+            self.adapter.append(
+                nn.Conv2D(
+                    fpn_dims[i],
+                    inter_dims[i + 1],
+                    1,
+                    weight_attr=weight_attr,
+                    bias_attr=bias_attr))
+
+    def _make_layers(self,
+                     in_dims,
+                     out_dims,
+                     kernel_size,
+                     num_groups,
+                     weight_attr=None,
+                     bias_attr=None):
+        """Build a Conv2D -> GroupNorm -> ReLU stage with 'same' padding."""
+        return nn.Sequential(
+            nn.Conv2D(
+                in_dims,
+                out_dims,
+                kernel_size,
+                padding=kernel_size // 2,
+                weight_attr=weight_attr,
+                bias_attr=bias_attr),
+            nn.GroupNorm(num_groups, out_dims),
+            nn.ReLU())
+
+    @staticmethod
+    def _expand(tensor, length):
+        """Repeat every batch element ``length`` times, batch-major.
+
+        [B, C, H, W] -> [B * length, C, H, W], ordered b0, b0, ..., b1, b1, ...
+        This is the order produced by ``flatten(0, 1)`` on a [B, length, ...]
+        tensor. A plain ``tile([length, 1, 1, 1])`` interleaves the batch
+        (b0, b1, b0, b1, ...) and would pair features with the attention maps
+        of a different image whenever B > 1.
+        """
+        return tensor.unsqueeze(1).tile([1, length, 1, 1, 1]).flatten(0, 1)
+
+    def forward(self, x, bbox_attention_map, fpns):
+        """Predict one mask logit map per (image, query) pair.
+
+        Args:
+            x (Tensor): image features; assumed [B, C, H, W] since they are
+                fed to Conv2D layers -- TODO confirm against the caller
+            bbox_attention_map (Tensor): attention maps whose first two axes
+                are (batch, query); they are merged before concatenation
+            fpns (list[Tensor]): FPN features, one per adapter layer
+
+        Returns:
+            Tensor: output of the final conv, with B * num_queries on axis 0
+                and a single channel.
+        """
+        num_queries = bbox_attention_map.shape[1]
+        x = paddle.concat([
+            self._expand(x, num_queries),
+            bbox_attention_map.flatten(0, 1)
+        ], 1)
+        x = self.conv0(x)
+        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
+                                                    self.adapter, fpns):
+            feat = self._expand(adapter_layer(feat), num_queries)
+            x = inter_layer(x)
+            # resize to the FPN feature's resolution before the residual add
+            x = feat + F.interpolate(x, size=feat.shape[-2:])
+
+        x = self.conv_inter[-1](x)
+        x = self.conv_out(x)
+        return x
+
+
+@register
+class DETRHead(nn.Layer):
+    """Head turning transformer decoder features into class logits and boxes.
+
+    A linear layer produces the class logits and an MLP followed by a sigmoid
+    produces 4 box values per query. When ``with_mask_head`` is set, an
+    attention-map module and an FPN-style conv head additionally produce one
+    mask per query.
+
+    Args:
+        num_classes (int): number of object classes; one extra (background)
+            class is added unless ``use_focal_loss`` is True
+        hidden_dim (int): width of the decoder features
+        nhead (int): number of heads of the mask attention module
+        num_mlp_layers (int): number of linear layers of the box MLP
+        loss: loss callable invoked in training mode
+        fpn_dims (list[int]): channels of the FPN features used by the mask head
+        with_mask_head (bool): whether to build and run the mask branch
+        use_focal_loss (bool): see ``num_classes``
+    """
+    # NOTE(review): presumably read by ppdet's `register` machinery to share /
+    # inject config values -- confirm in ppdet.core.workspace
+    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_mlp_layers=3,
+                 loss='DETRLoss',
+                 fpn_dims=[1024, 512, 256],
+                 with_mask_head=False,
+                 use_focal_loss=False):
+        super(DETRHead, self).__init__()
+        # add background class
+        self.num_classes = num_classes if use_focal_loss else num_classes + 1
+        self.hidden_dim = hidden_dim
+        self.loss = loss
+        self.with_mask_head = with_mask_head
+        self.use_focal_loss = use_focal_loss
+
+        self.score_head = nn.Linear(hidden_dim, self.num_classes)
+        self.bbox_head = MLP(hidden_dim,
+                             hidden_dim,
+                             output_dim=4,
+                             num_layers=num_mlp_layers)
+        if self.with_mask_head:
+            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
+                                                        nhead)
+            # the mask head input is the features concatenated with one
+            # attention map per head, hence hidden_dim + nhead channels
+            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
+                                             hidden_dim)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Re-initialise the classification layer (the MLP initialises itself)."""
+        linear_init_(self.score_head)
+
+    @classmethod
+    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
+        """Derive constructor kwargs from the upstream modules.
+
+        ``fpn_dims`` is the channel list of ``input_shape`` in reverse order
+        with the first (i.e. originally last) entry dropped.
+        """
+
+        return {
+            'hidden_dim': hidden_dim,
+            'nhead': nhead,
+            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
+        }
+
+    @staticmethod
+    def get_gt_mask_from_polygons(gt_poly, pad_mask):
+        """Rasterise ground-truth polygons into padded per-image mask tensors.
+
+        Args:
+            gt_poly: per-image lists of per-object polygons, in the form
+                accepted by ``pycocotools.mask.frPyObjects``
+            pad_mask (Tensor): per-image 2D padding masks stacked on axis 0;
+                the sums of the first column / first row are used as the
+                unpadded height / width
+
+        Returns:
+            list[Tensor]: one float32 tensor per image with one mask per
+                object, zero-padded to the spatial size of ``pad_mask``.
+        """
+        out_gt_mask = []
+        for polygons, padding in zip(gt_poly, pad_mask):
+            # unpadded image size recovered from the padding mask
+            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
+            masks = []
+            for obj_poly in polygons:
+                # polygons -> RLEs -> one merged RLE -> binary mask
+                rles = mask_util.frPyObjects(obj_poly, height, width)
+                rle = mask_util.merge(rles)
+                masks.append(
+                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
+            masks = paddle.stack(masks)
+            masks_pad = paddle.zeros(
+                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
+            # the decoded masks go in the top-left corner of the padded canvas
+            masks_pad[:, :height, :width] = masks
+            out_gt_mask.append(masks_pad)
+        return out_gt_mask
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        r"""
+        Args:
+            out_transformer (Tuple): (feats: [num_levels, batch_size,
+                                                num_queries, hidden_dim],
+                            memory: [batch_size, hidden_dim, h, w],
+                            src_proj: [batch_size, h*w, hidden_dim],
+                            src_mask: [batch_size, 1, 1, h, w])
+            body_feats (List(Tensor)): list[[B, C, H, W]]
+            inputs (dict): dict(inputs)
+
+        NOTE(review): src_proj is passed to the conv-based mask head, which
+        suggests a [batch_size, C, h, w] layout rather than the one listed
+        above -- confirm against the transformer that produces it.
+
+        Returns:
+            In training mode, the value returned by ``self.loss``. Otherwise
+            a tuple (boxes, logits, masks) taken from the last entry along
+            the first axis of the predictions; masks is None when the mask
+            branch is disabled.
+        """
+        feats, memory, src_proj, src_mask = out_transformer
+        outputs_logit = self.score_head(feats)
+        outputs_bbox = F.sigmoid(self.bbox_head(feats))
+        outputs_seg = None
+        if self.with_mask_head:
+            # only the last level of decoder features drives the mask branch
+            bbox_attention_map = self.bbox_attention(feats[-1], memory,
+                                                     src_mask)
+            # reversed backbone features with the first reversed entry dropped,
+            # mirroring `fpn_dims` in from_config
+            fpn_feats = [a for a in body_feats[::-1]][1:]
+            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
+                                         fpn_feats)
+            # split the merged leading axis back into (batch, query)
+            outputs_seg = outputs_seg.reshape([
+                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
+                outputs_seg.shape[-1]
+            ])
+
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+            # ground-truth masks are only built when polygons are provided
+            gt_mask = self.get_gt_mask_from_polygons(
+                inputs['gt_poly'],
+                inputs['pad_mask']) if 'gt_poly' in inputs else None
+            return self.loss(
+                outputs_bbox,
+                outputs_logit,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                masks=outputs_seg,
+                gt_mask=gt_mask)
+        else:
+            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
+
+
+@register
+class DeformableDETRHead(nn.Layer):
+    """Head predicting class logits and boxes offset from reference points.
+
+    Args:
+        num_classes (int): output width of the classification layer
+        hidden_dim (int): width of the decoder features
+        nhead (int): stored on the instance; not used by this head itself
+        num_mlp_layers (int): number of linear layers of the box MLP
+        loss: loss callable invoked in training mode
+    """
+    __shared__ = ['num_classes', 'hidden_dim']
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=512,
+                 nhead=8,
+                 num_mlp_layers=3,
+                 loss='DETRLoss'):
+        super(DeformableDETRHead, self).__init__()
+        self.num_classes = num_classes
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.loss = loss
+
+        self.score_head = nn.Linear(hidden_dim, self.num_classes)
+        self.bbox_head = MLP(
+            hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Initialise the classifier bias and the last box-regression layer."""
+        linear_init_(self.score_head)
+        constant_(self.score_head.bias, -4.595)
+
+        last_linear = self.bbox_head.layers[-1]
+        constant_(last_linear.weight)
+
+        with paddle.no_grad():
+            # zero bias for the first two outputs, -2.0 for the remaining ones
+            new_bias = paddle.zeros_like(last_linear.bias)
+            new_bias[2:] = -2.0
+            last_linear.bias.set_value(new_bias)
+
+    @classmethod
+    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
+        """Forward the upstream hidden_dim / nhead as constructor kwargs."""
+        return {'hidden_dim': hidden_dim, 'nhead': nhead}
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        r"""
+        Args:
+            out_transformer (Tuple): (feats: [num_levels, batch_size,
+                                                num_queries, hidden_dim],
+                            memory: [batch_size,
+                                \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim],
+                            reference_points: [batch_size, num_queries, 2])
+            body_feats (List(Tensor)): list[[B, C, H, W]]
+            inputs (dict): dict(inputs)
+
+        Returns:
+            In training mode, the value returned by ``self.loss``; otherwise
+            the tuple (last-level boxes, last-level logits, None).
+        """
+        feats, memory, reference_points = out_transformer
+        # move the reference points to logit space and add a leading axis so
+        # they broadcast over the levels
+        ref_logits = inverse_sigmoid(reference_points.unsqueeze(0))
+        raw_bbox = self.bbox_head(feats)
+
+        # It's equivalent to "raw_bbox[:, :, :, :2] += ref_logits",
+        # but the gradient is wrong in paddle.
+        centers = raw_bbox[:, :, :, :2] + ref_logits
+        sizes = raw_bbox[:, :, :, 2:]
+        outputs_bbox = F.sigmoid(paddle.concat([centers, sizes], axis=-1))
+        outputs_logit = self.score_head(feats)
+
+        if not self.training:
+            return (outputs_bbox[-1], outputs_logit[-1], None)
+
+        assert inputs is not None
+        assert 'gt_bbox' in inputs and 'gt_class' in inputs
+
+        return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],
+                         inputs['gt_class'])
+
+
+@register
+class DINOHead(nn.Layer):
+    """Parameter-free head that routes DINO transformer outputs to the loss.
+
+    In training mode the decoder outputs are separated into denoising and
+    matching parts (when ``dn_meta`` is given), the encoder top-k predictions
+    are prepended, and everything is handed to ``self.loss``. In evaluation
+    mode the last decoder-layer predictions are returned.
+    """
+    __inject__ = ['loss']
+
+    def __init__(self, loss='DINOLoss'):
+        super(DINOHead, self).__init__()
+        self.loss = loss
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        """
+        Args:
+            out_transformer (Tuple): (dec_out_bboxes, dec_out_logits,
+                enc_topk_bboxes, enc_topk_logits, dn_meta). ``dn_meta`` may
+                be None, a dict holding 'dn_num_split', or a list with one
+                such dict (or None) per query group.
+            body_feats: unused here
+            inputs (dict): must hold 'gt_bbox' and 'gt_class' in training
+
+        Returns:
+            The loss result in training mode (averaged over the groups when
+            ``dn_meta`` is a list); otherwise the tuple
+            (dec_out_bboxes[-1], dec_out_logits[-1], None).
+        """
+        (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
+         dn_meta) = out_transformer
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+
+            if dn_meta is not None:
+                if isinstance(dn_meta, list):
+                    # one dn_meta entry per group: cut every output into
+                    # len(dn_meta) equal parts along its query axis
+                    dual_groups = len(dn_meta) - 1
+                    dec_out_bboxes = paddle.split(
+                        dec_out_bboxes, dual_groups + 1, axis=2)
+                    dec_out_logits = paddle.split(
+                        dec_out_logits, dual_groups + 1, axis=2)
+                    enc_topk_bboxes = paddle.split(
+                        enc_topk_bboxes, dual_groups + 1, axis=1)
+                    enc_topk_logits = paddle.split(
+                        enc_topk_logits, dual_groups + 1, axis=1)
+
+                    # NOTE(review): the four *_list variables below are never
+                    # used in this method
+                    dec_out_bboxes_list = []
+                    dec_out_logits_list = []
+                    dn_out_bboxes_list = []
+                    dn_out_logits_list = []
+                    loss = {}
+                    for g_id in range(dual_groups + 1):
+                        if dn_meta[g_id] is not None:
+                            # separate denoising queries from matching queries
+                            dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(
+                                dec_out_bboxes[g_id],
+                                dn_meta[g_id]['dn_num_split'],
+                                axis=2)
+                            dn_out_logits_gid, dec_out_logits_gid = paddle.split(
+                                dec_out_logits[g_id],
+                                dn_meta[g_id]['dn_num_split'],
+                                axis=2)
+                        else:
+                            dn_out_bboxes_gid, dn_out_logits_gid = None, None
+                            dec_out_bboxes_gid = dec_out_bboxes[g_id]
+                            dec_out_logits_gid = dec_out_logits[g_id]
+                        # encoder top-k predictions become an extra leading
+                        # "layer" in front of the decoder layers
+                        out_bboxes_gid = paddle.concat([
+                            enc_topk_bboxes[g_id].unsqueeze(0),
+                            dec_out_bboxes_gid
+                        ])
+                        out_logits_gid = paddle.concat([
+                            enc_topk_logits[g_id].unsqueeze(0),
+                            dec_out_logits_gid
+                        ])
+                        loss_gid = self.loss(
+                            out_bboxes_gid,
+                            out_logits_gid,
+                            inputs['gt_bbox'],
+                            inputs['gt_class'],
+                            dn_out_bboxes=dn_out_bboxes_gid,
+                            dn_out_logits=dn_out_logits_gid,
+                            dn_meta=dn_meta[g_id])
+                        # sum loss
+                        for key, value in loss_gid.items():
+                            loss.update({
+                                key: loss.get(key, paddle.zeros([1])) + value
+                            })
+
+                    # average across (dual_groups + 1)
+                    # (only existing keys are reassigned, so the dict size
+                    # does not change during iteration)
+                    for key, value in loss.items():
+                        loss.update({key: value / (dual_groups + 1)})
+                    return loss
+                else:
+                    # single group: separate denoising and matching queries
+                    dn_out_bboxes, dec_out_bboxes = paddle.split(
+                        dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
+                    dn_out_logits, dec_out_logits = paddle.split(
+                        dec_out_logits, dn_meta['dn_num_split'], axis=2)
+            else:
+                dn_out_bboxes, dn_out_logits = None, None
+
+            # encoder top-k predictions are prepended as an extra leading entry
+            out_bboxes = paddle.concat(
+                [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])
+            out_logits = paddle.concat(
+                [enc_topk_logits.unsqueeze(0), dec_out_logits])
+
+            return self.loss(
+                out_bboxes,
+                out_logits,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                dn_out_bboxes=dn_out_bboxes,
+                dn_out_logits=dn_out_logits,
+                dn_meta=dn_meta)
+        else:
+            return (dec_out_bboxes[-1], dec_out_logits[-1], None)
+
+
+@register
+class MaskDINOHead(nn.Layer):
+    """Parameter-free head that routes MaskDINO transformer outputs to the loss.
+
+    In training mode the decoder logits, boxes and masks are separated into
+    denoising and matching parts (when ``dn_meta`` is given), the initial and
+    encoder predictions are prepended, and everything is handed to
+    ``self.loss``. In evaluation mode the last decoder-layer predictions are
+    returned.
+    """
+    __inject__ = ['loss']
+
+    def __init__(self, loss='DINOLoss'):
+        super(MaskDINOHead, self).__init__()
+        self.loss = loss
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        """
+        Args:
+            out_transformer (Tuple): (dec_out_logits, dec_out_bboxes,
+                dec_out_masks, enc_out, init_out, dn_meta). ``enc_out`` and
+                ``init_out`` (optional) are (logits, bboxes, masks) triples;
+                ``dn_meta`` is None or a dict holding 'dn_num_split'.
+            body_feats: unused here
+            inputs (dict): must hold 'gt_bbox', 'gt_class' and 'gt_segm' in
+                training
+
+        Returns:
+            The loss result in training mode; otherwise the tuple
+            (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]).
+        """
+        (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out,
+         dn_meta) = out_transformer
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+            assert 'gt_segm' in inputs
+
+            if dn_meta is not None:
+                # separate denoising queries from matching queries (query
+                # axis is 2 for the per-layer decoder outputs)
+                dn_out_logits, dec_out_logits = paddle.split(
+                    dec_out_logits, dn_meta['dn_num_split'], axis=2)
+                dn_out_bboxes, dec_out_bboxes = paddle.split(
+                    dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
+                dn_out_masks, dec_out_masks = paddle.split(
+                    dec_out_masks, dn_meta['dn_num_split'], axis=2)
+                # note: init_out is only consumed on this branch, i.e. when
+                # dn_meta is given
+                if init_out is not None:
+                    init_out_logits, init_out_bboxes, init_out_masks = init_out
+                    # the initial predictions have no layer axis, so the
+                    # query axis is 1 here
+                    init_out_logits_dn, init_out_logits = paddle.split(
+                        init_out_logits, dn_meta['dn_num_split'], axis=1)
+                    init_out_bboxes_dn, init_out_bboxes = paddle.split(
+                        init_out_bboxes, dn_meta['dn_num_split'], axis=1)
+                    init_out_masks_dn, init_out_masks = paddle.split(
+                        init_out_masks, dn_meta['dn_num_split'], axis=1)
+
+                    # prepend the initial predictions as an extra leading entry
+                    dec_out_logits = paddle.concat(
+                        [init_out_logits.unsqueeze(0), dec_out_logits])
+                    dec_out_bboxes = paddle.concat(
+                        [init_out_bboxes.unsqueeze(0), dec_out_bboxes])
+                    dec_out_masks = paddle.concat(
+                        [init_out_masks.unsqueeze(0), dec_out_masks])
+
+                    dn_out_logits = paddle.concat(
+                        [init_out_logits_dn.unsqueeze(0), dn_out_logits])
+                    dn_out_bboxes = paddle.concat(
+                        [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes])
+                    dn_out_masks = paddle.concat(
+                        [init_out_masks_dn.unsqueeze(0), dn_out_masks])
+            else:
+                dn_out_bboxes, dn_out_logits = None, None
+                dn_out_masks = None
+
+            # encoder predictions are prepended in front of everything else
+            enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out
+            out_logits = paddle.concat(
+                [enc_out_logits.unsqueeze(0), dec_out_logits])
+            out_bboxes = paddle.concat(
+                [enc_out_bboxes.unsqueeze(0), dec_out_bboxes])
+            out_masks = paddle.concat(
+                [enc_out_masks.unsqueeze(0), dec_out_masks])
+
+            return self.loss(
+                out_bboxes,
+                out_logits,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                masks=out_masks,
+                gt_mask=inputs['gt_segm'],
+                dn_out_logits=dn_out_logits,
+                dn_out_bboxes=dn_out_bboxes,
+                dn_out_masks=dn_out_masks,
+                dn_meta=dn_meta)
+        else:
+            return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1])
--- a/rtdetr_paddle/ppdet/modeling/initializer.py
+++ b/rtdetr_paddle/ppdet/modeling/initializer.py
@@ -0,0 +1,325 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+The copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file.
+"""
+
+import math
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+
+__all__ = [
+    'uniform_',
+    'normal_',
+    'constant_',
+    'ones_',
+    'zeros_',
+    'xavier_uniform_',
+    'xavier_normal_',
+    'kaiming_uniform_',
+    'kaiming_normal_',
+    'linear_init_',
+    'conv_init_',
+    'reset_initialized_parameter',
+]
+
+
+def _no_grad_uniform_(tensor, a, b):
+    """Overwrite `tensor` with uniform samples between `a` and `b`, without
+    recording gradients, and return it."""
+    with paddle.no_grad():
+        sample = paddle.uniform(
+            shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)
+        tensor.set_value(sample)
+    return tensor
+
+
+def _no_grad_normal_(tensor, mean=0., std=1.):
+    """Overwrite `tensor` with normal samples of the given mean and std,
+    without recording gradients, and return it."""
+    with paddle.no_grad():
+        sample = paddle.normal(mean=mean, std=std, shape=tensor.shape)
+        tensor.set_value(sample)
+    return tensor
+
+
+def _no_grad_fill_(tensor, value=0.):
+    """Overwrite every element of `tensor` with `value`, without recording
+    gradients, and return it."""
+    with paddle.no_grad():
+        filled = paddle.full_like(tensor, value, dtype=tensor.dtype)
+        tensor.set_value(filled)
+    return tensor
+
+
+def uniform_(tensor, a, b):
+    """
+    Fill the tensor in place with values drawn uniformly between a and b
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        a (float|int): lower bound of the distribution.
+        b (float|int): upper bound of the distribution.
+    Return:
+        the tensor that was passed in
+    """
+    filled = _no_grad_uniform_(tensor, a, b)
+    return filled
+
+
+def normal_(tensor, mean=0., std=1.):
+    """
+    Fill the tensor in place with values drawn from a normal distribution
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        mean (float|int): mean of the distribution.
+        std (float|int): standard deviation of the distribution.
+    Return:
+        the tensor that was passed in
+    """
+    filled = _no_grad_normal_(tensor, mean, std)
+    return filled
+
+
+def constant_(tensor, value=0.):
+    """
+    Fill the tensor in place with a single constant
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        value (float|int): value written to every element.
+    Return:
+        the tensor that was passed in
+    """
+    filled = _no_grad_fill_(tensor, value)
+    return filled
+
+
+def ones_(tensor):
+    """
+    Fill the tensor in place with ones
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+    Return:
+        the tensor that was passed in
+    """
+    return _no_grad_fill_(tensor, value=1)
+
+
+def zeros_(tensor):
+    """
+    Fill the tensor in place with zeros
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+    Return:
+        the tensor that was passed in
+    """
+    return _no_grad_fill_(tensor, value=0)
+
+
+def vector_(tensor, vector):
+    """Overwrite `tensor` with the values of `vector` (converted to the
+    tensor's dtype), without recording gradients, and return it."""
+    with paddle.no_grad():
+        replacement = paddle.to_tensor(vector, dtype=tensor.dtype)
+        tensor.set_value(replacement)
+    return tensor
+
+
+def _calculate_fan_in_and_fan_out(tensor, reverse=False):
+    """
+    Compute (fan_in, fan_out) of a weight tensor
+
+    Args:
+        tensor (Tensor): paddle.Tensor with at least 2 dimensions
+        reverse (bool): layout of the first two axes. False (default) means
+            [fout, fin, ...], e.g. conv.weight [cout, cin, kh, kw]; True means
+            [fin, fout, ...], e.g. linear.weight [cin, cout]
+
+    Return:
+        Tuple[fan_in, fan_out]: each is the matching axis size multiplied by
+            the product of all axes after the second one
+
+    Raises:
+        ValueError: if the tensor has fewer than 2 dimensions
+    """
+    if tensor.ndim < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+
+    first, second = tensor.shape[0], tensor.shape[1]
+    if reverse:
+        n_in, n_out = first, second
+    else:
+        n_in, n_out = second, first
+
+    # product of the trailing (kernel) axes; 1 for plain matrices
+    receptive = np.prod(tensor.shape[2:]) if tensor.ndim > 2 else 1
+
+    return n_in * receptive, n_out * receptive
+
+
+def xavier_uniform_(tensor, gain=1., reverse=False):
+    """
+    Fill the tensor in place with Xavier (Glorot) uniform values
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        gain (float): scaling factor applied to the std, 1. by default.
+        reverse (bool): layout of the first two axes, False by default,
+            meaning [fout, fin, ...].
+    Return:
+        the tensor that was passed in
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    # a uniform distribution on [-bound, bound] has std = bound / sqrt(3)
+    bound = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -bound, bound)
+
+
+def xavier_normal_(tensor, gain=1., reverse=False):
+    """
+    Fill the tensor in place with Xavier (Glorot) normal values
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        gain (float): scaling factor applied to the std, 1. by default.
+        reverse (bool): layout of the first two axes, False by default,
+            meaning [fout, fin, ...].
+    Return:
+        the tensor that was passed in
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    fan_sum = float(fan_in + fan_out)
+    std = gain * math.sqrt(2.0 / fan_sum)
+    return _no_grad_normal_(tensor, 0, std)
+
+
+# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
+def _calculate_correct_fan(tensor, mode, reverse=False):
+    """Return the fan_in or fan_out of `tensor`, selected by `mode`.
+
+    `mode` is case-insensitive; anything other than 'fan_in' / 'fan_out'
+    raises ValueError before the tensor is inspected.
+    """
+    valid_modes = ['fan_in', 'fan_out']
+    mode = mode.lower()
+    if mode not in valid_modes:
+        raise ValueError("Mode {} not supported, please use one of {}".format(
+            mode, valid_modes))
+
+    fans = _calculate_fan_in_and_fan_out(tensor, reverse)
+    # fans is (fan_in, fan_out), in the same order as valid_modes
+    return fans[valid_modes.index(mode)]
+
+
+def _calculate_gain(nonlinearity, param=None):
+    """Return the recommended gain for the given nonlinearity.
+
+    Args:
+        nonlinearity (str): name of the nonlinear (or linear) function
+        param (int|float, optional): negative slope, only read for
+            'leaky_relu' (0.01 when omitted)
+
+    Raises:
+        ValueError: for an unknown nonlinearity, or a `param` for
+            'leaky_relu' that is neither int (excluding bool) nor float
+    """
+    linear_fns = ('linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
+                  'conv_transpose2d', 'conv_transpose3d')
+    if nonlinearity == 'sigmoid' or nonlinearity in linear_fns:
+        return 1
+    if nonlinearity == 'tanh':
+        return 5.0 / 3
+    if nonlinearity == 'relu':
+        return math.sqrt(2.0)
+    if nonlinearity == 'selu':
+        return 3.0 / 4
+    if nonlinearity != 'leaky_relu':
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+    if param is None:
+        negative_slope = 0.01
+    else:
+        # True/False are instances of int, hence the explicit bool exclusion
+        is_plain_int = isinstance(param, int) and not isinstance(param, bool)
+        if not (is_plain_int or isinstance(param, float)):
+            raise ValueError("negative_slope {} not a valid number".format(
+                param))
+        negative_slope = param
+    return math.sqrt(2.0 / (1 + negative_slope**2))
+
+
+def kaiming_uniform_(tensor,
+                     a=0,
+                     mode='fan_in',
+                     nonlinearity='leaky_relu',
+                     reverse=False):
+    """
+    Fill the tensor in place with Kaiming (He) uniform values
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        a (float|int): parameter forwarded to the gain computation (the
+            negative slope when nonlinearity is 'leaky_relu')
+        mode (str): 'fan_in' (default) or 'fan_out'
+        nonlinearity (str): name of the nonlinearity used for the gain
+        reverse (bool): layout of the first two axes, False by default,
+            meaning [fout, fin, ...].
+    Return:
+        the tensor that was passed in
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    std = _calculate_gain(nonlinearity, a) / math.sqrt(fan)
+    # a uniform distribution on [-bound, bound] has std = bound / sqrt(3)
+    bound = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -bound, bound)
+
+
+def kaiming_normal_(tensor,
+                    a=0,
+                    mode='fan_in',
+                    nonlinearity='leaky_relu',
+                    reverse=False):
+    """
+    Fill the tensor in place with Kaiming (He) normal values
+    Args:
+        tensor (paddle.Tensor): tensor to overwrite
+        a (float|int): parameter forwarded to the gain computation (the
+            negative slope when nonlinearity is 'leaky_relu')
+        mode (str): 'fan_in' (default) or 'fan_out'
+        nonlinearity (str): name of the nonlinearity used for the gain
+        reverse (bool): layout of the first two axes, False by default,
+            meaning [fout, fin, ...].
+    Return:
+        the tensor that was passed in
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    std = _calculate_gain(nonlinearity, a) / math.sqrt(fan)
+    return _no_grad_normal_(tensor, 0, std)
+
+
+def linear_init_(module):
+    """Re-initialise a linear layer: weight and (if present) bias are drawn
+    uniformly from [-1/sqrt(n), 1/sqrt(n)], n being weight.shape[0]."""
+    limit = 1 / math.sqrt(module.weight.shape[0])
+    uniform_(module.weight, -limit, limit)
+    bias = getattr(module, "bias", None)
+    if bias is not None:
+        uniform_(bias, -limit, limit)
+
+
+def conv_init_(module):
+    """Re-initialise a conv layer: weight and (if not None) bias are drawn
+    uniformly from [-1/sqrt(n), 1/sqrt(n)], n being the product of all
+    weight axes after the first."""
+    fan = np.prod(module.weight.shape[1:])
+    limit = 1 / np.sqrt(fan)
+    uniform_(module.weight, -limit, limit)
+    if module.bias is None:
+        return
+    uniform_(module.bias, -limit, limit)
+
+
+def bias_init_with_prob(prior_prob=0.01):
+    """Return the bias value -log((1 - p) / p) for a conv/fc layer, given the
+    probability p."""
+    odds_against = (1 - prior_prob) / prior_prob
+    return float(-np.log(odds_against))
+
+
+@paddle.no_grad()
+def reset_initialized_parameter(model, include_self=True):
+    """
+    Re-initialise the parameters of every [conv, linear, embedding, bn / ln]
+    sublayer of a model in place
+
+    Conv2D and Linear weights and biases are drawn uniformly from [-k, k],
+    Embedding weights from a standard normal distribution; BatchNorm2D and
+    LayerNorm weights are set to 1 and their biases to 0.
+
+    Args:
+        model (paddle.Layer): paddle Layer
+        include_self (bool: True): forwarded to Layer.named_sublayers;
+            whether the model itself is visited as well
+    Return:
+        None
+    """
+
+    def _bias_of(layer):
+        # None both when the attribute is missing and when it is unset
+        return getattr(layer, 'bias', None)
+
+    for _, layer in model.named_sublayers(include_self=include_self):
+        if isinstance(layer, nn.Conv2D):
+            kernel_fan = (layer._in_channels * layer._kernel_size[0] *
+                          layer._kernel_size[1])
+            bound = math.sqrt(float(layer._groups) / kernel_fan)
+            _no_grad_uniform_(layer.weight, -bound, bound)
+            if _bias_of(layer) is not None:
+                _no_grad_uniform_(layer.bias, -bound, bound)
+
+        elif isinstance(layer, nn.Linear):
+            bound = math.sqrt(1. / layer.weight.shape[0])
+            _no_grad_uniform_(layer.weight, -bound, bound)
+            if _bias_of(layer) is not None:
+                _no_grad_uniform_(layer.bias, -bound, bound)
+
+        elif isinstance(layer, nn.Embedding):
+            _no_grad_normal_(layer.weight, mean=0., std=1.)
+
+        elif isinstance(layer, (nn.BatchNorm2D, nn.LayerNorm)):
+            _no_grad_fill_(layer.weight, 1.)
+            if _bias_of(layer) is not None:
+                _no_grad_fill_(layer.bias, 0)
--- a/rtdetr_paddle/ppdet/modeling/keypoint_utils.py
+++ b/rtdetr_paddle/ppdet/modeling/keypoint_utils.py
@@ -0,0 +1,403 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+this code is based on https://github.com/open-mmlab/mmpose
+"""
+
+import cv2
+import numpy as np
+import paddle.nn.functional as F
+
+
+def get_affine_mat_kernel(h, w, s, inv=False):
+    """Build the affine matrix that resizes an ``h`` x ``w`` image.
+
+    The shorter side is mapped to ``s``; the other side is scaled by the same
+    ratio and rounded up to a multiple of 64.
+
+    Args:
+        h: source image height.
+        w: source image width.
+        s: target length of the shorter side.
+        inv (bool): forwarded to ``get_affine_transform``.
+
+    Returns:
+        tuple: ``(trans, (resized_w, resized_h))``.
+    """
+    if w < h:
+        resized_w = s
+        resized_h = int(np.ceil((s / w * h) / 64.) * 64)
+        src_w = w
+        src_h = resized_h / resized_w * w
+    else:
+        resized_h = s
+        resized_w = int(np.ceil((s / h * w) / 64.) * 64)
+        src_h = h
+        src_w = resized_w / resized_h * h
+
+    size_resized = (resized_w, resized_h)
+    image_center = np.array([np.round(w / 2.), np.round(h / 2.)])
+    src_size = np.array([src_w, src_h])
+
+    trans = get_affine_transform(
+        image_center, src_size, 0, size_resized, inv=inv)
+    return trans, size_resized
+
+
+def get_affine_transform(center,
+                         input_size,
+                         rot,
+                         output_size,
+                         shift=(0., 0.),
+                         inv=False):
+    """Compute the 2x3 affine matrix for a center/size/rotation crop.
+
+    Three point pairs are built (box center, a point offset from the center
+    along the rotated direction, and a third point derived from those two)
+    and passed to ``cv2.getAffineTransform``.
+
+    Args:
+        center (np.ndarray[2, ]): Center of the bounding box (x, y).
+        input_size (np.ndarray[2, ]): Size of input feature (width, height).
+            A value that is neither an ndarray nor a list is duplicated into
+            a ``[v, v]`` float32 array.
+        rot (float): Rotation angle (degree).
+        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
+        shift (0-100%): Shift translation ratio wrt the width/height.
+            Default (0., 0.).
+        inv (bool): Option to inverse the affine transform direction.
+            (inv=False: src->dst or inv=True: dst->src)
+
+    Returns:
+        np.ndarray: The transform matrix.
+    """
+    assert len(center) == 2
+    assert len(output_size) == 2
+    assert len(shift) == 2
+
+    if not isinstance(input_size, (np.ndarray, list)):
+        input_size = np.array([input_size, input_size], dtype=np.float32)
+
+    shift = np.array(shift)
+    out_w, out_h = output_size[0], output_size[1]
+
+    # Direction vectors of half the source / destination width.
+    src_dir = rotate_point([0., input_size[0] * -0.5], np.pi * rot / 180)
+    dst_dir = np.array([0., out_w * -0.5])
+
+    src = np.zeros((3, 2), dtype=np.float32)
+    src[0, :] = center + input_size * shift
+    src[1, :] = center + src_dir + input_size * shift
+    src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+    dst = np.zeros((3, 2), dtype=np.float32)
+    dst_center = [out_w * 0.5, out_h * 0.5]
+    dst[0, :] = dst_center
+    dst[1, :] = np.array(dst_center) + dst_dir
+    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+    if inv:
+        return cv2.getAffineTransform(np.float32(dst), np.float32(src))
+    return cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+
+def get_warp_matrix(theta, size_input, size_dst, size_target):
+    """This code is based on
+        https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py
+
+        Build the 2x3 rotation + scale + translation matrix used for unbiased
+    data processing.
+    Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
+    Data Processing for Human Pose Estimation (CVPR 2020).
+
+    Args:
+        theta (float): Rotation angle in degrees.
+        size_input (np.ndarray): Size of input image [w, h].
+        size_dst (np.ndarray): Size of output image [w, h].
+        size_target (np.ndarray): Size of ROI in input plane [w, h].
+
+    Returns:
+        matrix (np.ndarray): float32 matrix of shape (2, 3).
+    """
+    angle = np.deg2rad(theta)
+    cos_t, sin_t = np.cos(angle), np.sin(angle)
+    scale_x = size_dst[0] / size_target[0]
+    scale_y = size_dst[1] / size_target[1]
+
+    shift_x = scale_x * (
+        -0.5 * size_input[0] * cos_t + 0.5 * size_input[1] * sin_t +
+        0.5 * size_target[0])
+    shift_y = scale_y * (
+        -0.5 * size_input[0] * sin_t - 0.5 * size_input[1] * cos_t +
+        0.5 * size_target[1])
+
+    matrix = np.zeros((2, 3), dtype=np.float32)
+    matrix[0, :] = [cos_t * scale_x, -sin_t * scale_x, shift_x]
+    matrix[1, :] = [sin_t * scale_y, cos_t * scale_y, shift_y]
+    return matrix
+
+
+def _get_3rd_point(a, b):
+    """Derive the third point needed to define an affine transform.
+
+    An affine matrix needs three point pairs; given 2D points ``a`` and ``b``
+    this returns ``b + (-dy, dx)`` where ``(dx, dy) = a - b``, i.e. the vector
+    ``a - b`` turned by 90 degrees and attached to ``b``.
+
+    Args:
+        a (np.ndarray): point(x,y)
+        b (np.ndarray): point(x,y)
+
+    Returns:
+        np.ndarray: The 3rd point.
+    """
+    assert len(
+        a) == 2, 'input of _get_3rd_point should be point with length of 2'
+    assert len(
+        b) == 2, 'input of _get_3rd_point should be point with length of 2'
+    delta = a - b
+    perpendicular = np.array([-delta[1], delta[0]], dtype=np.float32)
+    return b + perpendicular
+
+
+def rotate_point(pt, angle_rad):
+    """Rotate a 2D point about the origin.
+
+    Args:
+        pt (list[float]): 2 dimensional point to be rotated
+        angle_rad (float): rotation angle by radian
+
+    Returns:
+        list[float]: Rotated point ``[x', y']``.
+    """
+    assert len(pt) == 2
+    x, y = pt[0], pt[1]
+    cos_a = np.cos(angle_rad)
+    sin_a = np.sin(angle_rad)
+    return [x * cos_a - y * sin_a, x * sin_a + y * cos_a]
+
+
+def transpred(kpts, h, w, s):
+    """Warp the (x, y) part of ``kpts`` with the inverse resize transform.
+
+    The matrix comes from ``get_affine_mat_kernel(h, w, s, inv=True)``; only
+    the first two entries of the last axis of ``kpts`` are used, and they are
+    copied before being passed on.
+    """
+    inverse_trans, _ = get_affine_mat_kernel(h, w, s, inv=True)
+    xy = kpts[..., :2].copy()
+    return warp_affine_joints(xy, inverse_trans)
+
+
+def warp_affine_joints(joints, mat):
+    """Apply an affine transform to a set of 2D joints.
+
+    The joints are flattened to ``[N, 2]``, extended with a homogeneous
+    column, multiplied by ``mat.T`` and reshaped back to the input shape.
+
+    Args:
+        joints (np.ndarray[..., 2]): Origin coordinate of joints.
+        mat (np.ndarray): The affine matrix; it is multiplied as
+            ``[N, 3] @ mat.T``, so a 2x3 matrix is expected.
+
+    Returns:
+        matrix (np.ndarray[..., 2]): Result coordinate of joints.
+    """
+    pts = np.array(joints)
+    original_shape = pts.shape
+    flat = pts.reshape(-1, 2)
+    # Homogeneous column built from the data itself (keeps dtype/shape).
+    homogeneous = np.concatenate((flat, flat[:, 0:1] * 0 + 1), axis=1)
+    warped = np.dot(homogeneous, mat.T)
+    return warped.reshape(original_shape)
+
+
+def affine_transform(pt, t):
+    """Apply the affine matrix ``t`` to the single 2D point ``pt``.
+
+    Args:
+        pt: point whose first two entries are (x, y).
+        t (np.ndarray): affine matrix multiplied with ``[x, y, 1]``.
+
+    Returns:
+        np.ndarray: the first two entries of the transformed point.
+    """
+    homogeneous = np.array([pt[0], pt[1], 1.]).T
+    return np.dot(t, homogeneous)[:2]
+
+
+def transform_preds(coords, center, scale, output_size):
+    """Map predicted coordinates through the inverse affine transform.
+
+    The transform is built by ``get_affine_transform`` from ``center``,
+    ``scale * 200`` and ``output_size`` with the inverse direction selected.
+    Only columns 0-1 of each row are transformed; other columns of the
+    result stay zero.
+
+    Returns:
+        np.ndarray: array with the same shape as ``coords``.
+    """
+    inverse_trans = get_affine_transform(
+        center, scale * 200, 0, output_size, inv=1)
+    target_coords = np.zeros(coords.shape)
+    for row in range(coords.shape[0]):
+        target_coords[row, 0:2] = affine_transform(coords[row, 0:2],
+                                                   inverse_trans)
+    return target_coords
+
+
+def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
+    """Compute the object keypoint similarity between one pose and many.
+
+    Args:
+        g (np.ndarray): reference pose, flattened as repeated
+            (x, y, visibility) triplets.
+        d (np.ndarray): candidate poses, one flattened pose per row, same
+            layout as ``g``.
+        a_g: area associated with ``g``.
+        a_d: per-candidate areas, indexed like the rows of ``d``.
+        sigmas (np.ndarray): per-keypoint constants. Anything that is not an
+            ndarray (including None) selects the built-in 17-value table.
+        in_vis_thre (float): when given, only keypoints whose visibility
+            exceeds this threshold in BOTH ``g`` and the candidate contribute.
+            Default: None (all keypoints contribute).
+
+    Returns:
+        np.ndarray: one similarity per row of ``d``; 0.0 for a candidate with
+        no contributing keypoint.
+    """
+    if not isinstance(sigmas, np.ndarray):
+        sigmas = np.array([
+            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
+            .87, .87, .89, .89
+        ]) / 10.0
+    variances = (sigmas * 2)**2
+    xg = g[0::3]
+    yg = g[1::3]
+    vg = g[2::3]
+    ious = np.zeros((d.shape[0]))
+    for n_d in range(0, d.shape[0]):
+        xd = d[n_d, 0::3]
+        yd = d[n_d, 1::3]
+        vd = d[n_d, 2::3]
+        dx = xd - xg
+        dy = yd - yg
+        e = (dx**2 + dy**2) / variances / (
+            (a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
+        if in_vis_thre is not None:
+            # Element-wise AND of both visibility masks. A plain `and`
+            # between two lists returns the second list and silently ignores
+            # the visibility of the reference pose.
+            ind = np.logical_and(vg > in_vis_thre, vd > in_vis_thre)
+            e = e[ind]
+        ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
+    return ious
+
+
+def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
+    """Greedy non-maximum suppression on poses, using OKS as the overlap.
+
+    Poses are visited in descending score order; each kept pose removes every
+    remaining pose whose OKS with it is greater than ``thresh``.
+
+    Args:
+        kpts_db (list): The predicted keypoints within the image; each entry
+            is read through its 'score', 'keypoints' and 'area' keys.
+        thresh (float): Poses with OKS <= thresh w.r.t. a kept pose survive.
+        sigmas (np.array): Forwarded to ``oks_iou``.
+            Default: None
+        in_vis_thre (float): Forwarded to ``oks_iou``.
+            Default: None
+
+    Return:
+        keep (list): indexes to keep
+    """
+
+    if len(kpts_db) == 0:
+        return []
+
+    scores = np.array([det['score'] for det in kpts_db])
+    kpts = np.array([det['keypoints'].flatten() for det in kpts_db])
+    areas = np.array([det['area'] for det in kpts_db])
+
+    remaining = scores.argsort()[::-1]
+
+    keep = []
+    while remaining.size > 0:
+        best, rest = remaining[0], remaining[1:]
+        keep.append(best)
+
+        overlaps = oks_iou(kpts[best], kpts[rest], areas[best], areas[rest],
+                           sigmas, in_vis_thre)
+
+        survivors = np.where(overlaps <= thresh)[0]
+        remaining = rest[survivors]
+
+    return keep
+
+
+def rescore(overlap, scores, thresh, type='gaussian'):
+    """Decay ``scores`` according to ``overlap``.
+
+    Args:
+        overlap (np.ndarray): overlap of each candidate with the kept one.
+        scores (np.ndarray): candidate scores, same length as ``overlap``.
+        thresh (float): overlap threshold ('linear') or decay scale (other).
+        type (str): 'linear' multiplies, in place, the scores whose overlap
+            is >= thresh by ``1 - overlap``; any other value returns a new
+            array ``scores * exp(-overlap**2 / thresh)``.
+
+    Returns:
+        np.ndarray: the rescored values.
+    """
+    assert overlap.shape[0] == scores.shape[0]
+    if type != 'linear':
+        return scores * np.exp(-overlap**2 / thresh)
+
+    inds = np.where(overlap >= thresh)[0]
+    scores[inds] = scores[inds] * (1 - overlap[inds])
+    return scores
+
+
+def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None, max_dets=20):
+    """Soft non-maximum suppression on poses, using OKS as the overlap.
+
+    Poses are picked one at a time in descending score order. After each
+    pick, the scores of the remaining poses are decayed by ``rescore``
+    (default gaussian mode) according to their OKS with the picked pose, and
+    the remaining poses are re-sorted. No pose is discarded by the decay
+    itself; selection stops when no pose is left or ``max_dets`` were picked.
+
+    Args:
+        kpts_db (list): The predicted keypoints within the image; each entry
+            is read through its 'score', 'keypoints' and 'area' keys.
+        thresh (float): Passed to ``rescore`` as its threshold / decay scale.
+        sigmas (np.array): Forwarded to ``oks_iou``.
+            Default: None
+        in_vis_thre (float): Forwarded to ``oks_iou``.
+            Default: None
+        max_dets (int): Maximum number of poses to keep.
+            Default: 20
+
+    Return:
+        keep: indexes to keep, in pick order. An empty list when ``kpts_db``
+            is empty, otherwise an integer np.ndarray of at most ``max_dets``
+            entries.
+    """
+
+    if len(kpts_db) == 0:
+        return []
+
+    scores = np.array([det['score'] for det in kpts_db])
+    kpts = np.array([det['keypoints'].flatten() for det in kpts_db])
+    areas = np.array([det['area'] for det in kpts_db])
+
+    order = scores.argsort()[::-1]
+    scores = scores[order]
+
+    keep = np.zeros(max_dets, dtype=np.intp)
+    keep_cnt = 0
+    while order.size > 0 and keep_cnt < max_dets:
+        i = order[0]
+
+        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
+                          sigmas, in_vis_thre)
+
+        # Drop the picked pose, decay the others, then restore score order.
+        order = order[1:]
+        scores = rescore(oks_ovr, scores[1:], thresh)
+
+        tmp = scores.argsort()[::-1]
+        order = order[tmp]
+        scores = scores[tmp]
+
+        keep[keep_cnt] = i
+        keep_cnt += 1
+
+    keep = keep[:keep_cnt]
+
+    return keep
+
+
+def resize(input,
+           size=None,
+           scale_factor=None,
+           mode='nearest',
+           align_corners=None,
+           warning=True):
+    """Thin wrapper around ``F.interpolate`` with an alignment sanity check.
+
+    Args:
+        input: tensor to resize; ``input.shape[2:]`` is read as (h, w).
+        size: target (h, w), forwarded to ``F.interpolate``.
+        scale_factor: forwarded to ``F.interpolate``.
+        mode (str): forwarded to ``F.interpolate``.
+        align_corners: forwarded to ``F.interpolate``.
+        warning (bool): when True, ``size`` is given and ``align_corners`` is
+            truthy, emit a warning if the output is larger than the input in
+            either dimension and the sizes do not follow the `x+1` / `nx+1`
+            pattern.
+
+    Returns:
+        The result of ``F.interpolate``.
+    """
+    if warning and size is not None and align_corners:
+        # Imported here because the module header does not import it.
+        import warnings
+
+        input_h, input_w = tuple(int(x) for x in input.shape[2:])
+        output_h, output_w = tuple(int(x) for x in size)
+        # Upscaling check: each output dimension is compared with the
+        # matching input dimension.
+        if output_h > input_h or output_w > input_w:
+            if ((output_h > 1 and output_w > 1 and input_h > 1 and
+                 input_w > 1) and (output_h - 1) % (input_h - 1) and
+                (output_w - 1) % (input_w - 1)):
+                warnings.warn(
+                    f'When align_corners={align_corners}, '
+                    'the output would more aligned if '
+                    f'input size {(input_h, input_w)} is `x+1` and '
+                    f'out size {(output_h, output_w)} is `nx+1`')
+
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
+    """Undo a horizontal flip on heatmaps predicted from flipped images.
+
+    Mirrored keypoint channels are swapped and the width axis is reversed.
+    For 'CombinedTarget' every third channel starting at index 1 is negated
+    first; this negation is written into ``output_flipped`` itself.
+
+    Note:
+        - batch_size: N
+        - num_keypoints: K
+        - heatmap height: H
+        - heatmap width: W
+    Args:
+        output_flipped ([N, K, H, W]): The output heatmaps obtained from the
+            flipped images. Must provide ``reshape`` and ``clone``.
+        flip_pairs (list[tuple()): Pairs of keypoints which are mirrored
+            (for example, left ear -- right ear).
+        target_type (str): GaussianHeatmap or CombinedTarget
+            (compared case-insensitively).
+    Returns:
+        heatmaps that flipped back to the original image, same shape as the
+        input.
+    """
+    assert len(output_flipped.shape) == 4, \
+        'output_flipped should be [batch_size, num_keypoints, height, width]'
+    shape_ori = output_flipped.shape
+
+    is_combined = target_type.lower() == 'CombinedTarget'.lower()
+    channels = 3 if is_combined else 1
+    if is_combined:
+        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
+
+    # Group the channels per keypoint so a pair can be swapped as a unit.
+    grouped = output_flipped.reshape((shape_ori[0], -1, channels, shape_ori[2],
+                                      shape_ori[3]))
+    restored = grouped.clone()
+
+    # Swap left-right parts
+    for left, right in flip_pairs:
+        restored[:, left, ...] = grouped[:, right, ...]
+        restored[:, right, ...] = grouped[:, left, ...]
+
+    # Flip horizontally
+    return restored.reshape(shape_ori)[..., ::-1]
--- a/rtdetr_paddle/ppdet/modeling/layers.py
+++ b/rtdetr_paddle/ppdet/modeling/layers.py
--- a/rtdetr_paddle/ppdet/modeling/losses/init.py
+++ b/rtdetr_paddle/ppdet/modeling/losses/init.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .iou_loss import *
+from .gfocal_loss import *
+from .detr_loss import *
+from .focal_loss import *
+from .smooth_l1_loss import *
--- a/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py
+++ b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py
@@ -0,0 +1,578 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from .iou_loss import GIoULoss
+from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits
+from ..bbox_utils import bbox_iou
+
+# Names exported by `from .detr_loss import *`. MaskDINOLoss, defined further
+# down in this module, is not listed here.
+__all__ = ['DETRLoss', 'DINOLoss']
+
+
+@register
+class DETRLoss(nn.Layer):
+    """Set-prediction loss for DETR-style detectors.
+
+    Predictions are assigned to ground truths by ``matcher``; classification,
+    box (L1 + GIoU) and, optionally, mask (focal + dice) losses are computed
+    for the last decoder layer and, when ``aux_loss`` is set, for every
+    earlier layer as well.
+    """
+    __shared__ = ['num_classes', 'use_focal_loss']
+    __inject__ = ['matcher']
+
+    def __init__(self,
+                 num_classes=80,
+                 matcher='HungarianMatcher',
+                 loss_coeff={
+                     'class': 1,
+                     'bbox': 5,
+                     'giou': 2,
+                     'no_object': 0.1,
+                     'mask': 1,
+                     'dice': 1
+                 },
+                 aux_loss=True,
+                 use_focal_loss=False,
+                 use_vfl=False,
+                 use_uni_match=False,
+                 uni_match_ind=0):
+        r"""
+        Args:
+            num_classes (int): The number of classes.
+            matcher (HungarianMatcher): It computes an assignment between the targets
+                and the predictions of the network.
+            loss_coeff (dict): The coefficient of loss. The dict is copied, so
+                neither the caller's dict nor the shared default is modified.
+            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
+            use_focal_loss (bool): Use focal loss or not.
+            use_vfl (bool): With focal loss enabled and IoU scores available,
+                use the varifocal loss for classification.
+            use_uni_match (bool): Match once, on decoder layer
+                ``uni_match_ind``, and reuse that assignment for every
+                auxiliary layer.
+            uni_match_ind (int): Index of the layer used when
+                ``use_uni_match`` is set.
+        """
+        super(DETRLoss, self).__init__()
+
+        self.num_classes = num_classes
+        self.matcher = matcher
+        # Shallow copy: the 'class' entry may be replaced by a tensor below,
+        # which must not leak into the default argument shared by all
+        # instances (or into a dict owned by the caller).
+        self.loss_coeff = dict(loss_coeff)
+        self.aux_loss = aux_loss
+        self.use_focal_loss = use_focal_loss
+        self.use_vfl = use_vfl
+        self.use_uni_match = use_uni_match
+        self.uni_match_ind = uni_match_ind
+
+        if not self.use_focal_loss:
+            # Per-class cross-entropy weights; the last slot (background)
+            # uses the 'no_object' coefficient.
+            self.loss_coeff['class'] = paddle.full([num_classes + 1],
+                                                   loss_coeff['class'])
+            self.loss_coeff['class'][-1] = loss_coeff['no_object']
+        self.giou_loss = GIoULoss()
+
+    def _get_loss_class(self,
+                        logits,
+                        gt_class,
+                        match_indices,
+                        bg_index,
+                        num_gts,
+                        postfix="",
+                        iou_score=None):
+        """Classification loss; returns ``{"loss_class" + postfix: loss}``.
+
+        Every query starts with label ``bg_index``; matched queries receive
+        the class of their ground truth. Depending on the configuration the
+        loss is varifocal, sigmoid focal or weighted cross entropy.
+        """
+        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
+        name_class = "loss_class" + postfix
+
+        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
+        bs, num_query_objects = target_label.shape
+        num_gt = sum(len(a) for a in gt_class)
+        if num_gt > 0:
+            index, updates = self._get_index_updates(num_query_objects,
+                                                     gt_class, match_indices)
+            target_label = paddle.scatter(
+                target_label.reshape([-1, 1]), index, updates.astype('int64'))
+            target_label = target_label.reshape([bs, num_query_objects])
+        if self.use_focal_loss:
+            # One-hot over num_classes + 1, then drop the last channel.
+            target_label = F.one_hot(target_label,
+                                     self.num_classes + 1)[..., :-1]
+            if iou_score is not None and self.use_vfl:
+                target_score = paddle.zeros([bs, num_query_objects])
+                if num_gt > 0:
+                    target_score = paddle.scatter(
+                        target_score.reshape([-1, 1]), index, iou_score)
+                target_score = target_score.reshape(
+                    [bs, num_query_objects, 1]) * target_label
+                loss_ = self.loss_coeff['class'] * varifocal_loss_with_logits(
+                    logits, target_score, target_label,
+                    num_gts / num_query_objects)
+            else:
+                loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
+                    logits, target_label, num_gts / num_query_objects)
+        else:
+            loss_ = F.cross_entropy(
+                logits, target_label, weight=self.loss_coeff['class'])
+        return {name_class: loss_}
+
+    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
+                       postfix=""):
+        """L1 and GIoU box losses over matched pairs, normalized by num_gts.
+
+        Returns a dict with keys ``"loss_bbox" + postfix`` and
+        ``"loss_giou" + postfix``; both are zero tensors when there is no
+        ground-truth box.
+        """
+        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
+        name_bbox = "loss_bbox" + postfix
+        name_giou = "loss_giou" + postfix
+
+        loss = dict()
+        if sum(len(a) for a in gt_bbox) == 0:
+            loss[name_bbox] = paddle.to_tensor([0.])
+            loss[name_giou] = paddle.to_tensor([0.])
+            return loss
+
+        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
+                                                            match_indices)
+        loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
+            src_bbox, target_bbox, reduction='sum') / num_gts
+        loss[name_giou] = self.giou_loss(
+            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
+        loss[name_giou] = loss[name_giou].sum() / num_gts
+        loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
+        return loss
+
+    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
+                       postfix=""):
+        """Focal and dice mask losses over matched pairs.
+
+        Predicted masks are bilinearly resized to the ground-truth resolution
+        first. Returns a dict with keys ``"loss_mask" + postfix`` and
+        ``"loss_dice" + postfix``; both are zero tensors when there is no
+        ground-truth mask.
+        """
+        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
+        name_mask = "loss_mask" + postfix
+        name_dice = "loss_dice" + postfix
+
+        loss = dict()
+        if sum(len(a) for a in gt_mask) == 0:
+            loss[name_mask] = paddle.to_tensor([0.])
+            loss[name_dice] = paddle.to_tensor([0.])
+            return loss
+
+        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
+                                                              match_indices)
+        src_masks = F.interpolate(
+            src_masks.unsqueeze(0),
+            size=target_masks.shape[-2:],
+            mode="bilinear")[0]
+        loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
+            src_masks,
+            target_masks,
+            paddle.to_tensor(
+                [num_gts], dtype='float32'))
+        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
+            src_masks, target_masks, num_gts)
+        return loss
+
+    def _dice_loss(self, inputs, targets, num_gts):
+        """Dice loss on sigmoid(inputs), summed and divided by num_gts."""
+        inputs = F.sigmoid(inputs)
+        inputs = inputs.flatten(1)
+        targets = targets.flatten(1)
+        numerator = 2 * (inputs * targets).sum(1)
+        denominator = inputs.sum(-1) + targets.sum(-1)
+        # +1 in numerator and denominator keeps the ratio defined for empty masks.
+        loss = 1 - (numerator + 1) / (denominator + 1)
+        return loss.sum() / num_gts
+
+    def _get_loss_aux(self,
+                      boxes,
+                      logits,
+                      gt_bbox,
+                      gt_class,
+                      bg_index,
+                      num_gts,
+                      dn_match_indices=None,
+                      postfix="",
+                      masks=None,
+                      gt_mask=None):
+        """Sum the per-layer losses of the auxiliary decoder outputs.
+
+        The assignment used for a layer is, in order of precedence:
+        ``dn_match_indices`` when given, the single assignment computed on
+        layer ``uni_match_ind`` when ``use_uni_match`` is set, otherwise a
+        fresh matching for that layer.
+
+        Returns a dict keyed ``"loss_{class,bbox,giou}_aux" + postfix`` (plus
+        the mask/dice entries when masks are supplied).
+        """
+        loss_class = []
+        loss_bbox, loss_giou = [], []
+        loss_mask, loss_dice = [], []
+        if dn_match_indices is not None:
+            match_indices = dn_match_indices
+        elif self.use_uni_match:
+            match_indices = self.matcher(
+                boxes[self.uni_match_ind],
+                logits[self.uni_match_ind],
+                gt_bbox,
+                gt_class,
+                masks=masks[self.uni_match_ind] if masks is not None else None,
+                gt_mask=gt_mask)
+        for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
+            aux_masks = masks[i] if masks is not None else None
+            if not self.use_uni_match and dn_match_indices is None:
+                match_indices = self.matcher(
+                    aux_boxes,
+                    aux_logits,
+                    gt_bbox,
+                    gt_class,
+                    masks=aux_masks,
+                    gt_mask=gt_mask)
+            if self.use_vfl:
+                if sum(len(a) for a in gt_bbox) > 0:
+                    # IoU of matched pairs, computed on detached boxes.
+                    src_bbox, target_bbox = self._get_src_target_assign(
+                        aux_boxes.detach(), gt_bbox, match_indices)
+                    iou_score = bbox_iou(
+                        bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
+                        bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
+                else:
+                    iou_score = None
+            else:
+                iou_score = None
+            loss_class.append(
+                self._get_loss_class(aux_logits, gt_class, match_indices,
+                                     bg_index, num_gts, postfix, iou_score)[
+                                         'loss_class' + postfix])
+            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
+                                        num_gts, postfix)
+            loss_bbox.append(loss_['loss_bbox' + postfix])
+            loss_giou.append(loss_['loss_giou' + postfix])
+            if masks is not None and gt_mask is not None:
+                loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
+                                            num_gts, postfix)
+                loss_mask.append(loss_['loss_mask' + postfix])
+                loss_dice.append(loss_['loss_dice' + postfix])
+        loss = {
+            "loss_class_aux" + postfix: paddle.add_n(loss_class),
+            "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
+            "loss_giou_aux" + postfix: paddle.add_n(loss_giou)
+        }
+        if masks is not None and gt_mask is not None:
+            loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
+            loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
+        return loss
+
+    def _get_index_updates(self, num_query_objects, target, match_indices):
+        """Return flat query indices and the targets gathered for them.
+
+        The index of a matched query in image ``i`` is
+        ``src + i * num_query_objects``, i.e. its position in the
+        batch-flattened query list.
+        """
+        batch_idx = paddle.concat([
+            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
+        ])
+        src_idx = paddle.concat([src for (src, _) in match_indices])
+        src_idx += (batch_idx * num_query_objects)
+        target_assign = paddle.concat([
+            paddle.gather(
+                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
+        ])
+        return src_idx, target_assign
+
+    def _get_src_target_assign(self, src, target, match_indices):
+        """Gather matched predictions and targets, concatenated over the batch.
+
+        An image without matches contributes an empty ``[0, last_dim]`` tensor.
+        """
+        src_assign = paddle.concat([
+            paddle.gather(
+                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
+            for t, (I, _) in zip(src, match_indices)
+        ])
+        target_assign = paddle.concat([
+            paddle.gather(
+                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
+            for t, (_, J) in zip(target, match_indices)
+        ])
+        return src_assign, target_assign
+
+    def _get_num_gts(self, targets, dtype="float32"):
+        """Number of ground truths as a 1-element tensor, at least 1.
+
+        With more than one process the count is all-reduced and divided by
+        the world size before clipping.
+        """
+        num_gts = sum(len(a) for a in targets)
+        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
+        if paddle.distributed.get_world_size() > 1:
+            paddle.distributed.all_reduce(num_gts)
+            num_gts /= paddle.distributed.get_world_size()
+        num_gts = paddle.clip(num_gts, min=1.)
+        return num_gts
+
+    def _get_prediction_loss(self,
+                             boxes,
+                             logits,
+                             gt_bbox,
+                             gt_class,
+                             masks=None,
+                             gt_mask=None,
+                             postfix="",
+                             dn_match_indices=None,
+                             num_gts=1):
+        """Losses for a single decoder layer.
+
+        Uses ``dn_match_indices`` as the assignment when given, otherwise
+        runs the matcher. Returns the merged class / bbox / giou (and, when
+        masks are supplied, mask / dice) loss dict.
+        """
+        if dn_match_indices is None:
+            match_indices = self.matcher(
+                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
+        else:
+            match_indices = dn_match_indices
+
+        if self.use_vfl:
+            if sum(len(a) for a in gt_bbox) > 0:
+                # IoU of matched pairs, computed on detached boxes.
+                src_bbox, target_bbox = self._get_src_target_assign(
+                    boxes.detach(), gt_bbox, match_indices)
+                iou_score = bbox_iou(
+                    bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
+                    bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
+            else:
+                iou_score = None
+        else:
+            iou_score = None
+
+        loss = dict()
+        loss.update(
+            self._get_loss_class(logits, gt_class, match_indices,
+                                 self.num_classes, num_gts, postfix, iou_score))
+        loss.update(
+            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
+                                postfix))
+        if masks is not None and gt_mask is not None:
+            loss.update(
+                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
+                                    postfix))
+        return loss
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None,
+                postfix="",
+                **kwargs):
+        r"""
+        Args:
+            boxes (Tensor): [l, b, query, 4]
+            logits (Tensor): [l, b, query, num_classes]
+            gt_bbox (List(Tensor)): list[[n, 4]]
+            gt_class (List(Tensor)): list[[n, 1]]
+            masks (Tensor, optional): [l, b, query, h, w]
+            gt_mask (List(Tensor), optional): list[[n, H, W]]
+            postfix (str): postfix of loss name
+            **kwargs: optional ``dn_match_indices`` (precomputed assignment)
+                and ``num_gts`` (normalizer; computed from ``gt_class`` when
+                absent).
+
+        Returns:
+            dict: losses of the last layer, plus the ``*_aux`` losses of the
+            earlier layers when ``aux_loss`` is set.
+        """
+
+        dn_match_indices = kwargs.get("dn_match_indices", None)
+        num_gts = kwargs.get("num_gts", None)
+        if num_gts is None:
+            num_gts = self._get_num_gts(gt_class)
+
+        total_loss = self._get_prediction_loss(
+            boxes[-1],
+            logits[-1],
+            gt_bbox,
+            gt_class,
+            masks=masks[-1] if masks is not None else None,
+            gt_mask=gt_mask,
+            postfix=postfix,
+            dn_match_indices=dn_match_indices,
+            num_gts=num_gts)
+
+        if self.aux_loss:
+            total_loss.update(
+                self._get_loss_aux(
+                    boxes[:-1],
+                    logits[:-1],
+                    gt_bbox,
+                    gt_class,
+                    self.num_classes,
+                    num_gts,
+                    dn_match_indices,
+                    postfix,
+                    masks=masks[:-1] if masks is not None else None,
+                    gt_mask=gt_mask))
+
+        return total_loss
+
+
+@register
+class DINOLoss(DETRLoss):
+    """DETR loss extended with the denoising-branch losses.
+
+    Denoising losses carry the ``_dn`` postfix. When no denoising metadata is
+    given, a zero tensor is stored under each ``<key>_dn`` name instead.
+    """
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None,
+                postfix="",
+                dn_out_bboxes=None,
+                dn_out_logits=None,
+                dn_meta=None,
+                **kwargs):
+        """Compute the matching losses and, if available, the denoising ones.
+
+        ``dn_meta`` is read through its "dn_positive_idx" and "dn_num_group"
+        keys. ``masks``, ``gt_mask``, ``postfix`` and ``**kwargs`` are
+        accepted but not used by this method.
+        """
+        num_gts = self._get_num_gts(gt_class)
+        total_loss = super(DINOLoss, self).forward(
+            boxes, logits, gt_bbox, gt_class, num_gts=num_gts)
+
+        if dn_meta is None:
+            # No denoising branch: report zeros under the `_dn` names.
+            total_loss.update(
+                {k + '_dn': paddle.to_tensor([0.])
+                 for k in total_loss.keys()})
+            return total_loss
+
+        dn_positive_idx = dn_meta["dn_positive_idx"]
+        dn_num_group = dn_meta["dn_num_group"]
+        assert len(gt_class) == len(dn_positive_idx)
+
+        # denoising match indices
+        dn_match_indices = self.get_dn_match_indices(gt_class, dn_positive_idx,
+                                                     dn_num_group)
+
+        # compute denoising training loss
+        num_gts *= dn_num_group
+        dn_loss = super(DINOLoss, self).forward(
+            dn_out_bboxes,
+            dn_out_logits,
+            gt_bbox,
+            gt_class,
+            postfix="_dn",
+            dn_match_indices=dn_match_indices,
+            num_gts=num_gts)
+        total_loss.update(dn_loss)
+        return total_loss
+
+    @staticmethod
+    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
+        """Build the fixed (query index, gt index) pairs of the denoising part.
+
+        For an image with n > 0 labels the gt indices are ``0..n-1`` tiled
+        ``dn_num_group`` times and paired with ``dn_positive_idx[i]``; an
+        image without labels gets a pair of empty int64 tensors.
+        """
+        dn_match_indices = []
+        for i, label in enumerate(labels):
+            num_gt = len(label)
+            if num_gt <= 0:
+                dn_match_indices.append((paddle.zeros(
+                    [0], dtype="int64"), paddle.zeros(
+                        [0], dtype="int64")))
+                continue
+            gt_idx = paddle.arange(end=num_gt, dtype="int64")
+            gt_idx = gt_idx.tile([dn_num_group])
+            assert len(dn_positive_idx[i]) == len(gt_idx)
+            dn_match_indices.append((dn_positive_idx[i], gt_idx))
+        return dn_match_indices
+
+
+@register
+class MaskDINOLoss(DETRLoss):
+    __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
+    __inject__ = ['matcher']
+
+    def __init__(self,
+                 num_classes=80,
+                 matcher='HungarianMatcher',
+                 loss_coeff={
+                     'class': 4,
+                     'bbox': 5,
+                     'giou': 2,
+                     'mask': 5,
+                     'dice': 5
+                 },
+                 aux_loss=True,
+                 use_focal_loss=False,
+                 num_sample_points=12544,
+                 oversample_ratio=3.0,
+                 important_sample_ratio=0.75):
+        super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,
+                                           aux_loss, use_focal_loss)
+        assert oversample_ratio >= 1
+        assert important_sample_ratio <= 1 and important_sample_ratio >= 0
+
+        self.num_sample_points = num_sample_points
+        self.oversample_ratio = oversample_ratio
+        self.important_sample_ratio = important_sample_ratio
+        self.num_oversample_points = int(num_sample_points * oversample_ratio)
+        self.num_important_points = int(num_sample_points *
+                                        important_sample_ratio)
+        self.num_random_points = num_sample_points - self.num_important_points
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None,
+                postfix="",
+                dn_out_bboxes=None,
+                dn_out_logits=None,
+                dn_out_masks=None,
+                dn_meta=None,
+                **kwargs):
+        num_gts = self._get_num_gts(gt_class)
+        total_loss = super(MaskDINOLoss, self).forward(
+            boxes,
+            logits,
+            gt_bbox,
+            gt_class,
+            masks=masks,
+            gt_mask=gt_mask,
+            num_gts=num_gts)
+
+        if dn_meta is not None:
+            dn_positive_idx, dn_num_group = \
+                dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+            assert len(gt_class) == len(dn_positive_idx)
+
+            # denoising match indices
+            dn_match_indices = DINOLoss.get_dn_match_indices(
+                gt_class, dn_positive_idx, dn_num_group)
+
+            # compute denoising training loss
+            num_gts *= dn_num_group
+            dn_loss = super(MaskDINOLoss, self).forward(
+                dn_out_bboxes,
+                dn_out_logits,
+                gt_bbox,
+                gt_class,
+                masks=dn_out_masks,
+                gt_mask=gt_mask,
+                postfix="_dn",
+                dn_match_indices=dn_match_indices,
+                num_gts=num_gts)
+            total_loss.update(dn_loss)
+        else:
+            total_loss.update(
+                {k + '_dn': paddle.to_tensor([0.])
+                 for k in total_loss.keys()})
+
+        return total_loss
+
+    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
+                       postfix=""):
+        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
+        name_mask = "loss_mask" + postfix
+        name_dice = "loss_dice" + postfix
+
+        loss = dict()
+        if sum(len(a) for a in gt_mask) == 0:
+            loss[name_mask] = paddle.to_tensor([0.])
+            loss[name_dice] = paddle.to_tensor([0.])
+            return loss
+
+        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
+                                                              match_indices)
+        # sample points
+        sample_points = self._get_point_coords_by_uncertainty(src_masks)
+        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0
+
+        src_masks = F.grid_sample(
+            src_masks.unsqueeze(1), sample_points,
+            align_corners=False).squeeze([1, 2])
+
+        target_masks = F.grid_sample(
+            target_masks.unsqueeze(1), sample_points,
+            align_corners=False).squeeze([1, 2]).detach()
+
+        loss[name_mask] = self.loss_coeff[
+            'mask'] * F.binary_cross_entropy_with_logits(
+                src_masks, target_masks,
+                reduction='none').mean(1).sum() / num_gts
+        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
+            src_masks, target_masks, num_gts)
+        return loss
+
+    def _get_point_coords_by_uncertainty(self, masks):
+        # Sample points based on their uncertainty.
+        masks = masks.detach()
+        num_masks = masks.shape[0]
+        sample_points = paddle.rand(
+            [num_masks, 1, self.num_oversample_points, 2])
+
+        out_mask = F.grid_sample(
+            masks.unsqueeze(1), 2.0 * sample_points - 1.0,
+            align_corners=False).squeeze([1, 2])
+        out_mask = -paddle.abs(out_mask)
+
+        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
+        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
+        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
+        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
+
+        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
+        if self.num_random_points > 0:
+            sample_points = paddle.concat(
+                [
+                    sample_points,
+                    paddle.rand([num_masks, self.num_random_points, 2])
+                ],
+                axis=1)
+        return sample_points
--- a/rtdetr_paddle/ppdet/modeling/losses/focal_loss.py
+++ b/rtdetr_paddle/ppdet/modeling/losses/focal_loss.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn.functional as F
+import paddle.nn as nn
+from ppdet.core.workspace import register
+
+__all__ = ['FocalLoss', 'Weighted_FocalLoss']
+
+@register
+class FocalLoss(nn.Layer):
+    """A wrapper around paddle.nn.functional.sigmoid_focal_loss.
+    Args:
+        use_sigmoid (bool): currently only support use_sigmoid=True
+        alpha (float): parameter alpha in Focal Loss
+        gamma (float): parameter gamma in Focal Loss
+        loss_weight (float): final loss will be multiplied by this
+    """
+    def __init__(self,
+                 use_sigmoid=True,
+                 alpha=0.25,
+                 gamma=2.0,
+                 loss_weight=1.0):
+        super(FocalLoss, self).__init__()
+        assert use_sigmoid == True, \
+            'Focal Loss only supports sigmoid at the moment'
+        self.use_sigmoid = use_sigmoid
+        self.alpha = alpha
+        self.gamma = gamma
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, reduction='none'):
+        """forward function.
+        Args:
+            pred (Tensor): logits of class prediction, of shape (N, num_classes)
+            target (Tensor): target class label, of shape (N, )
+            reduction (str): the way to reduce loss, one of (none, sum, mean)
+        """
+        num_classes = pred.shape[1]
+        target = F.one_hot(target, num_classes+1).cast(pred.dtype)
+        target = target[:, :-1].detach()
+        loss = F.sigmoid_focal_loss(
+            pred, target, alpha=self.alpha, gamma=self.gamma,
+            reduction=reduction)
+        return loss * self.loss_weight
+
+
+@register
+class Weighted_FocalLoss(FocalLoss):
+    """A wrapper around paddle.nn.functional.sigmoid_focal_loss.
+    Args:
+        use_sigmoid (bool): currently only support use_sigmoid=True
+        alpha (float): parameter alpha in Focal Loss
+        gamma (float): parameter gamma in Focal Loss
+        loss_weight (float): final loss will be multiplied by this
+    """
+    def __init__(self,
+                 use_sigmoid=True,
+                 alpha=0.25,
+                 gamma=2.0,
+                 loss_weight=1.0,
+                 reduction="mean"):
+        super(FocalLoss, self).__init__()
+        assert use_sigmoid == True, \
+            'Focal Loss only supports sigmoid at the moment'
+        self.use_sigmoid = use_sigmoid
+        self.alpha = alpha
+        self.gamma = gamma
+        self.loss_weight = loss_weight
+        self.reduction = reduction
+
+    def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
+        """forward function.
+        Args:
+            pred (Tensor): logits of class prediction, of shape (N, num_classes)
+            target (Tensor): target class label, of shape (N, )
+            reduction (str): the way to reduce loss, one of (none, sum, mean)
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        num_classes = pred.shape[1]
+        target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
+        target = target[:, :-1].detach()
+        loss = F.sigmoid_focal_loss(
+            pred, target, alpha=self.alpha, gamma=self.gamma,
+            reduction='none')
+
+        if weight is not None:
+            if weight.shape != loss.shape:
+                if weight.shape[0] == loss.shape[0]:
+                    # For most cases, weight is of shape (num_priors, ),
+                    #  which means it does not have the second axis num_class
+                    weight = weight.reshape((-1, 1))
+                else:
+                    # Sometimes, weight per anchor per class is also needed. e.g.
+                    #  in FSAF. But it may be flattened of shape
+                    #  (num_priors x num_class, ), while loss is still of shape
+                    #  (num_priors, num_class).
+                    assert weight.numel() == loss.numel()
+                    weight = weight.reshape((loss.shape[0], -1))
+            assert weight.ndim == loss.ndim
+            loss = loss * weight
+
+        # if avg_factor is not specified, just reduce the loss
+        if avg_factor is None:
+            if reduction == 'mean':
+                loss = loss.mean()
+            elif reduction == 'sum':
+                loss = loss.sum()
+        else:
+            # if reduction is mean, then average the loss by avg_factor
+            if reduction == 'mean':
+                # Avoid causing ZeroDivisionError when avg_factor is 0.0,
+                # i.e., all labels of an image belong to ignore index.
+                eps = 1e-10
+                loss = loss.sum() / (avg_factor + eps)
+            # if reduction is 'none', then do nothing, otherwise raise an error
+            elif reduction != 'none':
+                raise ValueError('avg_factor can not be used with reduction="sum"')
+
+        return loss * self.loss_weight
--- a/Show More
+++ b/Show More