commit ec2379914810d4ca503155e838a427a7d83b3329
Author: 陈赣 <gavin@gavinm2.local>
Date:   Wed Jun 3 12:42:47 2026 +0800

    first commit

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..8dee225
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: ['https://github.com/lyuwenyu/cvperception/assets/17582080/2b4bfcd5-5c0f-45fd-badf-3f6e5b0249ac'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..53c082a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,21 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: lyuwenyu
+
+---
+
+**Star RTDETR**
+请先在RTDETR主页点击**star**以支持本项目
+Star RTDETR to help more people discover this project. 
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is. 
+If applicable, add screenshots to help explain your problem. 
+
+**To Reproduce**
+Steps to reproduce the behavior.
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3a63153
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,172 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+
+.DS_Store
+*.ipynb
+*.pth 
+*.pdparams
+*.onnx
+test.py
+rtdetr_pytorch/output/
+rtdetr_pytorch/dataset/
+rtdetrv2_pytorch/output/
+rtdetrv2_pytorch/dataset/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e04aa87
--- /dev/null
+++ b/README.md
@@ -0,0 +1,140 @@
+English | [简体中文](README_cn.md)
+
+
+<h2 align="center">RT-DETR: DETRs Beat YOLOs on Real-time Object Detection</h2>
+<p align="center">
+    <!-- <a href="https://github.com/lyuwenyu/RT-DETR/blob/main/LICENSE">
+        <img alt="license" src="https://img.shields.io/badge/LICENSE-Apache%202.0-blue">
+    </a> -->
+    <a href="https://github.com/lyuwenyu/RT-DETR/blob/main/LICENSE">
+        <img alt="license" src="https://img.shields.io/github/license/lyuwenyu/RT-DETR">
+    </a>
+    <a href="https://github.com/lyuwenyu/RT-DETR/pulls">
+        <img alt="prs" src="https://img.shields.io/github/issues-pr/lyuwenyu/RT-DETR">
+    </a>
+    <a href="https://github.com/lyuwenyu/RT-DETR/issues">
+        <img alt="issues" src="https://img.shields.io/github/issues/lyuwenyu/RT-DETR?color=pink">
+    </a>
+    <a href="https://github.com/lyuwenyu/RT-DETR">
+        <img alt="stars" src="https://img.shields.io/github/stars/lyuwenyu/RT-DETR">
+    </a>
+    <a href="https://arxiv.org/abs/2304.08069">
+        <img alt="arXiv" src="https://img.shields.io/badge/arXiv-2304.08069-red">
+    </a>
+    <a href="mailto:lyuwenyu@foxmail.com">
+        <img alt="email" src="https://img.shields.io/badge/contact_me-email-yellow">
+    </a>
+</p>
+
+---
+
+
+This is the official implementation of papers 
+- [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)
+- [RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140)
+
+
+<details>
+<summary>Fig</summary>
+
+<table><tr>
+<td><img src=https://github.com/lyuwenyu/RT-DETR/assets/77494834/0ede1dc1-a854-43b6-9986-cf9090f11a61 border=0 width=500></td>
+<td><img src=https://github.com/user-attachments/assets/437877e9-1d4f-4d30-85e8-aafacfa0ec56 border=0 width=500></td>
+</tr></table>
+</details>
+
+
+## 🚀 Updates
+- \[2025.11.18\] Release the **newest** member of the RT-DETR family: [RT-DETRv4:Painlessly Furthering Real-Time Object Detection with Vision Foundation Models](https://github.com/RT-DETRs/RT-DETRv4).
+By harnessing the rapidly evolving capabilities of Vision Foundation Models (VFMs), we boost lightweight detectors and, without incurring any extra inference latency, significantly improve the performance of the full-size model.
+- \[2024.11.28\] Add torch tool for parameters and flops statistics. see [run_profile.py](./rtdetrv2_pytorch/tools/run_profile.py)
+- \[2024.10.10\] Add sliced inference support for small object detection. [#468](https://github.com/lyuwenyu/RT-DETR/pull/468)
+- \[2024.09.23\] Add ✅[Regnet and DLA34](https://github.com/lyuwenyu/RT-DETR/tree/main/rtdetr_pytorch) for RTDETR.
+- \[2024.08.27\] Add hubconf.py file to support torch hub.
+- \[2024.08.22\] Improve the performance of ✅ [RT-DETRv2-S](./rtdetrv2_pytorch/) to 48.1 mAP (<font color=green>+1.6</font> compared to RT-DETR-R18).
+- \[2024.07.24\] Release ✅ [RT-DETRv2](./rtdetrv2_pytorch/)!
+- \[2024.02.27\] Our work has been accepted to CVPR 2024!
+- \[2024.01.23\] Fix difference on data augmentation with paper in rtdetr_pytorch [#84](https://github.com/lyuwenyu/RT-DETR/commit/5dc64138e439247b4e707dd6cebfe19d8d77f5b1).
+- \[2023.11.07\] Add pytorch ✅ *rtdetr_r34vd* for requests [#107](https://github.com/lyuwenyu/RT-DETR/issues/107), [#114](https://github.com/lyuwenyu/RT-DETR/issues/114).
+- \[2023.11.05\] Upgrade the logic of `remap_mscoco_category` to facilitate training of custom datasets, see details in [*Train custom data*](./rtdetr_pytorch/) part. [#81](https://github.com/lyuwenyu/RT-DETR/commit/95fc522fd7cf26c64ffd2ad0c622c392d29a9ebf).
+- \[2023.10.23\] Add [*discussion for deployments*](https://github.com/lyuwenyu/RT-DETR/issues/95), supported onnxruntime, TensorRT, openVINO.
+- \[2023.10.12\] Add tuning code for pytorch version, now you can fine-tune rtdetr based on pretrained weights.
+- \[2023.09.19\] Upload ✅ [*pytorch weights*](https://github.com/lyuwenyu/RT-DETR/issues/42) converted from paddle version.
+- \[2023.08.24\] Release RT-DETR-R18 pretrained models on objects365. *49.2 mAP* and *217 FPS*.
+- \[2023.08.22\] Upload ✅ [*rtdetr_pytorch*](./rtdetr_pytorch/) source code. Please enjoy it!
+- \[2023.08.15\] Release RT-DETR-R101 pretrained models on objects365. *56.2 mAP* and *74 FPS*.
+- \[2023.07.30\] Release RT-DETR-R50 pretrained models on objects365. *55.3 mAP* and *108 FPS*.
+- \[2023.07.28\] Fix some bugs, and add some comments. [1](https://github.com/lyuwenyu/RT-DETR/pull/14), [2](https://github.com/lyuwenyu/RT-DETR/commit/3b5cbcf8ae3b907e6b8bb65498a6be7c6736eabc).
+- \[2023.07.13\] Upload ✅ [*training logs on coco*](https://github.com/lyuwenyu/RT-DETR/issues/8).
+- \[2023.05.17\] Release RT-DETR-R18, RT-DETR-R34, RT-DETR-R50-m (example for scaled).
+- \[2023.04.17\] Release RT-DETR-R50, RT-DETR-R101, RT-DETR-L, RT-DETR-X.
+
+## 📣 News
+- RTDETR and RTDETRv2 are now available in Hugging Face Transformers. [#413](https://github.com/lyuwenyu/RT-DETR/issues/413), [#549](https://github.com/lyuwenyu/RT-DETR/issues/549)
+- RTDETR is now available in [ultralytics/ultralytics](https://docs.ultralytics.com/zh/models/rtdetr/).
+
+## 📍 Implementations
+- 🔥 RT-DETRv2
+  - paddle: [code&weight](./rtdetrv2_paddle/)
+  - pytorch: [code&weight](./rtdetrv2_pytorch/)
+- 🔥 RT-DETR 
+  - paddle: [code&weight](./rtdetr_paddle)
+  - pytorch: [code&weight](./rtdetr_pytorch)
+
+
+| Model | Input shape | Dataset | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS)
+|:---:|:---:| :---:|:---:|:---:|:---:|:---:|:---:|
+| RT-DETR-R18 | 640 | COCO | 46.5 | 63.8 | 20 | 60 | 217 |
+| RT-DETR-R34 | 640 | COCO | 48.9 | 66.8 | 31 | 92 | 161 |
+| RT-DETR-R50-m | 640 | COCO | 51.3 | 69.6 | 36 | 100 | 145 |
+| RT-DETR-R50 |  640 | COCO | 53.1 | 71.3 | 42 | 136 | 108 |
+| RT-DETR-R101 | 640 | COCO | 54.3 | 72.7 | 76 | 259 | 74 |
+| RT-DETR-HGNetv2-L | 640 | COCO | 53.0 | 71.6 | 32 | 110 | 114 |
+| RT-DETR-HGNetv2-X | 640 | COCO | 54.8 | 73.1 | 67 | 234 | 74 |
+| RT-DETR-R18 | 640 | COCO + Objects365 | **49.2** | **66.6** | 20 | 60 | **217** |
+| RT-DETR-R50 | 640 | COCO + Objects365 | **55.3** | **73.4** | 42 | 136 | **108** |
+| RT-DETR-R101 | 640 | COCO + Objects365 | **56.2** | **74.6** | 76 | 259 | **74** |
+**RT-DETRv2-S** | 640 | COCO  | **48.1** <font color=green>(+1.6)</font> | **65.1** | 20 | 60 | 217 |
+**RT-DETRv2-M**<sup>*</sup> | 640 | COCO  | **49.9** <font color=green>(+1.0)</font> | **67.5** | 31 | 92 | 161 |
+**RT-DETRv2-M** | 640 | COCO | **51.9** <font color=green>(+0.6)</font> | **69.9** | 36 | 100 | 145 |
+**RT-DETRv2-L** | 640 | COCO | **53.4** <font color=green>(+0.3)</font> | **71.6** | 42 | 136 | 108 |
+**RT-DETRv2-X** | 640 | COCO | 54.3 | **72.8** <font color=green>(+0.1)</font>  | 76 | 259| 74 |
+
+**Notes:**
+- `COCO + Objects365` in the table means finetuned model on COCO using pretrained weights trained on Objects365.
+
+
+## 🦄 Performance
+
+### 🏕️ Complex Scenarios
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/52743892-68c8-4e53-b782-9f89221739e4" width=500 >
+</div>
+
+### 🌋 Difficult Conditions
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/213cf795-6da6-4261-8549-11947292d3cb" width=500 >
+</div>
+
+## Citation
+If you use `RT-DETR` or `RTDETRv2` in your work, please use the following BibTeX entries:
+```
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Yian Zhao and Wenyu Lv and Shangliang Xu and Jinman Wei and Guanzhong Wang and Qingqing Dang and Yi Liu and Jie Chen},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{lv2024rtdetrv2improvedbaselinebagoffreebies,
+      title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, 
+      author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu},
+      year={2024},
+      eprint={2407.17140},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2407.17140}, 
+}
+```
diff --git a/README_cn.md b/README_cn.md
new file mode 100644
index 0000000..8571b46
--- /dev/null
+++ b/README_cn.md
@@ -0,0 +1,64 @@
+简体中文 | [English](README.md)
+
+# RT-DETR 
+
+文章"[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)"和"[RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140)"的官方实现.
+
+<details>
+<summary>Fig</summary>
+
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/0ede1dc1-a854-43b6-9986-cf9090f11a61" width=500 >
+</div>
+
+</details>
+
+
+## 最新动态
+- 发布RT-DETRv2系列模型
+- 发布RT-DETR-R50, RT-DETR-R101模型
+- 发布RT-DETR-R50-m模型（scale模型的范例）
+- 发布RT-DETR-R34, RT-DETR-R18模型
+- 发布RT-DETR-L, RT-DETR-X模型
+
+
+## 代码仓库
+- 🔥 RT-DETRv2
+  - paddle: [code&weight](./rtdetrv2_paddle/)
+  - pytorch: [code&weight](./rtdetrv2_pytorch/)
+- 🔥 RT-DETR 
+  - paddle: [code&weight](./rtdetr_paddle)
+  - pytorch: [code&weight](./rtdetr_pytorch)
+
+
+## 简介
+<!-- We propose a **R**eal-**T**ime **DE**tection **TR**ansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge. Specifically, we design an efficient hybrid encoder to efficiently process multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexibly adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS.  -->
+RT-DETR是第一个实时端到端目标检测器。具体而言，我们设计了一个高效的混合编码器，通过解耦尺度内交互和跨尺度融合来高效处理多尺度特征，并提出了IoU感知的查询选择机制，以优化解码器查询的初始化。此外，RT-DETR支持通过使用不同的解码器层来灵活调整推理速度，而不需要重新训练，这有助于实时目标检测器的实际应用。RT-DETR-R50在COCO val2017上实现了53.1%的AP，在T4 GPU上实现了108FPS，RT-DETR-R101实现了54.3%的AP和74FPS，在速度和精度方面都优于相同规模的所有YOLO检测器。使用Objects365预训练之后, RT-DETR-R50 和 RT-DETR-R101 分别实现了 55.3% 和 56.2% AP的精度.
+若要了解更多细节，请参考我们的论文[paper](https://arxiv.org/abs/2304.08069).
+
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/77494834/c211a164-ddce-4084-8b71-fb73f29f363b" width=500 >
+</div>
+
+## 引用RT-DETR
+如果需要在你的研究中使用RT-DETR，请通过以下方式引用我们的论文：
+```
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Yian Zhao and Wenyu Lv and Shangliang Xu and Jinman Wei and Guanzhong Wang and Qingqing Dang and Yi Liu and Jie Chen},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{lv2024rtdetrv2improvedbaselinebagoffreebies,
+      title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, 
+      author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu},
+      year={2024},
+      eprint={2407.17140},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2407.17140}, 
+}
+```
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..779ce65
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,59 @@
+# 论文测速使用的部分代码和工具
+
+
+## 测试YOLO系列的速度 [in progress]
+以[yolov8](https://github.com/ultralytics/ultralytics)为例
+
+<details open>
+<summary>1. 转onnx </summary>  
+
+执行`yolov8_onnx.py`中的`export_onnx`函数，新增代码主要涉及输出格式的转换
+</details>
+
+
+<details>
+<summary>2. 插入nms </summary>
+
+使用`utils.py`中的`yolo_insert_nms`函数，导出onnx模型后使用[Netron](https://netron.app/)查看结构. <img width="924" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/cb466483-d3a3-4f23-a68d-7ab8825059c8">
+</details>
+
+
+<details>
+<summary>3. 转tensorrt </summary>
+
+可以使用`trtexec.md`中的脚本转换，或者使用`utils.py`中的Python代码转换
+```bash
+# trtexec -h
+trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
+```
+</details>
+
+
+<details>
+<summary>4. trtexec测速 </summary>
+
+可以使用`trtexec.md`中的脚本转换，去掉`--buildOnly`参数
+
+</details>
+
+
+
+<details>
+<summary>5. profile分析（可选） </summary>
+
+在4的基础之上加以下命令
+```bash
+nsys profile --force-overwrite=true  -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms 
+```
+可以使用nsys可视化分析
+<img width="1090" alt="image" src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/507d8bde-9e7c-4ae5-b571-976c540ef2c6">
+
+</details>
+
+
+<details>
+<summary>6. Python测速或者部署   </summary>
+
+在Coco val数据集上测模型的平均速度使用`trtinfer.py`中的代码推理
+
+</details>
diff --git a/benchmark/dataset.py b/benchmark/dataset.py
new file mode 100644
index 0000000..ce60173
--- /dev/null
+++ b/benchmark/dataset.py
@@ -0,0 +1,102 @@
+'''by lyuwenyu
+'''
+
+import os
+import glob
+from PIL import Image
+
+import torch
+import torch.utils.data as data
+import torchvision
+import torchvision.transforms as T 
+import torchvision.transforms.functional as F 
+
+
+class ToTensor(T.ToTensor):  # T.ToTensor variant that passes torch.Tensor inputs through unchanged
+    def __init__(self) -> None:
+        super().__init__()
+
+    def __call__(self, pic):  # pic: a torch.Tensor, or any input the parent T.ToTensor accepts
+        if isinstance(pic, torch.Tensor):
+            return pic  # already a tensor: returned as-is
+        return super().__call__(pic)  # every other input is delegated to T.ToTensor
+
+class PadToSize(T.Pad):  # pads an image up to a fixed target size instead of by a fixed amount
+    def __init__(self, size, fill=0, padding_mode='constant'):  # size is indexed as size[0] vs width, size[1] vs height in __call__
+        super().__init__(0, fill, padding_mode)  # parent padding is 0; the real amounts are computed per image in __call__
+        self.size = size
+        self.fill = fill
+
+    def __call__(self, img):
+        """
+        Args:
+            img (PIL Image or Tensor): Image to be padded.
+
+        Returns:
+            PIL Image or Tensor: Padded image.
+        """
+        w, h = F.get_image_size(img)
+        padding = (0, 0, self.size[0] - w, self.size[1] - h)  # NOTE(review): torchvision documents a 4-tuple as (left, top, right, bottom), so only right/bottom grow; values go negative if img exceeds self.size — confirm callers resize first
+        return F.pad(img, padding, self.fill, self.padding_mode)
+
+
+class Dataset(data.Dataset):  # image-folder dataset: decodes each '*.jpg' in img_dir and returns a dict of tensors
+    def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None:  # preprocess=None selects the default pipeline built below
+        super().__init__()
+
+        self.device = device  # passed to decode_jpeg in __getitem__
+        self.size = 640  # fixed value reported as 'im_shape' and used to compute 'scale_factor'
+
+        self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg')))  # only files matching '*.jpg' directly inside img_dir
+
+        if preprocess is None:  # no pipeline supplied: resize, pad to 640x640, convert to float tensor
+            self.preprocess = T.Compose([
+                    T.Resize(size=639, max_size=640),  # NOTE(review): presumably aspect-preserving with the longer edge capped at 640 — confirm against torchvision Resize docs
+                    PadToSize(size=(640, 640), fill=114),
+                    ToTensor(),
+                    T.ConvertImageDtype(torch.float),
+            ])
+        else:
+            self.preprocess = preprocess
+
+    def __len__(self, ):  # number of image paths collected in __init__
+        return len(self.im_path_list)
+
+    def __getitem__(self, index):
+        # Read the raw file bytes, then decode them as RGB on self.device.
+        im = torchvision.io.read_file(self.im_path_list[index])
+        im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device)
+        _, h, w = im.shape # c,h,w of the decoded image, taken before preprocessing
+
+        im = self.preprocess(im)
+
+        blob = {
+            'image': im, 
+            'im_shape': torch.tensor([self.size, self.size]).to(im.device),
+            'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device),  # NOTE(review): per-axis 640/h and 640/w; verify this matches the resize actually applied by self.preprocess
+            'orig_size': torch.tensor([w, h]).to(im.device),  # note the (w, h) order
+        }
+
+        return blob
+
+    @staticmethod
+    def post_process():  # placeholder: takes no arguments and does nothing
+        pass
+
+    @staticmethod
+    def collate_fn():  # placeholder: takes no arguments and does nothing
+        pass
+
+
+def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''):  # draws the kept boxes on each image of the batch and saves one JPEG per image
+    '''show result
+    Keys:
+        'num_dets', 'det_boxes', 'det_scores', 'det_classes'
+    '''    
+    for i in range(blob['image'].shape[0]):  # one iteration per entry along dim 0 of blob['image']
+        det_scores = outputs['det_scores'][i]
+        det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold]  # keep only boxes scoring above the threshold
+        
+        im = (blob['image'][i] * 255).to(torch.uint8)  # assumes image values are floats in [0, 1] — TODO confirm
+        im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2)
+        Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg')  # permuted to channel-last for PIL; saved under a relative path
diff --git a/benchmark/trtexec.md b/benchmark/trtexec.md
new file mode 100644
index 0000000..d41855b
--- /dev/null
+++ b/benchmark/trtexec.md
@@ -0,0 +1,13 @@
+
+```bash
+# build tensorrt engine 
+trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16
+
+# using dynamic shapes
+# --explicitBatch --minShapes=image:1x3x640x640 --optShapes=image:8x3x640x640  --maxShapes=image:16x3x640x640 --shapes=image:8x3x640x640
+
+# timeline 
+nsys profile --force-overwrite=true  -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms  trtexec --loadEngine=./yolov8l_w_nms.engine --fp16 --avgRuns=10 --loadInputs='image:input_tensor.bin'
+
+# https://forums.developer.nvidia.com/t/about-loadinputs-in-trtexec/218880
+```
diff --git a/benchmark/trtinfer.py b/benchmark/trtinfer.py
new file mode 100644
index 0000000..80eefb0
--- /dev/null
+++ b/benchmark/trtinfer.py
@@ -0,0 +1,153 @@
+'''by lyuwenyu
+'''
+
+import time 
+import contextlib
+from collections import namedtuple, OrderedDict
+
+import torch
+import numpy as np
+import tensorrt as trt
+
+from utils import TimeProfiler
+
+class TRTInference(object):
+    '''Run a serialized TensorRT engine with torch tensors or host numpy arrays.
+
+    Args:
+        engine_path: path of the serialized engine file.
+        device: torch device on which the 'torch' backend allocates its buffers.
+        backend: 'torch' (device pointers of torch tensors are bound directly)
+            or 'cuda' (explicit host/device copies on a CUDA stream).
+        max_batch_size: batch size substituted for a dynamic (-1) leading
+            dimension when the I/O buffers are allocated.
+        verbose: use the VERBOSE TensorRT logger instead of INFO.
+
+    NOTE(review): the 'cuda' backend uses a module-level name `cuda` that this
+    file never imports, so it currently fails with NameError. It presumably
+    expects `pycuda.driver` bound as `cuda` -- confirm and import it.
+    '''
+
+    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
+        self.engine_path = engine_path
+        self.device = device
+        self.backend = backend
+        self.max_batch_size = max_batch_size
+
+        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
+
+        self.engine = self.load_engine(engine_path)
+        self.context = self.engine.create_execution_context()
+
+        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
+        # Raw buffer addresses in engine tensor order; passed to execute_v2.
+        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())
+
+        self.input_names = self.get_input_names()
+        self.output_names = self.get_output_names()
+
+        if self.backend == 'cuda':
+            # NOTE(review): `cuda` is not defined in this module (see class docstring).
+            self.stream = cuda.Stream()
+
+        self.time_profile = TimeProfiler()
+
+    def init(self, ):
+        '''Set the `dynamic` flag to False (not called from within this class).'''
+        self.dynamic = False
+
+    def load_engine(self, path):
+        '''Initialise the TensorRT plugins and deserialize the engine stored at `path`.'''
+        trt.init_libnvinfer_plugins(self.logger, '')
+        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
+            return runtime.deserialize_cuda_engine(f.read())
+
+    def get_input_names(self, ):
+        '''Return the names of the engine tensors whose mode is INPUT.'''
+        return [name for name in self.engine
+                if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT]
+
+    def get_output_names(self, ):
+        '''Return the names of the engine tensors whose mode is OUTPUT.'''
+        return [name for name in self.engine
+                if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT]
+
+    def get_bindings(self, engine, context, max_batch_size=32, device=None):
+        '''Allocate one I/O buffer per engine tensor.
+
+        A leading dimension of -1 is replaced by `max_batch_size`; for input
+        tensors that shape is also set on `context`.
+
+        Returns:
+            OrderedDict mapping the tensor name to a
+            Binding(name, dtype, shape, data, ptr) namedtuple.
+        '''
+        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
+        bindings = OrderedDict()
+
+        for name in engine:
+            shape = engine.get_tensor_shape(name)
+            dtype = trt.nptype(engine.get_tensor_dtype(name))
+            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
+
+            if shape[0] == -1:  # dynamic batch dimension
+                shape[0] = max_batch_size
+                if is_input:
+                    context.set_input_shape(name, shape)
+
+            if self.backend == 'cuda':
+                if is_input:
+                    data = np.random.randn(*shape).astype(dtype)
+                else:
+                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
+                ptr = cuda.mem_alloc(data.nbytes)
+                bindings[name] = Binding(name, dtype, shape, data, ptr)
+            else:
+                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
+                bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())
+
+        return bindings
+
+    def run_torch(self, blob):
+        '''Execute the engine on torch inputs.
+
+        Args:
+            blob: mapping that holds a torch tensor for every input name.
+
+        Returns:
+            dict mapping each output name to its preallocated output buffer;
+            the same buffers are reused by every call.
+        '''
+        for n in self.input_names:
+            # Compare as plain tuples: the stored shape comes from the engine
+            # while `blob[n].shape` is a torch.Size, so a direct `!=` between
+            # the two types is not a reliable dimension comparison.
+            new_shape = tuple(blob[n].shape)
+            if tuple(self.bindings[n].shape) != new_shape:
+                self.context.set_input_shape(n, new_shape)
+                self.bindings[n] = self.bindings[n]._replace(shape=new_shape)
+
+        # Bind the caller's tensors directly instead of copying them.
+        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
+        self.context.execute_v2(list(self.bindings_addr.values()))
+        outputs = {n: self.bindings[n].data for n in self.output_names}
+
+        return outputs
+
+    def async_run_cuda(self, blob):
+        '''Execute the engine on host (numpy) inputs through the CUDA stream.
+
+        Inputs are copied host-to-device, the engine is enqueued, outputs are
+        copied back into the host buffers, and the stream is synchronized
+        before the output dict is returned.
+        '''
+        for n in self.input_names:
+            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)
+
+        bindings_addr = [int(v) for v in self.bindings_addr.values()]
+        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)
+
+        outputs = {}
+        for n in self.output_names:
+            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
+            outputs[n] = self.bindings[n].data
+
+        self.stream.synchronize()
+
+        return outputs
+
+    def __call__(self, blob):
+        '''Dispatch `blob` to the runner of the configured backend.
+
+        Raises:
+            ValueError: if `self.backend` is neither 'torch' nor 'cuda'.
+        '''
+        if self.backend == 'torch':
+            return self.run_torch(blob)
+
+        if self.backend == 'cuda':
+            return self.async_run_cuda(blob)
+
+        # Returning None here would let warmup()/speed() silently time nothing.
+        raise ValueError(f"unsupported backend {self.backend!r}; expected 'torch' or 'cuda'")
+
+    def synchronize(self, ):
+        '''Wait for pending device work of the active backend.'''
+        if self.backend == 'torch' and torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+        elif self.backend == 'cuda':
+            self.stream.synchronize()
+
+    def warmup(self, blob, n):
+        '''Run `n` untimed inferences on `blob`.'''
+        for _ in range(n):
+            self(blob)
+
+    def speed(self, blob, n):
+        '''Return the mean time per call over `n` timed inferences on `blob`.'''
+        self.time_profile.reset()
+        for _ in range(n):
+            with self.time_profile:
+                self(blob)
+
+        return self.time_profile.total / n
+
diff --git a/benchmark/utils.py b/benchmark/utils.py
new file mode 100644
index 0000000..f47ea2e
--- /dev/null
+++ b/benchmark/utils.py
@@ -0,0 +1,83 @@
+'''by lyuwenyu
+'''
+
+import time 
+import contextlib
+import numpy as np
+from PIL import Image
+from collections import OrderedDict
+
+import onnx
+import torch 
+import onnx_graphsurgeon
+
+
+def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'):
+    '''Dump an image as a raw float32 tensor of shape (1, 3, H, W) scaled to [0, 1].
+
+    The resulting file can be fed to trtexec with
+    --loadInputs='image:input_tensor.bin'
+
+    Args:
+        path: image file to read.
+        size: target size handed to PIL's `resize`, i.e. (width, height).
+        output_name: file the raw tensor bytes are written to.
+    '''
+    # Force three channels: grayscale, palette or RGBA files would otherwise
+    # break the HWC -> CHW transpose or yield a tensor that is not 3-channel.
+    with Image.open(path) as im:
+        im = im.convert('RGB').resize(size)
+    data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255.
+    data.tofile(output_name)
+
+
+def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False,
+                    output_name='yolo_w_nms.onnx'):
+    '''Append an EfficientNMS_TRT node to an exported ONNX detector and save it.
+
+    The first two graph outputs of the model at `path` are used as the plugin
+    inputs; the graph outputs are replaced by 'num_dets', 'det_boxes',
+    'det_scores' and 'det_classes'.
+
+    Args:
+        path: ONNX model to load.
+        score_threshold: `score_threshold` attribute of the plugin.
+        iou_threshold: `iou_threshold` attribute of the plugin.
+        max_output_boxes: `max_output_boxes` attribute; also the size of the
+            box dimension of the new outputs.
+        simplify: run onnxsim on the model first, with the 'image' input
+            fixed to [1, 3, 640, 640].
+        output_name: file the modified model is written to.
+
+    References:
+        http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html
+        https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py
+    '''
+    onnx_model = onnx.load(path)
+
+    if simplify:
+        # Imported under an alias so the `simplify` flag is not rebound.
+        from onnxsim import simplify as onnxsim_simplify
+        onnx_model, _ = onnxsim_simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]})
+
+    graph = onnx_graphsurgeon.import_onnx(onnx_model)
+    graph.toposort()
+    graph.fold_constants()
+    graph.cleanup()
+
+    topk = max_output_boxes
+    attrs = OrderedDict(plugin_version='1',
+                        background_class=-1,
+                        max_output_boxes=topk,
+                        score_threshold=score_threshold,
+                        iou_threshold=iou_threshold,
+                        score_activation=False,
+                        box_coding=0, )
+
+    outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]),
+               onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]),
+               onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]),
+               onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])]
+
+    graph.layer(op='EfficientNMS_TRT',
+                name="batched_nms",
+                inputs=[graph.outputs[0],
+                        graph.outputs[1]],
+                outputs=outputs,
+                attrs=attrs, )
+
+    graph.outputs = outputs
+    graph.cleanup().toposort()
+
+    onnx.save(onnx_graphsurgeon.export_onnx(graph), output_name)
+
+
+class TimeProfiler(contextlib.ContextDecorator):
+    '''Context manager / decorator that accumulates elapsed seconds in `total`.
+
+    Every `with` block (or decorated call) adds its duration to `total`;
+    `reset()` clears the accumulator.
+    '''
+
+    def __init__(self, ):
+        self.total = 0
+
+    def __enter__(self, ):
+        self.start = self.time()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Returns None, so an exception raised inside the block still propagates.
+        self.total += self.time() - self.start
+
+    def reset(self, ):
+        '''Set the accumulated time back to zero.'''
+        self.total = 0
+
+    def time(self, ):
+        '''Return a monotonic timestamp in seconds.
+
+        CUDA is synchronized first (when available) so pending GPU work is
+        charged to the interval being measured. `perf_counter` is used rather
+        than `time.time()` because only differences are needed and the wall
+        clock can jump or have coarse resolution.
+        '''
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        return time.perf_counter()
diff --git a/benchmark/yolov8_onnx.py b/benchmark/yolov8_onnx.py
new file mode 100644
index 0000000..efa1a0c
--- /dev/null
+++ b/benchmark/yolov8_onnx.py
@@ -0,0 +1,73 @@
+'''by lyuwenyu
+'''
+
+import torch 
+import torchvision
+
+import numpy as np 
+import onnxruntime as ort 
+
+from utils import yolo_insert_nms
+
+class YOLOv8(torch.nn.Module):
+    '''Wrap an ultralytics YOLO checkpoint so that it returns xyxy boxes and scores.'''
+
+    def __init__(self, name) -> None:
+        super().__init__()
+        from ultralytics import YOLO
+
+        # `{name}.pt` selects pretrained weights; only the inner torch module is kept.
+        self.model = YOLO(f'{name}.pt').model
+
+    def forward(self, x):
+        '''Return (boxes, scores) computed from the first output of the wrapped model.
+
+        https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216
+        '''
+        raw: torch.Tensor = self.model(x)[0]  # n 84 8400,
+        raw = raw.permute(0, 2, 1)
+
+        # The first 4 channels are the box, the remaining ones the class scores.
+        num_classes = raw.shape[-1] - 4
+        cxcywh, cls_scores = raw.split([4, num_classes], dim=-1)
+        xyxy = torchvision.ops.box_convert(cxcywh, in_fmt='cxcywh', out_fmt='xyxy')
+
+        return xyxy, cls_scores
+
+
+
+def export_onnx(name='yolov8n'):
+    '''Export the `name` checkpoint to `{name}.onnx` and smoke-test it with onnxruntime.'''
+    wrapper = YOLOv8(name)
+    sample = torch.rand(1, 3, 640, 640)
+
+    torch.onnx.export(
+        wrapper,
+        sample,
+        f'{name}.onnx',
+        input_names=['image'],
+        output_names=['boxes', 'scores'],
+        opset_version=13,
+        dynamic_axes={'image': {0: '-1'}},  # only the input batch axis is marked dynamic
+    )
+
+    # Run one random batch through the exported file; the result is discarded.
+    feed = {'image': np.random.rand(1, 3, 640, 640).astype(np.float32)}
+    session = ort.InferenceSession(f'{name}.onnx')
+    session.run(output_names=None, input_feed=feed)
+
+
+if __name__ == '__main__':
+    # Export the detector to ONNX, then insert the NMS node into the exported file.
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--name', type=str, default='yolov8l')
+    cli.add_argument('--score_threshold', type=float, default=0.001)
+    cli.add_argument('--iou_threshold', type=float, default=0.7)
+    cli.add_argument('--max_output_boxes', type=int, default=300)
+    args = cli.parse_args()
+
+    export_onnx(name=args.name)
+
+    nms_options = dict(
+        score_threshold=args.score_threshold,
+        iou_threshold=args.iou_threshold,
+        max_output_boxes=args.max_output_boxes,
+    )
+    yolo_insert_nms(path=f'{args.name}.onnx', **nms_options)
+
diff --git a/hubconf.py b/hubconf.py
new file mode 100644
index 0000000..27ff792
--- /dev/null
+++ b/hubconf.py
@@ -0,0 +1,174 @@
+"""Copyright(c) 2024 lyuwenyu. All Rights Reserved.
+"""
+
+
+import os
+import sys
+from pathlib import Path
+from urllib.parse import urlparse
+
+# Directory `rtdetrv2_pytorch` next to this file; also the base of the config
+# paths used below. It is appended to sys.path before the `src.core` import,
+# presumably because the `src` package lives under it -- confirm.
+ROOT = Path(__file__).absolute().parent / 'rtdetrv2_pytorch'
+sys.path.append(str(ROOT))
+
+from src.core import YAMLConfig
+
+import torch
+import torch.nn as nn
+
+dependencies = ['torch', 'torchvision',]
+
+
+def _load_checkpoint(path: str, map_location='cpu'):
+    """Load a checkpoint from a local file or from a URL.
+
+    Args:
+        path: filesystem path or URL of the checkpoint.
+        map_location: forwarded unchanged to the torch loader.
+
+    Returns:
+        The object stored in the checkpoint.
+    """
+    scheme = urlparse(str(path)).scheme
+    # A one-letter "scheme" is a Windows drive letter (`C:\ckpt.pth`), not a URL
+    # scheme, so such paths are read from disk like scheme-less ones.
+    if len(scheme) <= 1:
+        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
+        return torch.load(path, map_location=map_location)
+    return torch.hub.load_state_dict_from_url(path, map_location=map_location)
+
+
+def _build_model(args, ):
+    """Build the deploy-mode model and postprocessor described by `args`.
+
+    `args.config` is handed to YAMLConfig. When `args.resume` is truthy, the
+    checkpoint it names is loaded into the model before `deploy()` is called.
+    """
+    cfg = YAMLConfig(args.config)
+
+    if args.resume:
+        ckpt = _load_checkpoint(args.resume, map_location='cpu')
+        # Use the EMA weights when the checkpoint carries them.
+        weights = ckpt['ema']['module'] if 'ema' in ckpt else ckpt['model']
+
+        # NOTE load train mode state
+        cfg.model.load_state_dict(weights)
+
+    class Model(nn.Module):
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+
+        def forward(self, images, orig_target_sizes):
+            predictions = self.model(images)
+            return self.postprocessor(predictions, orig_target_sizes)
+
+    return Model()
+
+
+# Registry of the hub models: entry-point name -> YAML config path under ROOT
+# ('config') and checkpoint URL ('resume').
+CONFIG = {
+    # rtdetr
+    'rtdetr_r18vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r18vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r34vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r34vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r50vd_m': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r50vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r50vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth',
+    },
+    'rtdetr_r101vd': {
+        'config': ROOT / 'configs/rtdetr/rtdetr_r101vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth',
+    },
+
+    # rtdetrv2
+    'rtdetrv2_r18vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth',
+    },
+    'rtdetrv2_r34vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth',
+    },
+    'rtdetrv2_r50vd_m': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth',
+    },
+    'rtdetrv2_r50vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth',
+    },
+    'rtdetrv2_r101vd': {
+        'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml',
+        'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth',
+    },
+}
+
+
+def _build_from_config(name, pretrained):
+    """Build the model registered under `name` in CONFIG.
+
+    The checkpoint listed in CONFIG is loaded only when `pretrained` is truthy;
+    otherwise `resume` is passed on as an empty string.
+    """
+    entry = CONFIG[name]
+    args = type('Args', (), dict(entry))()
+    args.resume = entry['resume'] if pretrained else ''
+    return _build_model(args, )
+
+
+# rtdetr
+def rtdetr_r18vd(pretrained=True,):
+    """Hub entry point for CONFIG['rtdetr_r18vd']."""
+    return _build_from_config('rtdetr_r18vd', pretrained)
+
+
+def rtdetr_r34vd(pretrained=True,):
+    """Hub entry point for CONFIG['rtdetr_r34vd']."""
+    return _build_from_config('rtdetr_r34vd', pretrained)
+
+
+def rtdetr_r50vd_m(pretrained=True):
+    """Hub entry point for CONFIG['rtdetr_r50vd_m']."""
+    return _build_from_config('rtdetr_r50vd_m', pretrained)
+
+
+def rtdetr_r50vd(pretrained=True):
+    """Hub entry point for CONFIG['rtdetr_r50vd']."""
+    return _build_from_config('rtdetr_r50vd', pretrained)
+
+
+def rtdetr_r101vd(pretrained=True):
+    """Hub entry point for CONFIG['rtdetr_r101vd']."""
+    return _build_from_config('rtdetr_r101vd', pretrained)
+
+
+# rtdetrv2
+def rtdetrv2_r18vd(pretrained=True,):
+    """Hub entry point for CONFIG['rtdetrv2_r18vd']."""
+    return _build_from_config('rtdetrv2_r18vd', pretrained)
+
+
+def rtdetrv2_r34vd(pretrained=True,):
+    """Hub entry point for CONFIG['rtdetrv2_r34vd']."""
+    return _build_from_config('rtdetrv2_r34vd', pretrained)
+
+
+def rtdetrv2_r50vd_m(pretrained=True):
+    """Hub entry point for CONFIG['rtdetrv2_r50vd_m']."""
+    return _build_from_config('rtdetrv2_r50vd_m', pretrained)
+
+
+def rtdetrv2_r50vd(pretrained=True):
+    """Hub entry point for CONFIG['rtdetrv2_r50vd']."""
+    return _build_from_config('rtdetrv2_r50vd', pretrained)
+
+
+def rtdetrv2_r101vd(pretrained=True):
+    """Hub entry point for CONFIG['rtdetrv2_r101vd']."""
+    return _build_from_config('rtdetrv2_r101vd', pretrained)
+
+
+# Short aliases for the rtdetrv2 entry points defined above.
+rtdetrv2_s = rtdetrv2_r18vd
+rtdetrv2_m_r34 = rtdetrv2_r34vd
+rtdetrv2_m_r50 = rtdetrv2_r50vd_m
+rtdetrv2_l = rtdetrv2_r50vd
+rtdetrv2_x = rtdetrv2_r101vd
+
diff --git a/rtdetr_paddle/README.md b/rtdetr_paddle/README.md
new file mode 100644
index 0000000..98312fc
--- /dev/null
+++ b/rtdetr_paddle/README.md
@@ -0,0 +1,244 @@
+English | [简体中文](README_cn.md)
+
+## Model Zoo on COCO
+
+| Model | Epoch | Backbone  | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) |  T4 TensorRT FP16(FPS) | Weight | Config | Log
+|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---|
+| RT-DETR-R18 | 6x |  ResNet-18 | 640 | 46.5 | 63.8 | 20 | 60 | 217 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_dec3_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r18vd_6x_coco.yml) | [rtdetr_r18vd_dec3_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038864/rtdetr_r18vd_dec3_6x_coco_log.txt)
+| RT-DETR-R34 | 6x |  ResNet-34 | 640 | 48.9 | 66.8 | 31 | 92 | 161 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r34vd_dec4_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r34vd_6x_coco.yml) | [rtdetr_r34vd_dec4_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038861/rtdetr_r34vd_dec4_6x_coco_log.txt)
+| RT-DETR-R50-m | 6x |  ResNet-50 | 640 | 51.3 | 69.6 | 36 | 100 | 145 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_m_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml) | -
+| RT-DETR-R50 | 6x |  ResNet-50 | 640 | 53.1 | 71.3 | 42 | 136 | 108 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_6x_coco.yml) | [rtdetr_r50vd_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038669/rtdetr_r50vd_6x_coco_log.txt)
+| RT-DETR-R101 | 6x |  ResNet-101 | 640 | 54.3 | 72.7 | 76 | 259 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r101vd_6x_coco.yml) | [rtdetr_r101vd_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038707/rtdetr_r101vd_6x_coco_log.txt)
+| RT-DETR-L | 6x |  HGNetv2 | 640 | 53.0 | 71.6 | 32 | 110 | 114 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_l_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml) | [rtdetr_hgnetv2_l_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038753/rtdetr_hgnetv2_l_6x_coco_log.txt)
+| RT-DETR-X | 6x |  HGNetv2 | 640 | 54.8 | 73.1 | 67 | 234 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_x_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml) | [rtdetr_hgnetv2_x_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038795/rtdetr_hgnetv2_x_6x_coco_log.txt)
+
+**Notes:**
+- RT-DETR uses 4 GPUs for training.
+- RT-DETR was trained on COCO train2017 and evaluated on val2017.
+
+
+## Model Zoo on Objects365
+| Model | Epoch | Dataset | Input shape | $AP^{val}$ | $AP^{val}_{50}$ | T4 TensorRT FP16(FPS) | Weight | Log
+|:---:|:---:|:---:| :---:|:---:|:---:|:---:|:---:|:---:|
+RT-DETR-R18 | 1x | Objects365 | 640 | 22.9 | 31.2 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_1x_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12394706/rtdetr_r18vd_1x_objects365_log.txt)
+RT-DETR-R18 | 5x | COCO + Objects365 | 640 | **49.2** | **66.6** | **217** | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_5x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12416808/rtdetr_r18vd_5x_coco_objects365_log.txt)
+RT-DETR-R50 | 1x | Objects365 | 640 | 35.1 | 46.2 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_1x_objects365.pdparams) |[log.txt](https://github.com/lyuwenyu/RT-DETR/files/12193246/rtdetr_r50vd_1x_objects365_log.txt)
+RT-DETR-R50 | 2x | COCO + Objects365 | 640 | **55.3** | **73.4** | **108** | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_2x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12208338/rtdetr_r50vd_2x_coco_objects365_log.txt)
+RT-DETR-R101 | 1x | Objects365 | 640 | 36.8 | 48.3 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_1x_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12340691/rtdetr_r101vd_1x_objects365_log.txt)
+RT-DETR-R101 | 2x | COCO + Objects365 | 640 | **56.2** | **74.6** | **74** |[download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_2x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12340672/rtdetr_r101vd_2x_coco_objects365_log.txt)
+
+
+**Notes:**
+- `COCO + Objects365` in the table means the model was fine-tuned on COCO starting from weights pretrained on Objects365.
+
+
+
+## Quick start
+
+<details open>
+<summary>Install requirements</summary>
+
+<!-- - PaddlePaddle == 2.4.2 -->
+```bash
+pip install -r requirements.txt
+```
+
+</details>
+
+<details>
+<summary>Compile (optional)</summary>
+
+```bash
+cd ./ppdet/modeling/transformers/ext_op/
+
+python setup_ms_deformable_attn_op.py install
+```
+See [details](./ppdet/modeling/transformers/ext_op/)
+</details>
+
+
+<details>
+<summary>Data preparation</summary>
+
+- Download and extract COCO 2017 train and val images.
+```
+path/to/coco/
+  annotations/  # annotation json files
+  train2017/    # train images
+  val2017/      # val images
+```
+- Modify config [`dataset_dir`](configs/datasets/coco_detection.yml)
+</details>
+
+
+<details>
+<summary>Training & Evaluation & Testing</summary>
+
+- Training on a Single GPU:
+
+```shell
+# training on single-GPU
+export CUDA_VISIBLE_DEVICES=0
+python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --eval
+```
+
+- Training on Multiple GPUs:
+
+```shell
+# training on multi-GPU
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --fleet --eval
+```
+
+- Evaluation:
+
+```shell
+python tools/eval.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams
+```
+
+- Inference:
+
+```shell
+python tools/infer.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams \
+              --infer_img=./demo/000000570688.jpg
+```
+
+</details>
+
+
+## Finetune
+<details>
+<summary>Details</summary>
+
+1. Prepare the data in COCO format.
+```
+path/to/custom/data/
+    annotations/  # annotation json files
+    train/    # train images
+    val/      # val images
+```
+2. Modify dataset config [`dataset_dir`, `image_dir`, `anno_path`](configs/datasets/coco_detection.yml)
+
+3. Set [`pretrain_weights`](configs/rtdetr/_base_/rtdetr_r50vd.yml) in the model config to the URL of the COCO-pretrained weights listed in the model zoo.
+
+```bash
+# or override it on the command line
+
+fleetrun --gpus=0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml -o pretrain_weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams --eval
+```
+</details>
+
+
+
+## Deploy
+
+<details open>
+<summary>1. Export model </summary>
+
+```shell
+python tools/export_model.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams trt=True \
+              --output_dir=output_inference
+```
+
+</details>
+
+<details>
+<summary>2. Convert to ONNX </summary>
+
+- Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX
+
+```shell
+pip install onnx==1.13.0
+pip install paddle2onnx==1.0.5
+```
+
+- Convert:
+
+```shell
+paddle2onnx --model_dir=./output_inference/rtdetr_r50vd_6x_coco/ \
+            --model_filename model.pdmodel  \
+            --params_filename model.pdiparams \
+            --opset_version 16 \
+            --save_file rtdetr_r50vd_6x_coco.onnx
+```
+</details>
+
+<details>
+<summary>3. Convert to TensorRT </summary>
+
+- TensorRT version >= 8.5.1
+- For inference, refer to the [benchmark](../benchmark) code.
+
+```shell
+trtexec --onnx=./rtdetr_r50vd_6x_coco.onnx \
+        --workspace=4096 \
+        --shapes=image:1x3x640x640 \
+        --saveEngine=rtdetr_r50vd_6x_coco.trt \
+        --avgRuns=100 \
+        --fp16
+```
+
+
+</details>
+
+
+## Others
+
+<details>
+<summary>1. Parameters and FLOPs </summary>
+
+1. Find the Paddle [`dynamic_flops.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/hapi/dynamic_flops.py#L28) source file on your local machine and modify it as follows:
+
+```python
+# eg. /path/to/anaconda3/lib/python3.8/site-packages/paddle/hapi/dynamic_flops.py
+
+def flops(net, input_size, inputs=None, custom_ops=None, print_detail=False):
+    if isinstance(net, nn.Layer):
+        # If net is a dy2stat model, net.forward is StaticFunction instance,
+        # we set net.forward to original forward function.
+        _, net.forward = unwrap_decorators(net.forward)
+
+        # by lyuwenyu
+        if inputs is None:
+            inputs = paddle.randn(input_size)
+
+        return dynamic_flops(
+            net, inputs=inputs, custom_ops=custom_ops, print_detail=print_detail
+        )
+    elif isinstance(net, paddle.static.Program):
+        return static_flops(net, print_detail=print_detail)
+    else:
+        warnings.warn(
+            "Your model must be an instance of paddle.nn.Layer or paddle.static.Program."
+        )
+        return -1
+```
+
+2. Run the code below
+
+```python
+import paddle
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.core.workspace import create
+
+cfg_path = './configs/rtdetr/rtdetr_r50vd_6x_coco.yml'
+cfg = load_config(cfg_path)
+model = create(cfg.architecture)
+
+blob = {
+    'image': paddle.randn([1, 3, 640, 640]),
+    'im_shape': paddle.to_tensor([[640, 640]]),
+    'scale_factor': paddle.to_tensor([[1., 1.]])
+}
+paddle.flops(model, None, blob, custom_ops=None, print_detail=False)
+
+# Output
+# Total Flops: 68348108800     Total Params: 41514204
+
+```
+
+
+</details>
diff --git a/rtdetr_paddle/README_cn.md b/rtdetr_paddle/README_cn.md
new file mode 100644
index 0000000..ec3bf00
--- /dev/null
+++ b/rtdetr_paddle/README_cn.md
@@ -0,0 +1,202 @@
+简体中文 | [English](README.md)
+
+## 模型
+
+| Model | Epoch | backbone  | input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) |  T4 TensorRT FP16(FPS) | Pretrained Model | config |
+|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|
+| RT-DETR-R18 | 6x |  ResNet-18 | 640 | 46.5 | 63.8 | 20 | 60 | 217 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_dec3_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r18vd_6x_coco.yml)
+| RT-DETR-R34 | 6x |  ResNet-34 | 640 | 48.9 | 66.8 | 31 | 92 | 161 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r34vd_dec4_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r34vd_6x_coco.yml)
+| RT-DETR-R50-m | 6x |  ResNet-50 | 640 | 51.3 | 69.6 | 36 | 100 | 145 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_m_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml)
+| RT-DETR-R50 | 6x |  ResNet-50 | 640 | 53.1 | 71.3 | 42 | 136 | 108 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_6x_coco.yml)
+| RT-DETR-R101 | 6x |  ResNet-101 | 640 | 54.3 | 72.7 | 76 | 259 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r101vd_6x_coco.yml)
+| RT-DETR-L | 6x |  HGNetv2 | 640 | 53.0 | 71.6 | 32 | 110 | 114 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_l_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml)
+| RT-DETR-X | 6x |  HGNetv2 | 640 | 54.8 | 73.1 | 67 | 234 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_x_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml)
+
+
+**注意事项:**
+- RT-DETR 使用4个GPU训练。
+- RT-DETR 在COCO train2017上训练，并在val2017上评估。
+
+## 快速开始
+
+<details open>
+<summary>依赖包</summary>
+
+<!-- - PaddlePaddle == 2.4.2 -->
+```bash
+pip install -r requirements.txt
+```
+
+</details>
+
+<details>
+<summary>准备数据</summary>
+
+- 修改[配置文件`dataset_dir`](configs/datasets/coco_detection.yml)
+</details>
+
+
+<details>
+<summary>训练&评估</summary>
+
+- 单卡GPU上训练:
+
+```shell
+# training on single-GPU
+export CUDA_VISIBLE_DEVICES=0
+python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --eval
+```
+
+- 多卡GPU上训练:
+
+```shell
+# training on multi-GPU
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --fleet --eval
+```
+
+- 评估:
+
+```shell
+python tools/eval.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams
+```
+
+- 测试:
+
+```shell
+python tools/infer.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams \
+              --infer_img=./demo/000000570688.jpg
+```
+
+详情请参考[快速开始文档](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/docs/tutorials/GETTING_STARTED.md).
+
+</details>
+
+## 部署
+
+<details open>
+<summary>1. 导出模型 </summary>
+
+```shell
+python tools/export_model.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+              -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams trt=True \
+              --output_dir=output_inference
+```
+
+</details>
+
+<details>
+<summary>2. 转换模型至ONNX </summary>
+
+- 安装[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) 和 ONNX
+
+```shell
+pip install onnx==1.13.0
+pip install paddle2onnx==1.0.5
+```
+
+- 转换模型:
+
+```shell
+paddle2onnx --model_dir=./output_inference/rtdetr_r50vd_6x_coco/ \
+            --model_filename model.pdmodel  \
+            --params_filename model.pdiparams \
+            --opset_version 16 \
+            --save_file rtdetr_r50vd_6x_coco.onnx
+```
+</details>
+
+<details>
+<summary>3. 转换成TensorRT </summary>
+
+- 确保TensorRT的版本>=8.5.1
+- TRT推理可以参考[RT-DETR](https://github.com/lyuwenyu/RT-DETR)的部分代码或者其他网络资源
+
+```shell
+trtexec --onnx=./rtdetr_r50vd_6x_coco.onnx \
+        --workspace=4096 \
+        --shapes=image:1x3x640x640 \
+        --saveEngine=rtdetr_r50vd_6x_coco.trt \
+        --avgRuns=100 \
+        --fp16
+```
+
+
+</details>
+
+
+## 其他
+
+<details>
+<summary>1. 参数量和计算量统计 </summary>
+
+1. 找到[本地安装paddle的flops源代码](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/hapi/dynamic_flops.py#L28), 并修改为
+
+```python
+# anaconda3/lib/python3.8/site-packages/paddle/hapi/dynamic_flops.py
+def flops(net, input_size, inputs=None, custom_ops=None, print_detail=False):
+    if isinstance(net, nn.Layer):
+        # If net is a dy2stat model, net.forward is StaticFunction instance,
+        # we set net.forward to original forward function.
+        _, net.forward = unwrap_decorators(net.forward)
+
+        # by lyuwenyu
+        if inputs is None:
+            inputs = paddle.randn(input_size)
+
+        return dynamic_flops(
+            net, inputs=inputs, custom_ops=custom_ops, print_detail=print_detail
+        )
+    elif isinstance(net, paddle.static.Program):
+        return static_flops(net, print_detail=print_detail)
+    else:
+        warnings.warn(
+            "Your model must be an instance of paddle.nn.Layer or paddle.static.Program."
+        )
+        return -1
+```
+
+2. 使用以下代码片段实现参数量和计算量的统计
+
+```python
+import paddle
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.core.workspace import create
+
+cfg_path = './configs/rtdetr/rtdetr_r50vd_6x_coco.yml'
+cfg = load_config(cfg_path)
+model = create(cfg.architecture)
+
+blob = {
+    'image': paddle.randn([1, 3, 640, 640]),
+    'im_shape': paddle.to_tensor([[640, 640]]),
+    'scale_factor': paddle.to_tensor([[1., 1.]])
+}
+paddle.flops(model, None, blob, custom_ops=None, print_detail=False)
+```
+</details>
+
+
+<details open>
+<summary>2. YOLOs端到端速度测速 </summary>
+
+- 可以参考[RT-DETR](https://github.com/lyuwenyu/RT-DETR) benchmark部分或者其他网络资源
+
+</details>
+
+
+
+## 引用RT-DETR
+如果需要在你的研究中使用RT-DETR，请通过以下方式引用我们的论文：
+```
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
diff --git a/rtdetr_paddle/configs/datasets/coco_detection.yml b/rtdetr_paddle/configs/datasets/coco_detection.yml
new file mode 100644
index 0000000..176ba27
--- /dev/null
+++ b/rtdetr_paddle/configs/datasets/coco_detection.yml
@@ -0,0 +1,21 @@
+metric: COCO
+num_classes: 80
+
+TrainDataset:
+  name: COCODataSet
+  image_dir: train2017
+  anno_path: annotations/instances_train2017.json
+  dataset_dir: dataset/coco
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  name: COCODataSet
+  image_dir: val2017
+  anno_path: annotations/instances_val2017.json
+  dataset_dir: dataset/coco
+  allow_empty: true
+
+TestDataset:
+  name: ImageFolder
+  anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt)
+  dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path'
diff --git a/rtdetr_paddle/configs/datasets/voc.yml b/rtdetr_paddle/configs/datasets/voc.yml
new file mode 100644
index 0000000..72182be
--- /dev/null
+++ b/rtdetr_paddle/configs/datasets/voc.yml
@@ -0,0 +1,21 @@
+metric: VOC
+map_type: 11point
+num_classes: 20
+
+TrainDataset:
+  name: VOCDataSet
+  dataset_dir: dataset/voc
+  anno_path: trainval.txt
+  label_list: label_list.txt
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']
+
+EvalDataset:
+  name: VOCDataSet
+  dataset_dir: dataset/voc
+  anno_path: test.txt
+  label_list: label_list.txt
+  data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult']
+
+TestDataset:
+  name: ImageFolder
+  anno_path: dataset/voc/label_list.txt
diff --git a/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml b/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml
new file mode 100644
index 0000000..5abe2f7
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml
@@ -0,0 +1,19 @@
+epoch: 72
+
+LearningRate:
+  base_lr: 0.0001
+  schedulers:
+  - !PiecewiseDecay
+    gamma: 1.0
+    milestones: [100]
+    use_warmup: true
+  - !LinearWarmup
+    start_factor: 0.001
+    steps: 2000
+
+OptimizerBuilder:
+  clip_grad_by_norm: 0.1
+  regularizer: false
+  optimizer:
+    type: AdamW
+    weight_decay: 0.0001
diff --git a/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml
new file mode 100644
index 0000000..7859dfb
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml
@@ -0,0 +1,71 @@
+architecture: DETR
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams
+norm_type: sync_bn
+use_ema: True
+ema_decay: 0.9999
+ema_decay_type: "exponential"
+ema_filter_no_grad: True
+hidden_dim: 256
+use_focal_loss: True
+eval_size: [640, 640] # h, w
+
+
+DETR:
+  backbone: ResNet
+  neck: HybridEncoder
+  transformer: RTDETRTransformer
+  detr_head: DINOHead
+  post_process: DETRPostProcess
+
+ResNet:
+  # index 0 stands for res2
+  depth: 50
+  variant: d
+  norm_type: bn
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  lr_mult_list: [0.1, 0.1, 0.1, 0.1]
+  num_stages: 4
+  freeze_stem_only: True
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 1.0
+
+
+RTDETRTransformer:
+  num_queries: 300
+  position_embed_type: sine
+  feat_strides: [8, 16, 32]
+  num_levels: 3
+  nhead: 8
+  num_decoder_layers: 6
+  dim_feedforward: 1024
+  dropout: 0.0
+  activation: relu
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0
+  learnt_init_query: False
+
+DINOHead:
+  loss:
+    name: DINOLoss
+    loss_coeff: {class: 1, bbox: 5, giou: 2}
+    aux_loss: True
+    use_vfl: True
+    matcher:
+      name: HungarianMatcher
+      matcher_coeff: {class: 2, bbox: 5, giou: 2}
+
+DETRPostProcess:
+  num_top_queries: 300
diff --git a/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml
new file mode 100644
index 0000000..b1a2a00
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml
@@ -0,0 +1,43 @@
+worker_num: 4
+TrainReader:
+  sample_transforms:
+    - Decode: {}
+    - RandomDistort: {prob: 0.8}
+    - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
+    - RandomCrop: {prob: 0.8}
+    - RandomFlip: {}
+  batch_transforms:
+    - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - NormalizeBox: {}
+    - BboxXYXY2XYWH: {}
+    - Permute: {}
+  batch_size: 4
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+  use_shared_memory: false
+
+
+EvalReader:
+  sample_transforms:
+    - Decode: {}
+    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} # target_size: (h, w)
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+  batch_size: 4
+  shuffle: false
+  drop_last: false
+
+
+TestReader:
+  inputs_def:
+    image_shape: [3, 640, 640]
+  sample_transforms:
+    - Decode: {}
+    - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2}
+    - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
+    - Permute: {}
+  batch_size: 1
+  shuffle: false
+  drop_last: false
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml
new file mode 100644
index 0000000..4f3e77d
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml
@@ -0,0 +1,24 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_hgnetv2_l_6x_coco/model_final
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PPHGNetV2_L_ssld_pretrained.pdparams
+find_unused_parameters: True
+log_iter: 200
+
+
+DETR:
+  backbone: PPHGNetV2
+
+PPHGNetV2:
+  arch: 'L'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+  lr_mult_list: [0., 0.05, 0.05, 0.05, 0.05]
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml
new file mode 100644
index 0000000..37f5d17
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml
@@ -0,0 +1,40 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_hgnetv2_x_6x_coco/model_final
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PPHGNetV2_X_ssld_pretrained.pdparams
+find_unused_parameters: True
+log_iter: 200
+
+
+
+DETR:
+  backbone: PPHGNetV2
+
+
+PPHGNetV2:
+  arch: 'X'
+  return_idx: [1, 2, 3]
+  freeze_stem_only: True
+  freeze_at: 0
+  freeze_norm: True
+  lr_mult_list: [0., 0.01, 0.01, 0.01, 0.01]
+
+
+HybridEncoder:
+  hidden_dim: 384
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 384
+    nhead: 8
+    dim_feedforward: 2048
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 1.0
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
new file mode 100644
index 0000000..fd2f55a
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
@@ -0,0 +1,37 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r101vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_ssld_pretrained.pdparams
+
+ResNet:
+  # index 0 stands for res2
+  depth: 101
+  variant: d
+  norm_type: bn
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  lr_mult_list: [0.01, 0.01, 0.01, 0.01]
+  num_stages: 4
+  freeze_stem_only: True
+
+HybridEncoder:
+  hidden_dim: 384
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 384
+    nhead: 8
+    dim_feedforward: 2048
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 1.0
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
new file mode 100644
index 0000000..8cf9818
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
@@ -0,0 +1,38 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r18vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams
+ResNet:
+  depth: 18
+  variant: d
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: false
+  norm_decay: 0.
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 0.5
+  depth_mult: 1.0
+
+RTDETRTransformer:
+  eval_idx: -1
+  num_decoder_layers: 3
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
new file mode 100644
index 0000000..2ab07ba
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
@@ -0,0 +1,38 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r34vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams
+ResNet:
+  depth: 34
+  variant: d
+  return_idx: [1, 2, 3]
+  freeze_at: -1
+  freeze_norm: false
+  norm_decay: 0.
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 0.5
+  depth_mult: 1.0
+
+RTDETRTransformer:
+  eval_idx: -1
+  num_decoder_layers: 4
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
new file mode 100644
index 0000000..51bf443
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
@@ -0,0 +1,11 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r50vd_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
new file mode 100644
index 0000000..d4ab6f9
--- /dev/null
+++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
@@ -0,0 +1,28 @@
+_BASE_: [
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  '_base_/optimizer_6x.yml',
+  '_base_/rtdetr_r50vd.yml',
+  '_base_/rtdetr_reader.yml',
+]
+
+weights: output/rtdetr_r50vd_m_6x_coco/model_final
+find_unused_parameters: True
+log_iter: 200
+
+HybridEncoder:
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  encoder_layer:
+    name: TransformerLayer
+    d_model: 256
+    nhead: 8
+    dim_feedforward: 1024
+    dropout: 0.
+    activation: 'gelu'
+  expansion: 0.5
+  depth_mult: 1.0
+
+RTDETRTransformer:
+  eval_idx: 2 # use the 3rd decoder layer to eval
diff --git a/rtdetr_paddle/configs/runtime.yml b/rtdetr_paddle/configs/runtime.yml
new file mode 100644
index 0000000..a58b171
--- /dev/null
+++ b/rtdetr_paddle/configs/runtime.yml
@@ -0,0 +1,16 @@
+use_gpu: true
+use_xpu: false
+use_mlu: false
+use_npu: false
+log_iter: 20
+save_dir: output
+snapshot_epoch: 1
+print_flops: false
+print_params: false
+
+# Exporting the model
+export:
+  post_process: True  # Whether post-processing is included in the network when exporting the model.
+  nms: True           # Whether NMS is included in the network when exporting the model.
+  benchmark: False    # Used for testing model performance; if set `True`, post-processing and NMS will not be exported.
+  fuse_conv_bn: False
diff --git a/rtdetr_paddle/dataset/coco/download_coco.py b/rtdetr_paddle/dataset/coco/download_coco.py
new file mode 100644
index 0000000..993218f
--- /dev/null
+++ b/rtdetr_paddle/dataset/coco/download_coco.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os.path as osp
+import logging
+# add python path of PaddleDetection to sys.path
+parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.utils.download import download_dataset
+
+logging.basicConfig(level=logging.INFO)
+
+download_path = osp.split(osp.realpath(sys.argv[0]))[0]
+download_dataset(download_path, 'coco')
diff --git a/rtdetr_paddle/dataset/voc/create_list.py b/rtdetr_paddle/dataset/voc/create_list.py
new file mode 100644
index 0000000..7696073
--- /dev/null
+++ b/rtdetr_paddle/dataset/voc/create_list.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os.path as osp
+import logging
+# add python path of PaddleDetection to sys.path
+parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.utils.download import create_voc_list
+
+logging.basicConfig(level=logging.INFO)
+
+voc_path = osp.split(osp.realpath(sys.argv[0]))[0]
+create_voc_list(voc_path)
diff --git a/rtdetr_paddle/dataset/voc/download_voc.py b/rtdetr_paddle/dataset/voc/download_voc.py
new file mode 100644
index 0000000..2375fbf
--- /dev/null
+++ b/rtdetr_paddle/dataset/voc/download_voc.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os.path as osp
+import logging
+# add python path of PaddleDetection to sys.path
+parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.utils.download import download_dataset
+
+logging.basicConfig(level=logging.INFO)
+
+download_path = osp.split(osp.realpath(sys.argv[0]))[0]
+download_dataset(download_path, 'voc')
diff --git a/rtdetr_paddle/dataset/voc/label_list.txt b/rtdetr_paddle/dataset/voc/label_list.txt
new file mode 100644
index 0000000..8420ab3
--- /dev/null
+++ b/rtdetr_paddle/dataset/voc/label_list.txt
@@ -0,0 +1,20 @@
+aeroplane
+bicycle
+bird
+boat
+bottle
+bus
+car
+cat
+chair
+cow
+diningtable
+dog
+horse
+motorbike
+person
+pottedplant
+sheep
+sofa
+train
+tvmonitor
diff --git a/rtdetr_paddle/ppdet/__init__.py b/rtdetr_paddle/ppdet/__init__.py
new file mode 100644
index 0000000..fa1d8af
--- /dev/null
+++ b/rtdetr_paddle/ppdet/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import (core, data, engine, modeling, optimizer, metrics, utils)
+
+
+try:
+    from .version import full_version as __version__
+    from .version import commit as __git_commit__
+except ImportError:
+    import sys
+    sys.stderr.write("Warning: import ppdet from source directory " \
+            "without installing, run 'python setup.py install' to " \
+            "install ppdet firstly\n")
diff --git a/rtdetr_paddle/ppdet/core/__init__.py b/rtdetr_paddle/ppdet/core/__init__.py
new file mode 100644
index 0000000..d042771
--- /dev/null
+++ b/rtdetr_paddle/ppdet/core/__init__.py
@@ -0,0 +1,15 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import config
diff --git a/rtdetr_paddle/ppdet/core/config/__init__.py b/rtdetr_paddle/ppdet/core/config/__init__.py
new file mode 100644
index 0000000..d0c32e2
--- /dev/null
+++ b/rtdetr_paddle/ppdet/core/config/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/rtdetr_paddle/ppdet/core/config/schema.py b/rtdetr_paddle/ppdet/core/config/schema.py
new file mode 100644
index 0000000..2e41b5c
--- /dev/null
+++ b/rtdetr_paddle/ppdet/core/config/schema.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import inspect
+import importlib
+import re
+
+try:
+    from docstring_parser import parse as doc_parse
+except Exception:
+
+    def doc_parse(*args):
+        pass
+
+
+try:
+    from typeguard import check_type
+except Exception:
+
+    def check_type(*args):
+        pass
+
+
+__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema']
+
+
+class SchemaValue(object):
+    """Schema entry for one argument: its name, doc, type and default."""
+
+    def __init__(self, name, doc='', type=None):
+        super(SchemaValue, self).__init__()
+        self.name, self.doc, self.type = name, doc, type
+
+    def set_default(self, value):
+        self.default = value
+
+    def has_default(self):
+        return hasattr(self, 'default')
+
+
+class SchemaDict(dict):
+    """Dict of config values backed by a per-key schema of `SchemaValue`."""
+
+    def __init__(self, **kwargs):
+        super(SchemaDict, self).__init__()
+        self.schema = {}
+        self.strict = False
+        self.doc = ""
+        # `validate` and `find_mismatch_keys` read `name`; define it up front
+        self.name = ""
+        self.update(kwargs)
+
+    def __setitem__(self, key, value):
+        # XXX also update regular dict to SchemaDict??
+        if isinstance(value, dict) and key in self and isinstance(self[key],
+                                                                  SchemaDict):
+            self[key].update(value)
+        else:
+            super(SchemaDict, self).__setitem__(key, value)
+
+    def __missing__(self, key):
+        if self.has_default(key):
+            return self.schema[key].default
+        if key in self.schema:
+            return self.schema[key]
+        raise KeyError(key)
+
+    def copy(self):
+        newone = SchemaDict()
+        newone.__dict__.update(self.__dict__)
+        newone.update(self)
+        return newone
+
+    def set_schema(self, key, value):
+        assert isinstance(value, SchemaValue)
+        self.schema[key] = value
+
+    def set_strict(self, strict):
+        self.strict = strict
+
+    def has_default(self, key):
+        return key in self.schema and self.schema[key].has_default()
+
+    def is_default(self, key):
+        if not self.has_default(key):
+            return False
+        if hasattr(self[key], '__dict__'):
+            return True
+        return key not in self or self[key] == self.schema[key].default
+
+    def find_default_keys(self):
+        candidates = list(self.keys()) + list(self.schema.keys())
+        return [k for k in candidates if self.is_default(k)]
+
+    def mandatory(self):
+        # True when at least one schema key has no default value
+        return any(not self.has_default(k) for k in self.schema)
+
+    def find_missing_keys(self):
+        missing = [k for k in self.schema
+                   if k not in self and not self.has_default(k)]
+        # keys still holding a placeholder value also count as missing
+        placeholders = [k for k in self if self[k] in ('<missing>', '<value>')]
+        return missing + placeholders
+
+    def find_extra_keys(self):
+        return list(set(self.keys()) - set(self.schema.keys()))
+
+    def find_mismatch_keys(self):
+        mismatch_keys = []
+        for arg in self.schema.values():
+            if arg.type is not None:
+                try:
+                    check_type("{}.{}".format(self.name, arg.name),
+                               self[arg.name], arg.type)
+                except Exception:
+                    mismatch_keys.append(arg.name)
+        return mismatch_keys
+
+    def validate(self):
+        missing_keys = self.find_missing_keys()
+        if missing_keys:
+            raise ValueError("Missing param for class<{}>: {}".format(
+                self.name, ", ".join(missing_keys)))
+        extra_keys = self.find_extra_keys()
+        if extra_keys and self.strict:
+            raise ValueError("Extraneous param for class<{}>: {}".format(
+                self.name, ", ".join(extra_keys)))
+        mismatch_keys = self.find_mismatch_keys()
+        if mismatch_keys:
+            raise TypeError("Wrong param type for class<{}>: {}".format(
+                self.name, ", ".join(mismatch_keys)))
+
+
+class SharedConfig(object):
+    """
+    Marker for `__shared__` annotations; it only stores `key` and
+    `default_value`. The intended resolution order is:
+
+    - if `key` is set for the module in config file, its value will take
+      precedence
+    - if `key` is not set for the module but present in the config file, its
+      value will be used
+    - otherwise, use the provided `default_value` as fallback
+
+    Args:
+        key: config[key] will be injected
+        default_value: fallback value
+    """
+
+    def __init__(self, key, default_value=None):
+        super(SharedConfig, self).__init__()
+        self.key, self.default_value = key, default_value
+
+
+def extract_schema(cls):
+    """
+    Extract schema from a given class
+
+    The schema is derived from the signature of `cls.__init__`: every
+    positional argument except `self` becomes a `SchemaValue`, and arguments
+    with a default value get it recorded. Defaults of arguments listed in
+    `cls.__shared__` are wrapped in a `SharedConfig`.
+
+    Args:
+        cls (type): Class from which to extract.
+
+    Returns:
+        schema (SchemaDict): Extracted schema.
+    """
+    ctor = cls.__init__
+    # `inspect.getfullargspec` is the only API ever called here (the former
+    # python 2 fallback called the very same function), so use it directly
+    argspec = inspect.getfullargspec(ctor)
+    annotations = argspec.annotations
+    has_kwargs = argspec.varkw is not None
+
+    names = [arg for arg in argspec.args if arg != 'self']
+    defaults = argspec.defaults
+    num_defaults = len(defaults) if defaults is not None else 0
+    num_required = len(names) - num_defaults
+
+    docs = cls.__doc__
+    if docs is None and getattr(cls, '__category__', None) == 'op':
+        docs = cls.__call__.__doc__
+    try:
+        docstring = doc_parse(docs)
+    except Exception:
+        docstring = None
+
+    # argument name -> description, taken from the parsed docstring (if any)
+    comments = {}
+    if docstring is not None:
+        for p in docstring.params:
+            match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name)
+            if match_obj is not None:
+                comments[match_obj.group(1)] = p.description
+
+    schema = SchemaDict()
+    schema.name = cls.__name__
+    schema.doc = ""
+    if docs:
+        # first docstring line, skipping one leading newline; an empty
+        # docstring is skipped because `docs[0]` would raise IndexError
+        start_pos = 1 if docs.startswith('\n') else 0
+        schema.doc = docs[start_pos:].split("\n")[0].strip()
+    # XXX handle paddle's weird doc convention
+    if '**' == schema.doc[:2] and '**' == schema.doc[-2:]:
+        schema.doc = schema.doc[2:-2].strip()
+    schema.category = getattr(cls, '__category__', None) or 'module'
+    # extra config keys are only tolerated if the constructor takes **kwargs
+    schema.strict = not has_kwargs
+    schema.pymodule = importlib.import_module(cls.__module__)
+    schema.inject = getattr(cls, '__inject__', [])
+    schema.shared = getattr(cls, '__shared__', [])
+    for idx, name in enumerate(names):
+        comment = comments.get(name) or name
+        if name in schema.inject:
+            # injected arguments are not type checked
+            type_ = None
+        else:
+            type_ = annotations.get(name) or None
+        value_schema = SchemaValue(name, comment, type_)
+        if name in schema.shared:
+            assert idx >= num_required, "shared config must have default value"
+            default = defaults[idx - num_required]
+            value_schema.set_default(SharedConfig(name, default))
+        elif idx >= num_required:
+            default = defaults[idx - num_required]
+            value_schema.set_default(default)
+        schema.set_schema(name, value_schema)
+
+    return schema
diff --git a/rtdetr_paddle/ppdet/core/config/yaml_helpers.py b/rtdetr_paddle/ppdet/core/config/yaml_helpers.py
new file mode 100644
index 0000000..181cfe6
--- /dev/null
+++ b/rtdetr_paddle/ppdet/core/config/yaml_helpers.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+
+import yaml
+from .schema import SharedConfig
+
+__all__ = ['serializable', 'Callable']
+
+
+def represent_dictionary_order(self, dict_data):
+    return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items())
+
+
+def setup_orderdict():
+    from collections import OrderedDict
+    yaml.add_representer(OrderedDict, represent_dictionary_order)
+
+
+def _make_python_constructor(cls):
+    """Return a yaml constructor calling `cls` with the node's contents."""
+
+    def python_constructor(loader, node):
+        if isinstance(node, yaml.SequenceNode):
+            return cls(*loader.construct_sequence(node, deep=True))
+        mapping = loader.construct_mapping(node, deep=True)
+        try:
+            return cls(**mapping)
+        except Exception as ex:
+            print("Error when construct {} instance from yaml config".
+                  format(cls.__name__))
+            raise ex
+
+    return python_constructor
+
+
+def _make_python_representer(cls):
+    """Return a yaml representer dumping `cls` instances as `!<ClassName>`."""
+    # the former python 2 branch also called `getfullargspec`: dead code
+    argspec = inspect.getfullargspec(cls)
+    argnames = [arg for arg in argspec.args if arg != 'self']
+
+    def python_representer(dumper, obj):
+        if argnames:
+            data = {name: getattr(obj, name) for name in argnames}
+        else:
+            # copy: deleting `_id` below must not strip the attribute from
+            # the live object that is being dumped
+            data = dict(obj.__dict__)
+        if '_id' in data:
+            del data['_id']
+        return dumper.represent_mapping(u'!{}'.format(cls.__name__), data)
+
+    return python_representer
+
+
+def serializable(cls):
+    """
+    Class decorator registering a yaml constructor (under the tag
+    `!<ClassName>`) and a yaml representer for `cls`.
+
+    Args:
+        cls: class to register; it must be "trivially serializable"
+
+    Returns: the same `cls`
+    """
+    yaml_tag = u'!{}'.format(cls.__name__)
+    yaml.add_constructor(yaml_tag, _make_python_constructor(cls))
+    yaml.add_representer(cls, _make_python_representer(cls))
+    return cls
+
+
+yaml.add_representer(SharedConfig,
+                     lambda d, o: d.represent_data(o.default_value))
+
+
+@serializable
+class Callable(object):
+    """
+    Helper to be used in Yaml for creating arbitrary class objects
+
+    Args:
+        full_type (str): the full module path to target function
+        args (list): positional arguments for the call, defaults to []
+        kwargs (dict): keyword arguments for the call, defaults to {}
+    """
+
+    def __init__(self, full_type, args=None, kwargs=None):
+        super(Callable, self).__init__()
+        self.full_type = full_type
+        # build fresh containers: mutable defaults would be shared by instances
+        self.args = [] if args is None else args
+        self.kwargs = {} if kwargs is None else kwargs
+
+    def __call__(self):
+        module_name, sep, func_name = self.full_type.rpartition('.')
+        if sep:
+            module = importlib.import_module(module_name)
+        else:
+            try:
+                module = importlib.import_module('builtins')
+            except Exception:
+                module = importlib.import_module('__builtin__')
+        func = getattr(module, func_name)
+        return func(*self.args, **self.kwargs)
diff --git a/rtdetr_paddle/ppdet/core/workspace.py b/rtdetr_paddle/ppdet/core/workspace.py
new file mode 100644
index 0000000..6735bcf
--- /dev/null
+++ b/rtdetr_paddle/ppdet/core/workspace.py
@@ -0,0 +1,292 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import importlib
+import os
+import sys
+
+import yaml
+import collections
+
+# `collections.abc` is not available in python 2; fall back to the
+# top-level `collections` module there so `collectionsAbc.Mapping`
+# can be used on either interpreter
+collectionsAbc = getattr(collections, 'abc', collections)
+
+from .config.schema import SchemaDict, SharedConfig, extract_schema
+from .config.yaml_helpers import serializable
+
+__all__ = [
+    'global_config',
+    'load_config',
+    'merge_config',
+    'get_registered_modules',
+    'create',
+    'register',
+    'serializable',
+    'dump_value',
+]
+
+
+def dump_value(value):
+    """Return `str(value)` for primitives; dicts, tuples, lists and objects
+    with a `__dict__` become a single-quoted one-line flow-style YAML dump."""
+    # XXX this is hackish, but collections.abc is not available in python 2
+    compound = hasattr(value, '__dict__') or isinstance(
+        value, (dict, tuple, list))
+    if not compound:
+        return str(value)  # primitive types
+    flat = yaml.dump(value, default_flow_style=True)
+    return "'{}'".format(flat.replace('\n', '').replace('...', ''))
+
+
+class AttrDict(dict):
+    """Single level attribute dict, NOT recursive"""
+
+    def __init__(self, **kwargs):
+        super(AttrDict, self).__init__()
+        dict.update(self, kwargs)
+
+    def __getattr__(self, key):
+        if key not in self:
+            raise AttributeError("object has no attribute '{}'".format(key))
+        return self[key]
+
+    def __setattr__(self, key, value):
+        self[key] = value
+
+    def copy(self):
+        """Return a shallow copy that is always a plain `AttrDict`."""
+        duplicate = AttrDict()
+        duplicate.update(self)
+        return duplicate
+
+
+global_config = AttrDict()  # module-level store used by register/merge_config/create
+
+BASE_KEY = '_BASE_'  # config key that lists the base yaml files to load first
+
+
+# parse and load _BASE_ recursively
+def _load_config_with_base(file_path):
+    """Load `file_path` and merge it over every yaml listed in `_BASE_`.
+
+    Base paths may start with `~`; relative ones are resolved against the
+    directory of `file_path`. Returns the resulting config mapping.
+    """
+    # NOTE: yaml.Loader can build arbitrary objects; load trusted files only
+    with open(file_path) as f:
+        file_cfg = yaml.load(f, Loader=yaml.Loader)
+    if BASE_KEY not in file_cfg:
+        return file_cfg
+
+    # NOTE: cfgs outside have higher priority than cfgs in _BASE_
+    all_base_cfg = AttrDict()
+    for base_yml in list(file_cfg[BASE_KEY]):
+        base_yml = os.path.expanduser(base_yml)
+        if not os.path.isabs(base_yml):
+            base_yml = os.path.join(os.path.dirname(file_path), base_yml)
+        base_cfg = _load_config_with_base(base_yml)
+        all_base_cfg = merge_config(base_cfg, all_base_cfg)
+    del file_cfg[BASE_KEY]
+    return merge_config(file_cfg, all_base_cfg)
+
+
+def load_config(file_path):
+    """
+    Load a yaml config (together with its `_BASE_` files) and merge it
+    into the global config.
+
+    Args:
+        file_path (str): Path of the config file to be loaded.
+
+    Returns: global config
+    """
+    ext = os.path.splitext(file_path)[1]
+    assert ext in ['.yml', '.yaml'], "only support yaml files for now"
+    cfg = _load_config_with_base(file_path)
+    # record the config file's base name (directory and extension dropped)
+    stem, _ = os.path.splitext(os.path.basename(file_path))
+    cfg['filename'] = stem
+    merge_config(cfg)
+    return global_config
+
+
+def dict_merge(dct, merge_dct):
+    """ Recursively merge ``merge_dct`` into ``dct`` in place. Unlike
+    ``dict.update()``, a key whose current value is a dict and whose
+    incoming value is a mapping is merged key by key, not replaced.
+
+    Args:
+        dct: dict onto which the merge is executed
+        merge_dct: dct merged into dct
+
+    Returns: dct
+    """
+    for key, incoming in merge_dct.items():
+        nested = (key in dct and isinstance(dct[key], dict) and
+                  isinstance(incoming, collectionsAbc.Mapping))
+        if nested:
+            dict_merge(dct[key], incoming)
+        else:
+            dct[key] = incoming
+    return dct
+
+
+def merge_config(config, another_cfg=None):
+    """
+    Merge config into global config or another_cfg.
+
+    Args:
+        config (dict): Config to be merged.
+        another_cfg (dict): merge target; None selects the global config.
+    Returns: the merge target, updated in place
+    """
+    # test `is None`, not truthiness: an empty dict is a valid target
+    dct = global_config if another_cfg is None else another_cfg
+    return dict_merge(dct, config)
+
+
+def get_registered_modules():  # entries of global_config that are SchemaDict
+    return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)}
+
+
+def make_partial(cls):
+    """Make instances of `cls` call the function named by `cls.__op__` with
+    their attributes as keyword args (only if `__append_doc__` is truthy)."""
+    op = getattr(
+        importlib.import_module(cls.__op__.__module__), cls.__op__.__name__)
+    cls.__category__ = getattr(cls, '__category__', None) or 'op'
+
+    def partial_apply(self, *args, **kwargs):
+        return op(*args, **dict(self.__dict__, **kwargs))
+
+    if getattr(cls, '__append_doc__', True):  # XXX should default to True?
+        if sys.version_info[0] > 2:
+            cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__)
+            cls.__init__.__doc__ = op.__doc__
+            cls.__call__ = partial_apply
+            cls.__call__.__doc__ = op.__doc__
+        else:
+            # XXX work around for python 2
+            partial_apply.__doc__ = op.__doc__
+            cls.__call__ = partial_apply
+    return cls
+
+
+def register(cls):
+    """
+    Register a given module class: store its schema in the global config.
+
+    Args:
+        cls (type): Module class to be registered.
+
+    Returns: cls
+    """
+    name = cls.__name__
+    if name in global_config:
+        raise ValueError("Module class already registered: {}".format(name))
+    # classes that declare `__op__` are first turned into op wrappers
+    registered = make_partial(cls) if hasattr(cls, '__op__') else cls
+    global_config[registered.__name__] = extract_schema(registered)
+    return registered
+
+
+def create(cls_or_name, **kwargs):
+    """
+    Create an instance of given module class.
+
+    Args:
+        cls_or_name (type or str): Class of which to create instance.
+        kwargs: forwarded to `cls.from_config` when the class defines it.
+
+    Returns: instance of type `cls_or_name`
+    Raises:
+        ValueError: unregistered module or unresolvable injection.
+    """
+    assert isinstance(cls_or_name,
+                      (type, str)), "should be a class or name of a class"
+    # conditional expression: `a and b or c` misfires on an empty string
+    name = cls_or_name if isinstance(cls_or_name, str) else cls_or_name.__name__
+    if name not in global_config:
+        raise ValueError("The module {} is not registered".format(name))
+    config = global_config[name]
+    if not isinstance(config, SchemaDict):
+        if hasattr(config, "__dict__"):
+            # support instance return directly
+            return config
+        raise ValueError("The module {} is not registered".format(name))
+
+    cls = getattr(config.pymodule, name)
+    cls_kwargs = {}
+    cls_kwargs.update(global_config[name])
+
+    # parse `shared` annotation of registered modules
+    if getattr(config, 'shared', None):
+        for k in config.shared:
+            target_key = config[k]
+            shared_conf = config.schema[k].default
+            assert isinstance(shared_conf, SharedConfig)
+            if target_key is not None and not isinstance(target_key,
+                                                         SharedConfig):
+                continue  # value is given for the module
+            elif shared_conf.key in global_config:
+                # `key` is present in config
+                cls_kwargs[k] = global_config[shared_conf.key]
+            else:
+                cls_kwargs[k] = shared_conf.default_value
+
+    if getattr(cls, 'from_config', None):
+        cls_kwargs.update(cls.from_config(config, **kwargs))
+
+    # parse `inject` annotation of registered modules
+    if getattr(config, 'inject', None):
+        for k in config.inject:
+            target_key = config[k]
+            # optional dependency
+            if target_key is None:
+                continue
+            if isinstance(target_key, dict) or hasattr(target_key, '__dict__'):
+                if 'name' not in target_key.keys():
+                    continue
+                inject_name = str(target_key['name'])
+                if inject_name not in global_config:
+                    raise ValueError(
+                        "Missing injection name {} and check it's name in cfg file".
+                        format(k))
+                target = global_config[inject_name]
+                # NOTE: overrides are written into the global entry itself
+                for i, v in target_key.items():
+                    if i == 'name':
+                        continue
+                    target[i] = v
+                if isinstance(target, SchemaDict):
+                    cls_kwargs[k] = create(inject_name)
+            elif isinstance(target_key, str):
+                if target_key not in global_config:
+                    raise ValueError("Missing injection config:", target_key)
+                target = global_config[target_key]
+                if isinstance(target, SchemaDict):
+                    cls_kwargs[k] = create(target_key)
+                elif hasattr(target, '__dict__'):  # serialized object
+                    cls_kwargs[k] = target
+            else:
+                raise ValueError("Unsupported injection type:", target_key)
+    # NOTE: values are passed by reference, so an instance that mutates a
+    # list/dict argument also changes the global config entry
+    return cls(**cls_kwargs)
diff --git a/rtdetr_paddle/ppdet/data/__init__.py b/rtdetr_paddle/ppdet/data/__init__.py
new file mode 100644
index 0000000..a12aa32
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from . import source
+from . import transform
+from . import reader
+
+from .source import *
+from .transform import *
+from .reader import *
diff --git a/rtdetr_paddle/ppdet/data/reader.py b/rtdetr_paddle/ppdet/data/reader.py
new file mode 100644
index 0000000..587f3ae
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/reader.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import traceback
+import six
+import sys
+if sys.version_info >= (3, 0):
+    pass
+else:
+    pass
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from copy import deepcopy
+
+from paddle.io import DataLoader, DistributedBatchSampler
+from .utils import default_collate_fn
+
+from ppdet.core.workspace import register
+from . import transform
+from .shm_utils import _get_shared_memory_size_in_M
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('reader')
+
+MAIN_PID = os.getpid()
+
+
+class Compose(object):
+    """Chain of sample transforms built from a list of dicts that map
+    an op name in the `transform` module to that op's keyword args."""
+
+    def __init__(self, transforms, num_classes=80):
+        self.transforms = transforms
+        self.transforms_cls = []
+        for op_spec in self.transforms:
+            for op_name, op_kwargs in op_spec.items():
+                op = getattr(transform, op_name)(**op_kwargs)
+                if hasattr(op, 'num_classes'):
+                    op.num_classes = num_classes
+                self.transforms_cls.append(op)
+
+    def __call__(self, data):
+        for op in self.transforms_cls:
+            try:
+                data = op(data)
+            except Exception as e:
+                stack = traceback.format_exc()
+                logger.warning("fail to map sample transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   op, e, str(stack)))
+                raise e
+        return data
+
+
+class BatchCompose(Compose):
+    """Batch-level transform chain that also assembles the final batch:
+    either through `default_collate_fn` or, with `collate_batch=False`,
+    key by key while leaving ground-truth fields as per-sample lists."""
+
+    def __init__(self, transforms, num_classes=80, collate_batch=True):
+        super(BatchCompose, self).__init__(transforms, num_classes)
+        self.collate_batch = collate_batch
+
+    def __call__(self, data):
+        for op in self.transforms_cls:
+            try:
+                data = op(data)
+            except Exception as e:
+                stack = traceback.format_exc()
+                logger.warning("fail to map batch transform [{}] "
+                               "with error: {} and stack:\n{}".format(
+                                   op, e, str(stack)))
+                raise e
+
+        # drop per-sample keys the model does not consume
+        for sample in data:
+            for key in ('h', 'w', 'flipped'):
+                sample.pop(key, None)
+
+        # batch data; plug a user-defined batch function in here if needed
+        if self.collate_batch:
+            return default_collate_fn(data)
+
+        batch_data = {}
+        for key in data[0].keys():
+            column = [sample[key] for sample in data]
+            # gt_*, is_crowd and difficult fields stay lists (not stacked)
+            if not ('gt_' in key or 'is_crowd' in key or 'difficult' in key):
+                column = np.stack(column, axis=0)
+            batch_data[key] = column
+        return batch_data
+
+
+class BaseDataLoader(object):
+    """
+    Base DataLoader implementation for detection models
+
+    Args:
+        sample_transforms (list): transforms applied to every sample
+        batch_transforms (list): transforms applied to every batch
+        batch_size (int): batch size for batch collating, default 1.
+        shuffle (bool): whether to shuffle samples
+        drop_last (bool): whether to drop the last incomplete batch,
+            default False
+        num_classes (int): class number of dataset, default 80
+        collate_batch (bool): whether to collate batch in dataloader.
+            If set to True, the samples will collate into batch according
+            to the batch size. Otherwise, the ground-truth will not collate,
+            which is used when the number of ground-truth is different in
+            samples.
+        use_shared_memory (bool): whether to use shared memory to
+            accelerate data loading, enable this only if you are sure
+            that the shared memory size of your OS is larger than memory
+            cost of input datas of model. Note that shared memory will be
+            automatically disabled if the shared memory of OS is less
+            than 1G, which is not enough for detection models.
+            Default False.
+        kwargs: extra options handed to `dataset.set_kwargs` on call
+    """
+
+    def __init__(self,
+                 sample_transforms=[],
+                 batch_transforms=[],
+                 batch_size=1,
+                 shuffle=False,
+                 drop_last=False,
+                 num_classes=80,
+                 collate_batch=True,
+                 use_shared_memory=False,
+                 **kwargs):
+        # per-sample pipeline, then the per-batch pipeline that collates
+        self._sample_transforms = Compose(
+            sample_transforms, num_classes=num_classes)
+        self._batch_transforms = BatchCompose(batch_transforms, num_classes,
+                                              collate_batch)
+
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.drop_last = drop_last
+        self.use_shared_memory = use_shared_memory
+        self.kwargs = kwargs
+
+    def __call__(self,
+                 dataset,
+                 worker_num,
+                 batch_sampler=None,
+                 return_list=False):
+        """Bind `dataset`, build the paddle DataLoader and return self."""
+        self.dataset = dataset
+        self.dataset.check_or_download_dataset()
+        self.dataset.parse_dataset()
+        # get data
+        self.dataset.set_transform(self._sample_transforms)
+        # set kwargs
+        self.dataset.set_kwargs(**self.kwargs)
+
+        # batch sampler: use the given one, else a distributed sampler
+        if batch_sampler is not None:
+            self._batch_sampler = batch_sampler
+        else:
+            self._batch_sampler = DistributedBatchSampler(
+                self.dataset,
+                batch_size=self.batch_size,
+                shuffle=self.shuffle,
+                drop_last=self.drop_last)
+
+        # DataLoader do not start sub-process in Windows and Mac
+        # system, do not need to use shared memory
+        shm_enabled = self.use_shared_memory and \
+                      sys.platform not in ['win32', 'darwin']
+        # check whether shared memory size is bigger than 1G(1024M)
+        if shm_enabled:
+            shm_size = _get_shared_memory_size_in_M()
+            if shm_size is not None and shm_size < 1024.:
+                logger.warning("Shared memory size is less than 1G, "
+                               "disable shared_memory in DataLoader")
+                shm_enabled = False
+
+        self.dataloader = DataLoader(
+            dataset=self.dataset,
+            batch_sampler=self._batch_sampler,
+            collate_fn=self._batch_transforms,
+            num_workers=worker_num,
+            return_list=return_list,
+            use_shared_memory=shm_enabled)
+        self.loader = iter(self.dataloader)
+        return self
+
+    def __len__(self):
+        return len(self._batch_sampler)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        try:
+            return next(self.loader)
+        except StopIteration:
+            # create a fresh iterator so the reader can be iterated again,
+            # then let the StopIteration propagate to the caller
+            self.loader = iter(self.dataloader)
+            six.reraise(*sys.exc_info())
+
+    def next(self):
+        # python2 compatibility
+        return self.__next__()
+
+
+@register
+class TrainReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self, sample_transforms=[], batch_transforms=[],
+                 batch_size=1, shuffle=True, drop_last=True, num_classes=80,
+                 collate_batch=True, **kwargs):
+        super(TrainReader, self).__init__(  # base loader does all the work
+            sample_transforms=sample_transforms,
+            batch_transforms=batch_transforms,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            num_classes=num_classes,
+            collate_batch=collate_batch,
+            **kwargs)
+
+
+@register
+class EvalReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self, sample_transforms=[], batch_transforms=[],
+                 batch_size=1, shuffle=False, drop_last=False,
+                 num_classes=80, **kwargs):
+        super(EvalReader, self).__init__(  # base loader does all the work
+            sample_transforms=sample_transforms,
+            batch_transforms=batch_transforms,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            num_classes=num_classes,
+            **kwargs)
+
+
+@register
+class TestReader(BaseDataLoader):
+    __shared__ = ['num_classes']
+
+    def __init__(self, sample_transforms=[], batch_transforms=[],
+                 batch_size=1, shuffle=False, drop_last=False,
+                 num_classes=80, **kwargs):
+        super(TestReader, self).__init__(  # base loader does all the work
+            sample_transforms=sample_transforms,
+            batch_transforms=batch_transforms,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            num_classes=num_classes,
+            **kwargs)
+
diff --git a/rtdetr_paddle/ppdet/data/shm_utils.py b/rtdetr_paddle/ppdet/data/shm_utils.py
new file mode 100644
index 0000000..a929a80
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/shm_utils.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+SIZE_UNIT = ['K', 'M', 'G', 'T']  # unit suffixes, each 1024x the previous one
+SHM_QUERY_CMD = 'df -h'  # command whose output is scanned for 'shm' lines
+SHM_KEY = 'shm'  # substring that marks a shared-memory line of the df output
+SHM_DEFAULT_MOUNT = '/dev/shm'  # preferred mount when several lines match
+
+# [ shared memory size check ]
+# In detection models, image/target data occupies a lot of memory, and
+# will occupy lots of shared memory in multi-process DataLoader, we use
+# following code to get shared memory size and perform a size check to
+# disable shared memory use if shared memory size is not enough.
+# Shared memory getting process as follows:
+# 1. use `df -h` get all mount info
+# 2. pick up spaces whose mount info contains 'shm'
+# 3. if 'shm' space number is only 1, return its size
+# 4. if there are multiple 'shm' spaces, prefer the default mount
+#    directory '/dev/shm' of Linux-like systems, otherwise return the
+#    biggest space size.
+
+
+def _parse_size_in_M(size_str):
+    """Convert a `df -h` size such as '64M', '1.5G', '2GB' or '0' to MB.
+    A value without unit suffix (df prints a bare '0') counts as bytes."""
+    digits = size_str.rstrip('Bi')
+    if digits[-1:].isdigit():
+        return float(digits) / 1024 ** 2
+    num, unit = digits[:-1], digits[-1]
+    assert unit in SIZE_UNIT, "unknown shm size unit {}".format(unit)
+    return float(num) * (1024 ** (SIZE_UNIT.index(unit) - 1))
+
+
+def _get_shared_memory_size_in_M():
+    """Return the shared memory size in MB, read from `df -h` output.
+
+    Reads the 4th column of lines containing 'shm'; None if `df` fails or
+    none match. Of several matches `/dev/shm` wins, else the largest.
+    """
+    try:
+        # `with` closes the pipe, which the bare popen() call left open
+        with os.popen(SHM_QUERY_CMD) as pipe:
+            df_infos = pipe.readlines()
+    except Exception:  # was a bare except, which also caught SystemExit
+        return None
+
+    shm_infos = [ln.split() for ln in df_infos if SHM_KEY in ln]
+    if not shm_infos:
+        return None
+    if len(shm_infos) == 1:
+        return _parse_size_in_M(shm_infos[0][3])
+    default_mount_infos = [
+        si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT
+    ]
+    if default_mount_infos:
+        return _parse_size_in_M(default_mount_infos[0][3])
+    return max([_parse_size_in_M(si[3]) for si in shm_infos])
diff --git a/rtdetr_paddle/ppdet/data/source/__init__.py b/rtdetr_paddle/ppdet/data/source/__init__.py
new file mode 100644
index 0000000..0c44b43
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/source/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .coco import *
+from .voc import *
+from .category import *
+from .dataset import ImageFolder
diff --git a/rtdetr_paddle/ppdet/data/source/category.py b/rtdetr_paddle/ppdet/data/source/category.py
new file mode 100644
index 0000000..c927897
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/source/category.py
@@ -0,0 +1,926 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from ppdet.data.source.voc import pascalvoc_label
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['get_categories']
+
+
+def _load_txt_categories(anno_file):
+    """
+    Build the two category maps from a label list file holding one
+    category name per line.
+
+    A leading 'background' entry is dropped; the remaining names are
+    numbered from 0 in file order, and class id equals category id.
+
+    Args:
+        anno_file (str): path of the label list file
+
+    Returns: tuple (clsid2catid, catid2name)
+    """
+    with open(anno_file) as f:
+        cats = [line.strip() for line in f.readlines()]
+    # `cats and ...`: an empty file yields empty maps, not an IndexError
+    if cats and cats[0] == 'background':
+        cats = cats[1:]
+
+    clsid2catid = {i: i for i in range(len(cats))}
+    catid2name = {i: name for i, name in enumerate(cats)}
+    return clsid2catid, catid2name
+
+
+def get_categories(metric_type, anno_file=None, arch=None):
+    """
+    Get class id to category id map and category id
+    to category name map from annotation file.
+
+    Args:
+        metric_type (str): metric type, matched case-insensitively.
+            Supported: 'coco', 'rbox', 'snipercoco', 'voc', 'oid',
+            'keypointtopdowncocoeval', 'keypointtopdownmpiieval',
+            'pose3deval', 'mot', 'motdet', 'reid', 'kitti',
+            'bdd100kmot' and 'mcmot'.
+        anno_file (str): annotation file path; when it is None or not
+            an existing file, the default categories of the metric
+            type are used.
+        arch (str): 'keypoint_arch' short-circuits to the keypoint
+            placeholder categories.
+
+    Returns: tuple (clsid2catid, catid2name)
+
+    Raises:
+        ValueError: unknown metric type, or a COCO-style annotation
+            file that is neither json nor txt.
+    """
+    if arch == 'keypoint_arch':
+        return (None, {'id': 'keypoint'})
+
+    # evaluate the file check once; every branch below that reads
+    # `anno_file` relies on it
+    has_anno_file = bool(anno_file) and os.path.isfile(anno_file)
+    if not has_anno_file:
+        logger.warning(
+            "anno_file '{}' is None or not set or not exist, "
+            "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, "
+            "otherwise the default categories will be used by metric_type.".
+            format(anno_file))
+
+    # metric names are compared case-insensitively
+    metric = metric_type.lower()
+    if metric in ('coco', 'rbox', 'snipercoco'):
+        if has_anno_file:
+            if anno_file.endswith('json'):
+                # lazy import pycocotools here
+                from pycocotools.coco import COCO
+                coco = COCO(anno_file)
+                cats = coco.loadCats(coco.getCatIds())
+
+                clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
+                catid2name = {cat['id']: cat['name'] for cat in cats}
+                return clsid2catid, catid2name
+            if anno_file.endswith('txt'):
+                return _load_txt_categories(anno_file)
+            raise ValueError("anno_file {} should be json or txt.".format(
+                anno_file))
+
+        # anno file not exist, load default categories
+        if metric == 'rbox':
+            logger.warning(
+                "metric_type: {}, load default categories of DOTA.".format(
+                    metric_type))
+            return _dota_category()
+        logger.warning("metric_type: {}, load default categories of COCO.".
+                       format(metric_type))
+        return _coco17_category()
+
+    if metric == 'voc':
+        if has_anno_file:
+            return _load_txt_categories(anno_file)
+        # anno file not exist, load default categories of
+        # VOC all 20 categories
+        logger.warning("metric_type: {}, load default categories of VOC.".
+                       format(metric_type))
+        return _vocall_category()
+
+    if metric == 'oid':
+        if has_anno_file:
+            logger.warning("only default categories support for OID19")
+        return _oid19_category()
+
+    # keypoint / pose metrics use fixed placeholder categories
+    if metric in ('keypointtopdowncocoeval', 'keypointtopdownmpiieval'):
+        return (None, {'id': 'keypoint'})
+
+    if metric == 'pose3deval':
+        return (None, {'id': 'pose3d'})
+
+    if metric in ('mot', 'motdet', 'reid'):
+        if has_anno_file:
+            return _load_txt_categories(anno_file)
+        # anno file not exist, load default category 'pedestrian'.
+        logger.warning(
+            "metric_type: {}, load default categories of pedestrian MOT.".
+            format(metric_type))
+        return _mot_category(category='pedestrian')
+
+    # these metrics always use the default 'vehicle' category
+    if metric in ('kitti', 'bdd100kmot'):
+        return _mot_category(category='vehicle')
+
+    if metric == 'mcmot':
+        if has_anno_file:
+            return _load_txt_categories(anno_file)
+        # anno file not exist, load default categories of
+        # visdrone all 10 categories
+        logger.warning(
+            "metric_type: {}, load default categories of VisDrone.".format(
+                metric_type))
+        return _visdrone_category()
+
+    raise ValueError("unknown metric type {}".format(metric_type))
+
+
+def _mot_category(category='pedestrian'):
+    """
+    Category maps of a single-class MOT dataset.
+
+    Args:
+        category (str): name of the only category, which gets id 0
+
+    Returns: tuple (clsid2catid, catid2name)
+    """
+    names = [category]
+    clsid2catid = {idx: idx for idx, _ in enumerate(names)}
+    catid2name = dict(enumerate(names))
+    return clsid2catid, catid2name
+
+
+def _coco17_category():
+    """
+    Return (clsid2catid, catid2name) for COCO2017: class ids 0..79 mapped
+    to the sparse COCO category ids 1..90, and category id -> name.
+    The 'background' entry (id 0) is dropped before returning.
+    """
+    clsid2catid = {
+        1: 1,
+        2: 2,
+        3: 3,
+        4: 4,
+        5: 5,
+        6: 6,
+        7: 7,
+        8: 8,
+        9: 9,
+        10: 10,
+        11: 11,
+        12: 13,
+        13: 14,
+        14: 15,
+        15: 16,
+        16: 17,
+        17: 18,
+        18: 19,
+        19: 20,
+        20: 21,
+        21: 22,
+        22: 23,
+        23: 24,
+        24: 25,
+        25: 27,
+        26: 28,
+        27: 31,
+        28: 32,
+        29: 33,
+        30: 34,
+        31: 35,
+        32: 36,
+        33: 37,
+        34: 38,
+        35: 39,
+        36: 40,
+        37: 41,
+        38: 42,
+        39: 43,
+        40: 44,
+        41: 46,
+        42: 47,
+        43: 48,
+        44: 49,
+        45: 50,
+        46: 51,
+        47: 52,
+        48: 53,
+        49: 54,
+        50: 55,
+        51: 56,
+        52: 57,
+        53: 58,
+        54: 59,
+        55: 60,
+        56: 61,
+        57: 62,
+        58: 63,
+        59: 64,
+        60: 65,
+        61: 67,
+        62: 70,
+        63: 72,
+        64: 73,
+        65: 74,
+        66: 75,
+        67: 76,
+        68: 77,
+        69: 78,
+        70: 79,
+        71: 80,
+        72: 81,
+        73: 82,
+        74: 84,
+        75: 85,
+        76: 86,
+        77: 87,
+        78: 88,
+        79: 89,
+        80: 90
+    }
+
+    catid2name = {
+        0: 'background',
+        1: 'person',
+        2: 'bicycle',
+        3: 'car',
+        4: 'motorcycle',
+        5: 'airplane',
+        6: 'bus',
+        7: 'train',
+        8: 'truck',
+        9: 'boat',
+        10: 'traffic light',
+        11: 'fire hydrant',
+        13: 'stop sign',
+        14: 'parking meter',
+        15: 'bench',
+        16: 'bird',
+        17: 'cat',
+        18: 'dog',
+        19: 'horse',
+        20: 'sheep',
+        21: 'cow',
+        22: 'elephant',
+        23: 'bear',
+        24: 'zebra',
+        25: 'giraffe',
+        27: 'backpack',
+        28: 'umbrella',
+        31: 'handbag',
+        32: 'tie',
+        33: 'suitcase',
+        34: 'frisbee',
+        35: 'skis',
+        36: 'snowboard',
+        37: 'sports ball',
+        38: 'kite',
+        39: 'baseball bat',
+        40: 'baseball glove',
+        41: 'skateboard',
+        42: 'surfboard',
+        43: 'tennis racket',
+        44: 'bottle',
+        46: 'wine glass',
+        47: 'cup',
+        48: 'fork',
+        49: 'knife',
+        50: 'spoon',
+        51: 'bowl',
+        52: 'banana',
+        53: 'apple',
+        54: 'sandwich',
+        55: 'orange',
+        56: 'broccoli',
+        57: 'carrot',
+        58: 'hot dog',
+        59: 'pizza',
+        60: 'donut',
+        61: 'cake',
+        62: 'chair',
+        63: 'couch',
+        64: 'potted plant',
+        65: 'bed',
+        67: 'dining table',
+        70: 'toilet',
+        72: 'tv',
+        73: 'laptop',
+        74: 'mouse',
+        75: 'remote',
+        76: 'keyboard',
+        77: 'cell phone',
+        78: 'microwave',
+        79: 'oven',
+        80: 'toaster',
+        81: 'sink',
+        82: 'refrigerator',
+        84: 'book',
+        85: 'clock',
+        86: 'vase',
+        87: 'scissors',
+        88: 'teddy bear',
+        89: 'hair drier',
+        90: 'toothbrush'
+    }
+
+    clsid2catid = {k - 1: v for k, v in clsid2catid.items()}  # shift class ids to start at 0
+    catid2name.pop(0)  # remove the 'background' entry
+
+    return clsid2catid, catid2name
+
+
+def _dota_category():
+    """
+    Build the class-id -> category-id map and the category-id -> name
+    map for the DOTA dataset (category ids start at 1; no background).
+    """
+    # Names in category-id order; the first one gets category id 1.
+    names = (
+        'plane',
+        'baseball-diamond',
+        'bridge',
+        'ground-track-field',
+        'small-vehicle',
+        'large-vehicle',
+        'ship',
+        'tennis-court',
+        'basketball-court',
+        'storage-tank',
+        'soccer-ball-field',
+        'roundabout',
+        'harbor',
+        'swimming-pool',
+        'helicopter',
+    )
+    catid2name = dict(enumerate(names, start=1))
+    clsid2catid = {catid - 1: catid for catid in catid2name}
+    return clsid2catid, catid2name
+
+
+def _vocall_category():
+    """
+    Build the class-id -> category-id map and the category-id -> name
+    map for the mixup VOC dataset, with names ordered by the value that
+    pascalvoc_label() maps each of them to.
+    """
+    by_value = sorted(pascalvoc_label().items(), key=lambda kv: kv[1])
+    names = [name for name, _ in by_value]
+
+    # Class ids and category ids coincide for this dataset.
+    clsid2catid = {idx: idx for idx in range(len(names))}
+    catid2name = dict(enumerate(names))
+
+    return clsid2catid, catid2name
+
+
+def _oid19_category():  # returns (clsid2catid, catid2name) for the 500-class OID19 table
+    clsid2catid = {k: k + 1 for k in range(500)}  # class ids 0..499 -> category ids 1..500
+
+    catid2name = {
+        0: "background",  # kept in the returned map; no class id maps to 0
+        1: "Infant bed",
+        2: "Rose",
+        3: "Flag",
+        4: "Flashlight",
+        5: "Sea turtle",
+        6: "Camera",
+        7: "Animal",
+        8: "Glove",
+        9: "Crocodile",
+        10: "Cattle",
+        11: "House",
+        12: "Guacamole",
+        13: "Penguin",
+        14: "Vehicle registration plate",
+        15: "Bench",
+        16: "Ladybug",
+        17: "Human nose",
+        18: "Watermelon",
+        19: "Flute",
+        20: "Butterfly",
+        21: "Washing machine",
+        22: "Raccoon",
+        23: "Segway",
+        24: "Taco",
+        25: "Jellyfish",
+        26: "Cake",
+        27: "Pen",
+        28: "Cannon",
+        29: "Bread",
+        30: "Tree",
+        31: "Shellfish",
+        32: "Bed",
+        33: "Hamster",
+        34: "Hat",
+        35: "Toaster",
+        36: "Sombrero",
+        37: "Tiara",
+        38: "Bowl",
+        39: "Dragonfly",
+        40: "Moths and butterflies",
+        41: "Antelope",
+        42: "Vegetable",
+        43: "Torch",
+        44: "Building",
+        45: "Power plugs and sockets",
+        46: "Blender",
+        47: "Billiard table",
+        48: "Cutting board",
+        49: "Bronze sculpture",
+        50: "Turtle",
+        51: "Broccoli",
+        52: "Tiger",
+        53: "Mirror",
+        54: "Bear",
+        55: "Zucchini",
+        56: "Dress",
+        57: "Volleyball",
+        58: "Guitar",
+        59: "Reptile",
+        60: "Golf cart",
+        61: "Tart",
+        62: "Fedora",
+        63: "Carnivore",
+        64: "Car",
+        65: "Lighthouse",
+        66: "Coffeemaker",
+        67: "Food processor",
+        68: "Truck",
+        69: "Bookcase",
+        70: "Surfboard",
+        71: "Footwear",
+        72: "Bench",
+        73: "Necklace",
+        74: "Flower",
+        75: "Radish",
+        76: "Marine mammal",
+        77: "Frying pan",
+        78: "Tap",
+        79: "Peach",
+        80: "Knife",
+        81: "Handbag",
+        82: "Laptop",
+        83: "Tent",
+        84: "Ambulance",
+        85: "Christmas tree",
+        86: "Eagle",
+        87: "Limousine",
+        88: "Kitchen & dining room table",
+        89: "Polar bear",
+        90: "Tower",
+        91: "Football",
+        92: "Willow",
+        93: "Human head",
+        94: "Stop sign",
+        95: "Banana",
+        96: "Mixer",
+        97: "Binoculars",
+        98: "Dessert",
+        99: "Bee",
+        100: "Chair",
+        101: "Wood-burning stove",
+        102: "Flowerpot",
+        103: "Beaker",
+        104: "Oyster",
+        105: "Woodpecker",
+        106: "Harp",
+        107: "Bathtub",
+        108: "Wall clock",
+        109: "Sports uniform",
+        110: "Rhinoceros",
+        111: "Beehive",
+        112: "Cupboard",
+        113: "Chicken",
+        114: "Man",
+        115: "Blue jay",
+        116: "Cucumber",
+        117: "Balloon",
+        118: "Kite",
+        119: "Fireplace",
+        120: "Lantern",
+        121: "Missile",
+        122: "Book",
+        123: "Spoon",
+        124: "Grapefruit",
+        125: "Squirrel",
+        126: "Orange",
+        127: "Coat",
+        128: "Punching bag",
+        129: "Zebra",
+        130: "Billboard",
+        131: "Bicycle",
+        132: "Door handle",
+        133: "Mechanical fan",
+        134: "Ring binder",
+        135: "Table",
+        136: "Parrot",
+        137: "Sock",
+        138: "Vase",
+        139: "Weapon",
+        140: "Shotgun",
+        141: "Glasses",
+        142: "Seahorse",
+        143: "Belt",
+        144: "Watercraft",
+        145: "Window",
+        146: "Giraffe",
+        147: "Lion",
+        148: "Tire",
+        149: "Vehicle",
+        150: "Canoe",
+        151: "Tie",
+        152: "Shelf",
+        153: "Picture frame",
+        154: "Printer",
+        155: "Human leg",
+        156: "Boat",
+        157: "Slow cooker",
+        158: "Croissant",
+        159: "Candle",
+        160: "Pancake",
+        161: "Pillow",
+        162: "Coin",
+        163: "Stretcher",
+        164: "Sandal",
+        165: "Woman",
+        166: "Stairs",
+        167: "Harpsichord",
+        168: "Stool",
+        169: "Bus",
+        170: "Suitcase",
+        171: "Human mouth",
+        172: "Juice",
+        173: "Skull",
+        174: "Door",
+        175: "Violin",
+        176: "Chopsticks",
+        177: "Digital clock",
+        178: "Sunflower",
+        179: "Leopard",
+        180: "Bell pepper",
+        181: "Harbor seal",
+        182: "Snake",
+        183: "Sewing machine",
+        184: "Goose",
+        185: "Helicopter",
+        186: "Seat belt",
+        187: "Coffee cup",
+        188: "Microwave oven",
+        189: "Hot dog",
+        190: "Countertop",
+        191: "Serving tray",
+        192: "Dog bed",
+        193: "Beer",
+        194: "Sunglasses",
+        195: "Golf ball",
+        196: "Waffle",
+        197: "Palm tree",
+        198: "Trumpet",
+        199: "Ruler",
+        200: "Helmet",
+        201: "Ladder",
+        202: "Office building",
+        203: "Tablet computer",
+        204: "Toilet paper",
+        205: "Pomegranate",
+        206: "Skirt",
+        207: "Gas stove",
+        208: "Cookie",
+        209: "Cart",
+        210: "Raven",
+        211: "Egg",
+        212: "Burrito",
+        213: "Goat",
+        214: "Kitchen knife",
+        215: "Skateboard",
+        216: "Salt and pepper shakers",
+        217: "Lynx",
+        218: "Boot",
+        219: "Platter",
+        220: "Ski",
+        221: "Swimwear",
+        222: "Swimming pool",
+        223: "Drinking straw",
+        224: "Wrench",
+        225: "Drum",
+        226: "Ant",
+        227: "Human ear",
+        228: "Headphones",
+        229: "Fountain",
+        230: "Bird",
+        231: "Jeans",
+        232: "Television",
+        233: "Crab",
+        234: "Microphone",
+        235: "Home appliance",
+        236: "Snowplow",
+        237: "Beetle",
+        238: "Artichoke",
+        239: "Jet ski",
+        240: "Stationary bicycle",
+        241: "Human hair",
+        242: "Brown bear",
+        243: "Starfish",
+        244: "Fork",
+        245: "Lobster",
+        246: "Corded phone",
+        247: "Drink",
+        248: "Saucer",
+        249: "Carrot",
+        250: "Insect",
+        251: "Clock",
+        252: "Castle",
+        253: "Tennis racket",
+        254: "Ceiling fan",
+        255: "Asparagus",
+        256: "Jaguar",
+        257: "Musical instrument",
+        258: "Train",
+        259: "Cat",
+        260: "Rifle",
+        261: "Dumbbell",
+        262: "Mobile phone",
+        263: "Taxi",
+        264: "Shower",
+        265: "Pitcher",
+        266: "Lemon",
+        267: "Invertebrate",
+        268: "Turkey",
+        269: "High heels",
+        270: "Bust",
+        271: "Elephant",
+        272: "Scarf",
+        273: "Barrel",
+        274: "Trombone",
+        275: "Pumpkin",
+        276: "Box",
+        277: "Tomato",
+        278: "Frog",
+        279: "Bidet",
+        280: "Human face",
+        281: "Houseplant",
+        282: "Van",
+        283: "Shark",
+        284: "Ice cream",
+        285: "Swim cap",
+        286: "Falcon",
+        287: "Ostrich",
+        288: "Handgun",
+        289: "Whiteboard",
+        290: "Lizard",
+        291: "Pasta",
+        292: "Snowmobile",
+        293: "Light bulb",
+        294: "Window blind",
+        295: "Muffin",
+        296: "Pretzel",
+        297: "Computer monitor",
+        298: "Horn",
+        299: "Furniture",
+        300: "Sandwich",
+        301: "Fox",
+        302: "Convenience store",
+        303: "Fish",
+        304: "Fruit",
+        305: "Earrings",
+        306: "Curtain",
+        307: "Grape",
+        308: "Sofa bed",
+        309: "Horse",
+        310: "Luggage and bags",
+        311: "Desk",
+        312: "Crutch",
+        313: "Bicycle helmet",
+        314: "Tick",
+        315: "Airplane",
+        316: "Canary",
+        317: "Spatula",
+        318: "Watch",
+        319: "Lily",
+        320: "Kitchen appliance",
+        321: "Filing cabinet",
+        322: "Aircraft",
+        323: "Cake stand",
+        324: "Candy",
+        325: "Sink",
+        326: "Mouse",
+        327: "Wine",
+        328: "Wheelchair",
+        329: "Goldfish",
+        330: "Refrigerator",
+        331: "French fries",
+        332: "Drawer",
+        333: "Treadmill",
+        334: "Picnic basket",
+        335: "Dice",
+        336: "Cabbage",
+        337: "Football helmet",
+        338: "Pig",
+        339: "Person",
+        340: "Shorts",
+        341: "Gondola",
+        342: "Honeycomb",
+        343: "Doughnut",
+        344: "Chest of drawers",
+        345: "Land vehicle",
+        346: "Bat",
+        347: "Monkey",
+        348: "Dagger",
+        349: "Tableware",
+        350: "Human foot",
+        351: "Mug",
+        352: "Alarm clock",
+        353: "Pressure cooker",
+        354: "Human hand",
+        355: "Tortoise",
+        356: "Baseball glove",
+        357: "Sword",
+        358: "Pear",
+        359: "Miniskirt",
+        360: "Traffic sign",
+        361: "Girl",
+        362: "Roller skates",
+        363: "Dinosaur",
+        364: "Porch",
+        365: "Human beard",
+        366: "Submarine sandwich",
+        367: "Screwdriver",
+        368: "Strawberry",
+        369: "Wine glass",
+        370: "Seafood",
+        371: "Racket",
+        372: "Wheel",
+        373: "Sea lion",
+        374: "Toy",
+        375: "Tea",
+        376: "Tennis ball",
+        377: "Waste container",
+        378: "Mule",
+        379: "Cricket ball",
+        380: "Pineapple",
+        381: "Coconut",
+        382: "Doll",
+        383: "Coffee table",
+        384: "Snowman",
+        385: "Lavender",
+        386: "Shrimp",
+        387: "Maple",
+        388: "Cowboy hat",
+        389: "Goggles",
+        390: "Rugby ball",
+        391: "Caterpillar",
+        392: "Poster",
+        393: "Rocket",
+        394: "Organ",
+        395: "Saxophone",
+        396: "Traffic light",
+        397: "Cocktail",
+        398: "Plastic bag",
+        399: "Squash",
+        400: "Mushroom",
+        401: "Hamburger",
+        402: "Light switch",
+        403: "Parachute",
+        404: "Teddy bear",
+        405: "Winter melon",
+        406: "Deer",
+        407: "Musical keyboard",
+        408: "Plumbing fixture",
+        409: "Scoreboard",
+        410: "Baseball bat",
+        411: "Envelope",
+        412: "Adhesive tape",
+        413: "Briefcase",
+        414: "Paddle",
+        415: "Bow and arrow",
+        416: "Telephone",
+        417: "Sheep",
+        418: "Jacket",
+        419: "Boy",
+        420: "Pizza",
+        421: "Otter",
+        422: "Office supplies",
+        423: "Couch",
+        424: "Cello",
+        425: "Bull",
+        426: "Camel",
+        427: "Ball",
+        428: "Duck",
+        429: "Whale",
+        430: "Shirt",
+        431: "Tank",
+        432: "Motorcycle",
+        433: "Accordion",
+        434: "Owl",
+        435: "Porcupine",
+        436: "Sun hat",
+        437: "Nail",
+        438: "Scissors",
+        439: "Swan",
+        440: "Lamp",
+        441: "Crown",
+        442: "Piano",
+        443: "Sculpture",
+        444: "Cheetah",
+        445: "Oboe",
+        446: "Tin can",
+        447: "Mango",
+        448: "Tripod",
+        449: "Oven",
+        450: "Mouse",
+        451: "Barge",
+        452: "Coffee",
+        453: "Snowboard",
+        454: "Common fig",
+        455: "Salad",
+        456: "Marine invertebrates",
+        457: "Umbrella",
+        458: "Kangaroo",
+        459: "Human arm",
+        460: "Measuring cup",
+        461: "Snail",
+        462: "Loveseat",
+        463: "Suit",
+        464: "Teapot",
+        465: "Bottle",
+        466: "Alpaca",
+        467: "Kettle",
+        468: "Trousers",
+        469: "Popcorn",
+        470: "Centipede",
+        471: "Spider",
+        472: "Sparrow",
+        473: "Plate",
+        474: "Bagel",
+        475: "Personal care",
+        476: "Apple",
+        477: "Brassiere",
+        478: "Bathroom cabinet",
+        479: "studio couch",
+        480: "Computer keyboard",
+        481: "Table tennis racket",
+        482: "Sushi",
+        483: "Cabinetry",
+        484: "Street light",
+        485: "Towel",
+        486: "Nightstand",
+        487: "Rabbit",
+        488: "Dolphin",
+        489: "Dog",
+        490: "Jug",
+        491: "Wok",
+        492: "Fire hydrant",
+        493: "Human eye",
+        494: "Skyscraper",
+        495: "Backpack",
+        496: "Potato",
+        497: "Paper towel",
+        498: "Lifejacket",
+        499: "Bicycle wheel",
+        500: "Toilet",
+    }
+
+    return clsid2catid, catid2name
+
+
+def _visdrone_category():
+    """Return (clsid2catid, catid2name) for the ten VisDrone categories."""
+    names = [
+        'pedestrian',
+        'people',
+        'bicycle',
+        'car',
+        'van',
+        'truck',
+        'tricycle',
+        'awning-tricycle',
+        'bus',
+        'motor',
+    ]
+    catid2name = dict(enumerate(names))
+    return {catid: catid for catid in catid2name}, catid2name
diff --git a/rtdetr_paddle/ppdet/data/source/coco.py b/rtdetr_paddle/ppdet/data/source/coco.py
new file mode 100644
index 0000000..330dae6
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/source/coco.py
@@ -0,0 +1,587 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import copy
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+import numpy as np
+from ppdet.core.workspace import register, serializable
+from .dataset import DetDataset
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet']
+
+
+@register
+@serializable
+class COCODataSet(DetDataset):
+    """
+    Load dataset with COCO format.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): coco annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        load_crowd (bool): whether to load crowded ground-truth.
+            False as default
+        allow_empty (bool): whether to load empty entry. False as default
+        empty_ratio (float): the ratio of empty record number to total
+            record's, if empty_ratio is out of [0. ,1.), do not sample the
+            records and use all the empty entries. 1. as default
+        repeat (int): repeat times for dataset, use in benchmark.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1):
+        super(COCODataSet, self).__init__(
+            dataset_dir,
+            image_dir,
+            anno_path,
+            data_fields,
+            sample_num,
+            repeat=repeat)
+        self.load_image_only = False
+        self.load_semantic = False
+        self.load_crowd = load_crowd
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+
+    def _sample_empty(self, records, num):
+        """Randomly subsample empty `records` relative to `num` non-empty ones."""
+        # if empty_ratio is out of [0. ,1.), do not sample the records
+        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
+            return records
+        import random
+        sample_num = min(
+            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
+        return random.sample(records, sample_num)
+
+    def parse_dataset(self):
+        """Parse the COCO json into self.roidbs (one record dict per image)."""
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
+        self.cname2cid = dict({
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        })
+
+        if 'annotations' not in coco.dataset:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contains ground truth '
+                           'and load image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                for inst in instances:
+                    # check gt bbox
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst.keys():
+                        continue
+                    else:
+                        if not any(np.array(inst['bbox'])):
+                            continue
+
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                # Drop non-crowd instances whose polygon list holds no points
+                # (e.g. [[]]) before allocating the gt arrays, so that all
+                # per-instance fields stay the same length and aligned.
+                if not self.allow_empty:
+                    bboxes = [
+                        b for b in bboxes
+                        if b.get('iscrowd') == 1 or not b.get('segmentation') or
+                        np.array(b['segmentation'], dtype=object).size > 0
+                    ]
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+                gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
+
+                has_segmentation = False
+                has_track_id = False
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    # crowd instances get an all-zero placeholder polygon
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        gt_poly[i] = box['segmentation']
+                        has_segmentation = True
+
+                    if 'track_id' in box:
+                        gt_track_id[i][0] = box['track_id']
+                        has_track_id = True
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_poly': gt_poly,
+                }
+                if has_track_id:
+                    gt_rec.update({'gt_track_id': gt_track_id})
+
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+
+@register
+@serializable
+class SlicedCOCODataSet(COCODataSet):
+    """COCO-format dataset whose images are cut into overlapping slices via sahi."""
+
+    def __init__(
+            self,
+            dataset_dir=None,
+            image_dir=None,
+            anno_path=None,
+            data_fields=['image'],
+            sample_num=-1,
+            load_crowd=False,
+            allow_empty=False,
+            empty_ratio=1.,
+            repeat=1,
+            sliced_size=[640, 640],  # used as [slice_height, slice_width]
+            overlap_ratio=[0.25, 0.25], ):  # used as [height ratio, width ratio]
+        super(SlicedCOCODataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num,
+            load_crowd=load_crowd,
+            allow_empty=allow_empty,
+            empty_ratio=empty_ratio,
+            repeat=repeat, )
+        self.sliced_size = sliced_size
+        self.overlap_ratio = overlap_ratio
+
+    def parse_dataset(self):  # fills self.roidbs with one record per image slice
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []  # never appended to in this method
+        ct = 0  # number of source images processed
+        ct_sub = 0  # total number of slices produced
+
+        self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
+        self.cname2cid = dict({
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        })
+
+        if 'annotations' not in coco.dataset:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contains ground truth '
+                           'and load image information only.'.format(anno_path))
+        try:
+            import sahi
+            from sahi.slicing import slice_image
+        except Exception as e:
+            logger.error(
+                'sahi not found, plaese install sahi. '
+                'for example: `pip install sahi`, see https://github.com/obss/sahi.'
+            )
+            raise e
+
+        sub_img_ids = 0  # NOTE(review): never incremented, so im_id restarts at 0 per image - confirm intended
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            slice_image_result = sahi.slicing.slice_image(
+                image=im_path,
+                slice_height=self.sliced_size[0],
+                slice_width=self.sliced_size[1],
+                overlap_height_ratio=self.overlap_ratio[0],
+                overlap_width_ratio=self.overlap_ratio[1])
+
+            sub_img_num = len(slice_image_result)
+            for _ind in range(sub_img_num):
+                im = slice_image_result.images[_ind]
+                coco_rec = {
+                    'image': im,
+                    'im_id': np.array([sub_img_ids + _ind]),
+                    'h': im.shape[0],
+                    'w': im.shape[1],
+                    'ori_im_id': np.array([img_id]),  # id of the image this slice was cut from
+                    'st_pix': np.array(
+                        slice_image_result.starting_pixels[_ind],
+                        dtype=np.float32),
+                    'is_last': 1 if _ind == sub_img_num - 1 else 0,  # 1 only on the final slice of this image
+                } if 'image' in self.data_fields else {}
+                records.append(coco_rec)
+            ct_sub += sub_img_num
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('{} samples and slice to {} sub_samples in file {}'.format(
+            ct, ct_sub, anno_path))
+        if self.allow_empty and len(empty_records) > 0:  # empty_records is always [] here, so this never runs
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+
+@register
+@serializable
+class SemiCOCODataSet(COCODataSet):
+    """
+    COCO-format dataset for semi-supervised training. One instance serves
+    either as the labeled (supervised) or as the unlabeled (unsupervised) set.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images, joined with `dataset_dir`.
+        anno_path (str): coco annotation file path, joined with `dataset_dir`.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        load_crowd (bool): whether to also load `iscrowd` annotations.
+        allow_empty (bool): whether to keep images without any valid box.
+        empty_ratio (float): forwarded unchanged to COCODataSet.
+        repeat (int): repeat times for dataset.
+        supervised (bool): if False, annotations are ignored and only image
+            information is loaded.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 load_crowd=False,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1,
+                 supervised=True):
+        super(SemiCOCODataSet, self).__init__(
+            dataset_dir, image_dir, anno_path, data_fields, sample_num,
+            load_crowd, allow_empty, empty_ratio, repeat)
+        self.supervised = supervised
+        # Number of unlabeled records drawn in parse_dataset(); the default
+        # -1 keeps every loaded record.
+        self.length = -1
+
+    def parse_dataset(self):
+        """
+        Read the COCO annotation file and fill `self.roidbs`.
+
+        Images whose file is missing or whose annotated size is negative are
+        skipped. Unless image-only loading is active, the boxes of every
+        image are validated and converted to `[x1, y1, x2, y2]`; images
+        without a valid box are dropped unless `allow_empty` is set. For the
+        unsupervised set with `self.length > 0`, the final records are
+        re-drawn at random (with replacement) to exactly `self.length` items.
+
+        Raises:
+            AssertionError: if `anno_path` is not a `.json` file or no record
+                could be loaded.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        assert anno_path.endswith('.json'), \
+            'invalid coco annotation file: ' + anno_path
+        from pycocotools.coco import COCO
+        coco = COCO(anno_path)
+        img_ids = coco.getImgIds()
+        img_ids.sort()
+        cat_ids = coco.getCatIds()
+        records = []
+        empty_records = []
+        ct = 0
+
+        # COCO category ids -> contiguous class ids, and class names -> ids.
+        self.catid2clsid = {catid: i for i, catid in enumerate(cat_ids)}
+        self.cname2cid = {
+            coco.loadCats(catid)[0]['name']: clsid
+            for catid, clsid in self.catid2clsid.items()
+        }
+
+        if 'annotations' not in coco.dataset or not self.supervised:
+            self.load_image_only = True
+            logger.warning('Annotation file: {} does not contains ground truth '
+                           'and load image information only.'.format(anno_path))
+
+        for img_id in img_ids:
+            img_anno = coco.loadImgs([img_id])[0]
+            im_fname = img_anno['file_name']
+            im_w = float(img_anno['width'])
+            im_h = float(img_anno['height'])
+
+            im_path = os.path.join(image_dir,
+                                   im_fname) if image_dir else im_fname
+            is_empty = False
+            if not os.path.exists(im_path):
+                logger.warning('Illegal image file: {}, and it will be '
+                               'ignored'.format(im_path))
+                continue
+
+            if im_w < 0 or im_h < 0:
+                logger.warning('Illegal width: {} or height: {} in annotation, '
+                               'and im_id: {} will be ignored'.format(
+                                   im_w, im_h, img_id))
+                continue
+
+            coco_rec = {
+                'im_file': im_path,
+                'im_id': np.array([img_id]),
+                'h': im_h,
+                'w': im_w,
+            } if 'image' in self.data_fields else {}
+
+            if not self.load_image_only:
+                ins_anno_ids = coco.getAnnIds(
+                    imgIds=[img_id], iscrowd=None if self.load_crowd else False)
+                instances = coco.loadAnns(ins_anno_ids)
+
+                bboxes = []
+                for inst in instances:
+                    # Skip ignored instances and those without a usable bbox.
+                    if inst.get('ignore', False):
+                        continue
+                    if 'bbox' not in inst:
+                        continue
+                    if not any(np.array(inst['bbox'])):
+                        continue
+
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
+                        bboxes.append(inst)
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: im_id: {}, '
+                            'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                img_id, float(inst['area']), x1, y1, x2, y2))
+
+                num_bbox = len(bboxes)
+                if num_bbox <= 0 and not self.allow_empty:
+                    continue
+                elif num_bbox <= 0:
+                    is_empty = True
+
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_poly = [None] * num_bbox
+
+                has_segmentation = False
+                # Row indices that survive the segmentation check. Rows are
+                # filtered once after the loop: removing entries while
+                # enumerating would skip the following box, and np.delete
+                # returns a new array instead of editing in place.
+                keep = []
+                for i, box in enumerate(bboxes):
+                    catid = box['category_id']
+                    gt_class[i][0] = self.catid2clsid[catid]
+                    gt_bbox[i, :] = box['clean_bbox']
+                    is_crowd[i][0] = box['iscrowd']
+                    if 'segmentation' in box and box['iscrowd'] == 1:
+                        # Crowd instances get an all-zero placeholder polygon.
+                        gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+                    elif 'segmentation' in box and box['segmentation']:
+                        has_segmentation = True
+                        # dtype=object keeps ragged multi-polygon lists from
+                        # raising during array construction.
+                        seg_size = np.array(
+                            box['segmentation'], dtype=object).size
+                        if seg_size == 0 and not self.allow_empty:
+                            # Empty segmentation: drop this instance.
+                            continue
+                        gt_poly[i] = box['segmentation']
+                    keep.append(i)
+
+                if len(keep) < num_bbox:
+                    keep_idx = np.asarray(keep, dtype=np.int64)
+                    gt_bbox = gt_bbox[keep_idx]
+                    gt_class = gt_class[keep_idx]
+                    is_crowd = is_crowd[keep_idx]
+                    gt_poly = [gt_poly[i] for i in keep]
+
+                if has_segmentation and not any(
+                        gt_poly) and not self.allow_empty:
+                    continue
+
+                gt_rec = {
+                    'is_crowd': is_crowd,
+                    'gt_class': gt_class,
+                    'gt_bbox': gt_bbox,
+                    'gt_poly': gt_poly,
+                }
+
+                # Only the requested ground-truth fields go into the record.
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        coco_rec[k] = v
+
+                # TODO: remove load_semantic
+                if self.load_semantic and 'semantic' in self.data_fields:
+                    seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
+                                            'train2017', im_fname[:-3] + 'png')
+                    coco_rec.update({'semantic': seg_path})
+
+            logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
+                im_path, img_id, im_h, im_w))
+            if is_empty:
+                empty_records.append(coco_rec)
+            else:
+                records.append(coco_rec)
+            ct += 1
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+        assert ct > 0, 'not found any coco record in %s' % (anno_path)
+        logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
+                    format(ct, len(img_ids) - ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs = records
+
+        if self.supervised:
+            logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')
+        else:
+            if self.length > 0:  # unsup length will be decide by sup length
+                # Draw `self.length` records at random, with replacement.
+                all_roidbs = self.roidbs.copy()
+                selected_idxs = [
+                    np.random.choice(len(all_roidbs))
+                    for _ in range(self.length)
+                ]
+                self.roidbs = [all_roidbs[i] for i in selected_idxs]
+            logger.info(
+                f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')
+
+    def __getitem__(self, idx):
+        """
+        Return the transformed sample at `idx`.
+
+        While mixup or cutmix is active the record is paired with one random
+        record, while mosaic is active with four. An augmentation is active
+        when its `*_epoch` is 0 or the current epoch is below it. Every
+        returned record is stamped with the running `curr_iter` counter.
+        """
+        n = len(self.roidbs)
+        if self.repeat > 1:
+            idx %= n
+        # data batch
+        roidb = copy.deepcopy(self.roidbs[idx])
+        if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
+            idx = np.random.randint(n)
+            roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
+            roidb = [roidb, ] + [
+                copy.deepcopy(self.roidbs[np.random.randint(n)])
+                for _ in range(4)
+            ]
+        if isinstance(roidb, Sequence):
+            for r in roidb:
+                r['curr_iter'] = self._curr_iter
+        else:
+            roidb['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(roidb)
diff --git a/rtdetr_paddle/ppdet/data/source/dataset.py b/rtdetr_paddle/ppdet/data/source/dataset.py
new file mode 100644
index 0000000..4f22b22
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/source/dataset.py
@@ -0,0 +1,307 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import copy
+import numpy as np
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+from paddle.io import Dataset
+from ppdet.core.workspace import register, serializable
+from ppdet.utils.download import get_dataset_path
+from ppdet.data import source
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@serializable
+class DetDataset(Dataset):
+    """
+    Base class of the detection datasets. Subclasses fill `roidbs` in
+    `parse_dataset`; samples are fetched through `__getitem__`.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        use_default_label (bool): whether to load default label list.
+        repeat (int): repeat times for dataset, use in benchmark.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 use_default_label=None,
+                 repeat=1,
+                 **kwargs):
+        super(DetDataset, self).__init__()
+        # Missing directories become '' so that os.path.join() keeps working.
+        self.dataset_dir = '' if dataset_dir is None else dataset_dir
+        self.image_dir = '' if image_dir is None else image_dir
+        self.anno_path = anno_path
+        self.data_fields = data_fields
+        self.sample_num = sample_num
+        self.use_default_label = use_default_label
+        self.repeat = repeat
+        # Bookkeeping updated by set_epoch() and __getitem__().
+        self._epoch = 0
+        self._curr_iter = 0
+
+    def __len__(self):
+        """Return the number of records multiplied by `repeat`."""
+        return self.repeat * len(self.roidbs)
+
+    def __call__(self, *args, **kwargs):
+        """Return the dataset itself; all arguments are ignored."""
+        return self
+
+    def __getitem__(self, idx):
+        """
+        Return the transformed sample at `idx`.
+
+        An augmentation is active when its `*_epoch` is 0 or the current
+        epoch is below it. Mixup and cutmix pair the record with one random
+        record, mosaic with four random records, and pre_img with the
+        neighbouring record. Every returned record is stamped with the
+        running `curr_iter` counter.
+        """
+        total = len(self.roidbs)
+        if self.repeat > 1:
+            idx %= total
+        sample = copy.deepcopy(self.roidbs[idx])
+        epoch = self._epoch
+
+        if (self.mixup_epoch == 0 or epoch < self.mixup_epoch) or \
+                (self.cutmix_epoch == 0 or epoch < self.cutmix_epoch):
+            # Mixup and cutmix both need exactly one extra random record.
+            idx = np.random.randint(total)
+            sample = [sample, copy.deepcopy(self.roidbs[idx])]
+        elif self.mosaic_epoch == 0 or epoch < self.mosaic_epoch:
+            extras = [
+                copy.deepcopy(self.roidbs[np.random.randint(total)])
+                for _ in range(4)
+            ]
+            sample = [sample] + extras
+        elif self.pre_img_epoch == 0 or epoch < self.pre_img_epoch:
+            # Add previous image as input, only used in CenterTrack; the
+            # first record falls back to the record after it.
+            neighbour = idx + 1 if idx - 1 < 0 else idx - 1
+            sample = [sample, copy.deepcopy(self.roidbs[neighbour])]
+
+        stamped = sample if isinstance(sample, Sequence) else [sample]
+        for rec in stamped:
+            rec['curr_iter'] = self._curr_iter
+        self._curr_iter += 1
+
+        return self.transform(sample)
+
+    def check_or_download_dataset(self):
+        """Replace `dataset_dir` with the path returned by get_dataset_path."""
+        self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path,
+                                            self.image_dir)
+
+    def set_kwargs(self, **kwargs):
+        """Store the augmentation epoch limits; each one defaults to -1."""
+        for key in ('mixup_epoch', 'cutmix_epoch', 'mosaic_epoch',
+                    'pre_img_epoch'):
+            setattr(self, key, kwargs.get(key, -1))
+
+    def set_transform(self, transform):
+        """Set the callable applied to every sample in `__getitem__`."""
+        self.transform = transform
+
+    def set_epoch(self, epoch_id):
+        """Record the current epoch, used to switch augmentations on/off."""
+        self._epoch = epoch_id
+
+    def parse_dataset(self):
+        """Load the records; has to be implemented by subclasses."""
+        raise NotImplementedError(
+            "Need to implement parse_dataset method of Dataset")
+
+    def get_anno(self):
+        """Return the annotation path under `dataset_dir`, or None if unset."""
+        return None if self.anno_path is None else os.path.join(
+            self.dataset_dir, self.anno_path)
+
+
+def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):
+    """Return True if path `f` ends with one of `extensions`, ignoring case."""
+    lowered = f.lower()
+    return lowered.endswith(extensions)
+
+
+def _make_dataset(dir):
+    """
+    Recursively collect the image files below a directory.
+
+    Args:
+        dir (str): directory to scan; `~` is expanded and symbolic links to
+            directories are followed.
+
+    Returns:
+        list: paths of all files accepted by `_is_valid_file`, visited in
+            sorted order.
+
+    Raises:
+        ValueError: if `dir` is not an existing directory.
+    """
+    dir = os.path.expanduser(dir)
+    if not os.path.isdir(dir):
+        # Raising a bare string is itself a TypeError in Python 3 and hides
+        # the message, so raise a real exception.
+        raise ValueError('{} should be a dir'.format(dir))
+    images = []
+    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
+        for fname in sorted(fnames):
+            path = os.path.join(root, fname)
+            if _is_valid_file(path):
+                images.append(path)
+    return images
+
+
+@register
+@serializable
+class ImageFolder(DetDataset):
+    """
+    Dataset built from image files and/or folders, without annotations.
+
+    Each record holds an `im_id` (running index) and the `im_file` path.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str|list): an image file or folder, or a list of them.
+        anno_path (str): annotation/label path; returned unchanged by
+            `get_label_list`.
+        sample_num (int): number of samples to load, -1 means all.
+        use_default_label (bool): stored on the instance by the base class.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 sample_num=-1,
+                 use_default_label=None,
+                 **kwargs):
+        super(ImageFolder, self).__init__(
+            dataset_dir,
+            image_dir,
+            anno_path,
+            sample_num=sample_num,
+            use_default_label=use_default_label)
+        self._imid2path = {}
+        self.roidbs = None
+
+    def check_or_download_dataset(self):
+        """Do nothing: image folders are never downloaded."""
+        return
+
+    def get_anno(self):
+        """Return the annotation path (under `dataset_dir` if set) or None."""
+        if self.anno_path is None:
+            return
+        if self.dataset_dir:
+            return os.path.join(self.dataset_dir, self.anno_path)
+        else:
+            return self.anno_path
+
+    def parse_dataset(self, ):
+        """Load the image records unless they have been loaded already."""
+        if not self.roidbs:
+            self.roidbs = self._load_images()
+
+    def _parse(self):
+        """Expand `image_dir` into a flat list of image file paths."""
+        image_dir = self.image_dir
+        # A plain string is itself a Sequence; wrap it so that the loop below
+        # walks over paths instead of over single characters.
+        if isinstance(image_dir, str) or not isinstance(image_dir, Sequence):
+            image_dir = [image_dir]
+        images = []
+        for im_dir in image_dir:
+            # NOTE(review): the directory test runs before `dataset_dir` is
+            # joined in, so folders are looked up relative to the working
+            # directory first -- confirm this is intended.
+            if os.path.isdir(im_dir):
+                im_dir = os.path.join(self.dataset_dir, im_dir)
+                images.extend(_make_dataset(im_dir))
+            elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
+                images.append(im_dir)
+        return images
+
+    def _load_images(self):
+        """
+        Build one record per image found by `_parse`.
+
+        Stops after `sample_num` records when `sample_num` is positive and
+        remembers the id -> path mapping in `_imid2path`.
+
+        Raises:
+            AssertionError: if a listed image is missing or none is found.
+        """
+        images = self._parse()
+        ct = 0
+        records = []
+        for image in images:
+            assert image != '' and os.path.isfile(image), \
+                    "Image {} not found".format(image)
+            if self.sample_num > 0 and ct >= self.sample_num:
+                break
+            rec = {'im_id': np.array([ct]), 'im_file': image}
+            self._imid2path[ct] = image
+            ct += 1
+            records.append(rec)
+        assert len(records) > 0, "No image file found"
+        return records
+
+    def get_imid2path(self):
+        """Return the dict mapping image id to image path."""
+        return self._imid2path
+
+    def set_images(self, images):
+        """Replace `image_dir` with `images` and reload the records."""
+        self.image_dir = images
+        self.roidbs = self._load_images()
+
+    def set_slice_images(self,
+                         images,
+                         slice_size=[640, 640],
+                         overlap_ratio=[0.25, 0.25]):
+        """
+        Replace `image_dir` with `images` and store one record per slice.
+
+        Every image is cut by `sahi.slicing.slice_image`; each slice record
+        carries the slice itself, its size, the id of the source image, the
+        slice's starting pixel and an `is_last` flag for the final slice of
+        an image.
+
+        Args:
+            images (str|list): image file(s) or folder(s) to slice.
+            slice_size (list): `[height, width]` of a slice.
+            overlap_ratio (list): `[height, width]` overlap ratio between
+                neighbouring slices.
+
+        Raises:
+            Exception: re-raised unchanged when `sahi` cannot be imported.
+        """
+        self.image_dir = images
+        ori_records = self._load_images()
+        try:
+            import sahi
+            from sahi.slicing import slice_image
+        except Exception:
+            logger.error(
+                'sahi not found, plaese install sahi. '
+                'for example: `pip install sahi`, see https://github.com/obss/sahi.'
+            )
+            raise
+
+        # NOTE(review): this offset is never advanced, so slice ids restart
+        # at 0 for every source image -- confirm that is intended.
+        sub_img_ids = 0
+        ct = 0
+        ct_sub = 0
+        records = []
+        for ori_rec in ori_records:
+            im_path = ori_rec['im_file']
+            slice_image_result = sahi.slicing.slice_image(
+                image=im_path,
+                slice_height=slice_size[0],
+                slice_width=slice_size[1],
+                overlap_height_ratio=overlap_ratio[0],
+                overlap_width_ratio=overlap_ratio[1])
+
+            sub_img_num = len(slice_image_result)
+            for _ind in range(sub_img_num):
+                im = slice_image_result.images[_ind]
+                rec = {
+                    'image': im,
+                    'im_id': np.array([sub_img_ids + _ind]),
+                    'h': im.shape[0],
+                    'w': im.shape[1],
+                    'ori_im_id': np.array([ori_rec['im_id'][0]]),
+                    'st_pix': np.array(
+                        slice_image_result.starting_pixels[_ind],
+                        dtype=np.float32),
+                    'is_last': 1 if _ind == sub_img_num - 1 else 0,
+                } if 'image' in self.data_fields else {}
+                records.append(rec)
+            ct_sub += sub_img_num
+            ct += 1
+        logger.info('{} samples and slice to {} sub_samples.'.format(ct,
+                                                                     ct_sub))
+        self.roidbs = records
+
+    def get_label_list(self):
+        """Return `anno_path`, which serves as the label list here."""
+        # Only VOC dataset needs label list in ImageFold
+        return self.anno_path
+
+
+@register
+class CommonDataset(object):
+    """
+    Wrapper that builds a dataset class from `ppdet.data.source` by name.
+
+    Args:
+        dataset_args (dict): keyword arguments; `name` selects the dataset
+            class, everything else is passed to its constructor.
+    """
+
+    def __init__(self, **dataset_args):
+        super(CommonDataset, self).__init__()
+        # Work on a private deep copy so popping "name" never touches the
+        # caller's objects.
+        params = copy.deepcopy(dataset_args)
+        dataset_cls = getattr(source, params.pop("name"))
+        self.dataset = dataset_cls(**params)
+
+    def __call__(self):
+        """Return the wrapped dataset instance."""
+        return self.dataset
+
+
+@register
+class TrainDataset(CommonDataset):
+    """CommonDataset registered under its own name; adds no behavior."""
+
+
+@register
+class EvalMOTDataset(CommonDataset):
+    """CommonDataset registered under its own name; adds no behavior."""
+
+
+@register
+class TestMOTDataset(CommonDataset):
+    """CommonDataset registered under its own name; adds no behavior."""
+
+
+@register
+class EvalDataset(CommonDataset):
+    """CommonDataset registered under its own name; adds no behavior."""
+
+
+@register
+class TestDataset(CommonDataset):
+    """CommonDataset registered under its own name; adds no behavior."""
diff --git a/rtdetr_paddle/ppdet/data/source/voc.py b/rtdetr_paddle/ppdet/data/source/voc.py
new file mode 100644
index 0000000..2f10358
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/source/voc.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+
+import xml.etree.ElementTree as ET
+
+from ppdet.core.workspace import register, serializable
+
+from .dataset import DetDataset
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@register
+@serializable
+class VOCDataSet(DetDataset):
+    """
+    Load dataset with PascalVOC format.
+
+    Notes:
+    Every non-blank line of `anno_path` must contain the image file path and
+    the xml file path of one sample, separated by whitespace.
+
+    Args:
+        dataset_dir (str): root directory for dataset.
+        image_dir (str): directory for images.
+        anno_path (str): voc annotation file path.
+        data_fields (list): key name of data dictionary, at least have 'image'.
+        sample_num (int): number of samples to load, -1 means all.
+        label_list (str): file with one category name per line; if empty,
+            the default PascalVOC mapping is used.
+        allow_empty (bool): whether to load empty entry. False as default
+        empty_ratio (float): the ratio of empty record number to total
+            record's, if empty_ratio is out of [0. ,1.), do not sample the
+            records and use all the empty entries. 1. as default
+        repeat (int): repeat times for dataset, use in benchmark.
+    """
+
+    def __init__(self,
+                 dataset_dir=None,
+                 image_dir=None,
+                 anno_path=None,
+                 data_fields=['image'],
+                 sample_num=-1,
+                 label_list=None,
+                 allow_empty=False,
+                 empty_ratio=1.,
+                 repeat=1):
+        super(VOCDataSet, self).__init__(
+            dataset_dir=dataset_dir,
+            image_dir=image_dir,
+            anno_path=anno_path,
+            data_fields=data_fields,
+            sample_num=sample_num,
+            repeat=repeat)
+        self.label_list = label_list
+        self.allow_empty = allow_empty
+        self.empty_ratio = empty_ratio
+
+    def _sample_empty(self, records, num):
+        """
+        Randomly thin out the empty records.
+
+        Args:
+            records (list): records without any object.
+            num (int): number of non-empty records.
+
+        Returns:
+            list: all of `records` if `empty_ratio` is outside [0., 1.),
+                otherwise a random sample sized so that empty records make
+                up about `empty_ratio` of the total.
+        """
+        # if empty_ratio is out of [0. ,1.), do not sample the records
+        if self.empty_ratio < 0. or self.empty_ratio >= 1.:
+            return records
+        import random
+        sample_num = min(
+            int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
+        records = random.sample(records, sample_num)
+        return records
+
+    def parse_dataset(self, ):
+        """
+        Read the annotation list and every xml file it references, then fill
+        `self.roidbs` and `self.cname2cid`.
+
+        Samples with a missing image/xml file or a negative size are skipped
+        with a warning. Boxes are clipped to the image and invalid ones are
+        dropped. Blank lines in the annotation list are ignored.
+
+        Raises:
+            ValueError: if `label_list` is given but the file does not exist.
+            AssertionError: if no sample could be loaded.
+        """
+        anno_path = os.path.join(self.dataset_dir, self.anno_path)
+        image_dir = os.path.join(self.dataset_dir, self.image_dir)
+
+        # mapping category name to class id
+        # first_class:0, second_class:1, ...
+        records = []
+        empty_records = []
+        ct = 0
+        cname2cid = {}
+        if self.label_list:
+            label_path = os.path.join(self.dataset_dir, self.label_list)
+            if not os.path.exists(label_path):
+                raise ValueError("label_list {} does not exists".format(
+                    label_path))
+            with open(label_path, 'r') as fr:
+                for label_id, line in enumerate(fr):
+                    cname2cid[line.strip()] = label_id
+        else:
+            cname2cid = pascalvoc_label()
+
+        with open(anno_path, 'r') as fr:
+            for line in fr:
+                fields = line.strip().split()[:2]
+                if not fields:
+                    # Blank line (e.g. at the end of the file): nothing to
+                    # load, and unpacking it below would fail.
+                    continue
+                img_file, xml_file = [
+                    os.path.join(image_dir, x) for x in fields
+                ]
+                if not os.path.exists(img_file):
+                    logger.warning(
+                        'Illegal image file: {}, and it will be ignored'.format(
+                            img_file))
+                    continue
+                if not os.path.isfile(xml_file):
+                    logger.warning(
+                        'Illegal xml file: {}, and it will be ignored'.format(
+                            xml_file))
+                    continue
+                tree = ET.parse(xml_file)
+                # Fall back to the running counter when the xml has no id.
+                id_node = tree.find('id')
+                if id_node is None:
+                    im_id = np.array([ct])
+                else:
+                    im_id = np.array([int(id_node.text)])
+
+                objs = tree.findall('object')
+                size_node = tree.find('size')
+                im_w = float(size_node.find('width').text)
+                im_h = float(size_node.find('height').text)
+                if im_w < 0 or im_h < 0:
+                    logger.warning(
+                        'Illegal width: {} or height: {} in annotation, '
+                        'and {} will be ignored'.format(im_w, im_h, xml_file))
+                    continue
+
+                # Arrays are allocated for all objects and trimmed to the
+                # `i` valid ones after the loop.
+                num_bbox, i = len(objs), 0
+                gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
+                gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
+                gt_score = np.zeros((num_bbox, 1), dtype=np.float32)
+                difficult = np.zeros((num_bbox, 1), dtype=np.int32)
+                for obj in objs:
+                    cname = obj.find('name').text
+
+                    # user dataset may not contain difficult field
+                    _difficult = obj.find('difficult')
+                    _difficult = int(
+                        _difficult.text) if _difficult is not None else 0
+
+                    bndbox = obj.find('bndbox')
+                    x1 = float(bndbox.find('xmin').text)
+                    y1 = float(bndbox.find('ymin').text)
+                    x2 = float(bndbox.find('xmax').text)
+                    y2 = float(bndbox.find('ymax').text)
+                    # Clip the box to the image.
+                    x1 = max(0, x1)
+                    y1 = max(0, y1)
+                    x2 = min(im_w - 1, x2)
+                    y2 = min(im_h - 1, y2)
+                    if x2 > x1 and y2 > y1:
+                        gt_bbox[i, :] = [x1, y1, x2, y2]
+                        gt_class[i, 0] = cname2cid[cname]
+                        gt_score[i, 0] = 1.
+                        difficult[i, 0] = _difficult
+                        i += 1
+                    else:
+                        logger.warning(
+                            'Found an invalid bbox in annotations: xml_file: {}'
+                            ', x1: {}, y1: {}, x2: {}, y2: {}.'.format(
+                                xml_file, x1, y1, x2, y2))
+                gt_bbox = gt_bbox[:i, :]
+                gt_class = gt_class[:i, :]
+                gt_score = gt_score[:i, :]
+                difficult = difficult[:i, :]
+
+                voc_rec = {
+                    'im_file': img_file,
+                    'im_id': im_id,
+                    'h': im_h,
+                    'w': im_w
+                } if 'image' in self.data_fields else {}
+
+                gt_rec = {
+                    'gt_class': gt_class,
+                    'gt_score': gt_score,
+                    'gt_bbox': gt_bbox,
+                    'difficult': difficult
+                }
+                # Only the requested ground-truth fields go into the record.
+                for k, v in gt_rec.items():
+                    if k in self.data_fields:
+                        voc_rec[k] = v
+
+                if len(objs) == 0:
+                    empty_records.append(voc_rec)
+                else:
+                    records.append(voc_rec)
+
+                ct += 1
+                if self.sample_num > 0 and ct >= self.sample_num:
+                    break
+        assert ct > 0, 'not found any voc record in %s' % (self.anno_path)
+        logger.debug('{} samples in file {}'.format(ct, anno_path))
+        if self.allow_empty and len(empty_records) > 0:
+            empty_records = self._sample_empty(empty_records, len(records))
+            records += empty_records
+        self.roidbs, self.cname2cid = records, cname2cid
+
+    def get_label_list(self):
+        """Return the path of the label list file under `dataset_dir`."""
+        return os.path.join(self.dataset_dir, self.label_list)
+
+
+def pascalvoc_label():
+    """Return a new dict mapping the 20 PascalVOC class names to ids 0-19."""
+    class_names = (
+        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+        'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
+        'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
+    )
+    # The position in the tuple is the class id.
+    return {name: class_id for class_id, name in enumerate(class_names)}
diff --git a/rtdetr_paddle/ppdet/data/transform/__init__.py b/rtdetr_paddle/ppdet/data/transform/__init__.py
new file mode 100644
index 0000000..0b71513
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/transform/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import operators
+from . import batch_operators
+
+
+from .operators import *
+from .batch_operators import *
+
+
+# Export every name collected in `registered_ops` (made available by the
+# star imports above) as this package's public API.
+__all__ = list(registered_ops)
+
diff --git a/rtdetr_paddle/ppdet/data/transform/batch_operators.py b/rtdetr_paddle/ppdet/data/transform/batch_operators.py
new file mode 100644
index 0000000..c381382
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/transform/batch_operators.py
@@ -0,0 +1,322 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import typing
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+import cv2
+import numpy as np
+from .operators import register_op, BaseOperator, Resize
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'PadBatch',
+    'BatchRandomResize',
+    'PadGT',
+]
+
+
+@register_op
+class PadBatch(BaseOperator):
+    """
+    Zero-pad every image of a batch to one common spatial size.
+    The layout of each image should be 'CHW'.
+    Args:
+        pad_to_stride (int): If `pad_to_stride > 0`, the common height and
+            width are rounded up to a multiple of `pad_to_stride`.
+    """
+
+    def __init__(self, pad_to_stride=0):
+        super(PadBatch, self).__init__()
+        self.pad_to_stride = pad_to_stride
+
+    def __call__(self, samples, context=None):
+        """
+        Pad `image` (plus `semantic` / `gt_segm` when present) in place.
+        Args:
+            samples (list): a batch of samples, each is dict. For a nested
+                (multi scale) list only `samples[0]` is padded.
+        Returns:
+            the `samples` object that was passed in.
+        """
+        stride = self.pad_to_stride
+
+        # multi scale input is a nested list: work on its first entry
+        nested = (isinstance(samples, typing.Sequence) and len(samples) > 0 and
+                  isinstance(samples[0], typing.Sequence))
+        batch = samples[0] if nested else samples
+
+        # per-axis maximum over the (C, H, W) shapes of all images
+        max_shape = np.array(
+            [item['image'].shape for item in batch]).max(axis=0)
+        if stride > 0:
+            for axis in (1, 2):
+                max_shape[axis] = int(
+                    np.ceil(max_shape[axis] / stride) * stride)
+        pad_h, pad_w = max_shape[1], max_shape[2]
+
+        for item in batch:
+            image = item['image']
+            im_c, im_h, im_w = image.shape[:]
+            canvas = np.zeros((im_c, pad_h, pad_w), dtype=np.float32)
+            canvas[:, :im_h, :im_w] = image  # content stays at the top-left
+            item['image'] = canvas
+
+            if 'semantic' in item and item['semantic'] is not None:
+                sem_canvas = np.zeros((1, pad_h, pad_w), dtype=np.float32)
+                sem_canvas[:, :im_h, :im_w] = item['semantic']
+                item['semantic'] = sem_canvas
+
+            if 'gt_segm' in item and item['gt_segm'] is not None:
+                segm = item['gt_segm']
+                segm_canvas = np.zeros(
+                    (segm.shape[0], pad_h, pad_w), dtype=np.uint8)
+                segm_canvas[:, :im_h, :im_w] = segm
+                item['gt_segm'] = segm_canvas
+
+        return samples
+
+
+@register_op
+class BatchRandomResize(BaseOperator):
+    """
+    Resize a whole batch with one target size and one interpolation method,
+    each of them optionally drawn at random on every call.
+    Args:
+        target_size (int, list, tuple): image target size; has to be a list
+            of candidate sizes when `random_size` is True
+        keep_ratio (bool): passed on to `Resize` as `keep_ratio`
+        interp (int): interpolation method used when `random_interp` is False
+        random_size (bool): whether to pick the target size at random
+        random_interp (bool): whether to pick the interpolation at random
+    """
+
+    def __init__(self, target_size, keep_ratio, interp=cv2.INTER_NEAREST,
+                 random_size=True, random_interp=False):
+        super(BatchRandomResize, self).__init__()
+        assert isinstance(target_size, (
+            int, Sequence)), "target_size must be int, list or tuple"
+        if random_size and not isinstance(target_size, list):
+            raise TypeError(
+                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
+                format(type(target_size)))
+        self.target_size = target_size
+        self.keep_ratio = keep_ratio
+        self.random_size = random_size
+        # fixed method, used unless `random_interp` is set
+        self.interp = interp
+        self.random_interp = random_interp
+        # candidate pool for the random interpolation choice
+        self.interps = [
+            cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA,
+            cv2.INTER_CUBIC, cv2.INTER_LANCZOS4,
+        ]
+
+    def __call__(self, samples, context=None):
+        """
+        Build a `Resize` op with the (possibly random) size/interp and apply it.
+        """
+        # NOTE: the size is drawn before the interpolation; keep this order so
+        # results stay reproducible for a given numpy random seed
+        if self.random_size:
+            target_size = self.target_size[np.random.choice(
+                len(self.target_size))]
+        else:
+            target_size = self.target_size
+        interp = (np.random.choice(self.interps)
+                  if self.random_interp else self.interp)
+
+        return Resize(
+            target_size, keep_ratio=self.keep_ratio,
+            interp=interp)(samples, context=context)
+
+
+@register_op
+class PadGT(BaseOperator):
+    """
+    Zero-pad the per-box annotations (`gt_class`, `gt_bbox`, `gt_score`...)
+    of every sample to one common number of boxes: the largest box count
+    of the batch, but at least `minimum_gtnum`.
+    Args:
+        return_gt_mask (bool): If true, also store `pad_gt_mask`,
+            1 means bbox, 0 means no bbox.
+        pad_img (bool): If true, pad every `image` at its bottom/right up
+            to the largest `im_shape` of the batch.
+        minimum_gtnum (int): lower bound of the padded number of boxes.
+    """
+
+    def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0):
+        super(PadGT, self).__init__()
+        self.minimum_gtnum = minimum_gtnum
+        self.pad_img = pad_img
+        self.return_gt_mask = return_gt_mask
+
+    def _impad(self, img: np.ndarray, *, shape=None, padding=None, pad_val=0,
+               padding_mode='constant') -> np.ndarray:
+        """Pad an image up to a target shape, or by explicit border widths.
+
+        Exactly one of `shape` and `padding` has to be given.
+
+        Args:
+            img (ndarray): Image to be padded.
+            shape (tuple[int]): Target shape (h, w). The image is extended
+                at its right and bottom sides only and is never shrunk.
+                Default: None.
+            padding (int or tuple[int]): Border widths. A single int pads
+                all four borders alike; a tuple of length 2 holds the
+                left/right and the top/bottom width; a tuple of length 4
+                holds the left, top, right and bottom width. Default: None.
+            pad_val (Number | tuple[Number]): Fill value of the 'constant'
+                mode; a tuple needs one entry per element of the last image
+                axis. Default: 0.
+            padding_mode (str): One of constant, edge, reflect and
+                symmetric. Default: constant.
+                - constant: fills with `pad_val`.
+                - edge: repeats the outermost value of the image.
+                - reflect: mirrors the image without repeating the value on
+                the edge, e.g. padding [1, 2, 3, 4] with 2 elements on both
+                sides gives [3, 2, 1, 2, 3, 4, 3, 2].
+                - symmetric: mirrors the image and repeats the value on the
+                edge, e.g. padding [1, 2, 3, 4] with 2 elements on both
+                sides gives [2, 1, 1, 2, 3, 4, 4, 3]
+
+        Returns:
+            ndarray: The padded image.
+
+        Raises:
+            TypeError: `pad_val` is neither a number nor a tuple.
+            ValueError: `padding` is no number and no tuple of length 2 or 4.
+        """
+        import numbers
+
+        assert (shape is not None) ^ (padding is not None)
+        if shape is not None:
+            # only grow: a target smaller than the image adds no border
+            extra_w = max(shape[1] - img.shape[1], 0)
+            extra_h = max(shape[0] - img.shape[0], 0)
+            padding = (0, 0, int(extra_w), int(extra_h))
+
+        # validate the fill value
+        if isinstance(pad_val, tuple):
+            assert len(pad_val) == img.shape[-1]
+        elif not isinstance(pad_val, numbers.Number):
+            raise TypeError('pad_val must be a int or a tuple. '
+                            f'But received {type(pad_val)}')
+
+        # bring `padding` into the (left, top, right, bottom) form
+        if isinstance(padding, numbers.Number):
+            padding = (padding, padding, padding, padding)
+        elif isinstance(padding, tuple) and len(padding) == 2:
+            padding = (padding[0], padding[1], padding[0], padding[1])
+        elif not (isinstance(padding, tuple) and len(padding) == 4):
+            raise ValueError('Padding must be a int or a 2, or 4 element tuple.'
+                             f'But received {padding}')
+
+        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+        cv2_borders = {
+            'constant': cv2.BORDER_CONSTANT,
+            'edge': cv2.BORDER_REPLICATE,
+            'reflect': cv2.BORDER_REFLECT_101,
+            'symmetric': cv2.BORDER_REFLECT
+        }
+        left, top, right, bottom = padding
+        return cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2_borders[padding_mode],
+            value=pad_val)
+
+    def checkmaxshape(self, samples):
+        """Return (max height, max width) over the `im_shape` of all samples."""
+        max_h, max_w = 0, 0
+        for sample in samples:
+            h, w = sample['im_shape']
+            max_h = max(max_h, h)
+            max_w = max(max_w, w)
+        return (max_h, max_w)
+
+    def __call__(self, samples, context=None):
+        """
+        Pad the annotations (and optionally the images) of `samples` in place.
+        Args:
+            samples (list): a batch of samples, each is a dict with at
+                least `gt_bbox`.
+        Returns:
+            list: the same `samples`.
+        """
+        num_max_boxes = max(self.minimum_gtnum,
+                            max([len(s['gt_bbox']) for s in samples]))
+        maxshape = self.checkmaxshape(samples) if self.pad_img else None
+
+        # (key, dtype) of the optional single-column annotations
+        column_specs = (('gt_score', np.float32), ('is_crowd', np.int32),
+                        ('difficult', np.int32))
+
+        for sample in samples:
+            if self.pad_img:
+                sample['image'] = self._impad(sample['image'], shape=maxshape)
+            if self.return_gt_mask:
+                sample['pad_gt_mask'] = np.zeros(
+                    (num_max_boxes, 1), dtype=np.float32)
+            if num_max_boxes == 0:  # no box in the batch: nothing to pad
+                continue
+
+            num_gt = len(sample['gt_bbox'])
+            has_gt = num_gt > 0
+
+            # class ids and boxes are always padded
+            padded_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
+            padded_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
+            if has_gt:
+                padded_class[:num_gt] = sample['gt_class']
+                padded_bbox[:num_gt] = sample['gt_bbox']
+            sample['gt_class'] = padded_class
+            sample['gt_bbox'] = padded_bbox
+
+            # flag the rows that hold a real box
+            if 'pad_gt_mask' in sample:
+                sample['pad_gt_mask'][:num_gt] = 1
+
+            for key, dtype in column_specs:
+                if key not in sample:
+                    continue
+                padded = np.zeros((num_max_boxes, 1), dtype=dtype)
+                if has_gt:
+                    padded[:num_gt] = sample[key]
+                sample[key] = padded
+
+            if 'gt_joints' in sample:
+                num_joints = sample['gt_joints'].shape[1]
+                padded_joints = np.zeros(
+                    (num_max_boxes, num_joints, 3), dtype=np.float32)
+                if has_gt:
+                    padded_joints[:num_gt] = sample['gt_joints']
+                sample['gt_joints'] = padded_joints
+
+            if 'gt_areas' in sample:
+                padded_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
+                if has_gt:
+                    padded_areas[:num_gt, 0] = sample['gt_areas']
+                sample['gt_areas'] = padded_areas
+        return samples
+
+
+
diff --git a/rtdetr_paddle/ppdet/data/transform/op_helper.py b/rtdetr_paddle/ppdet/data/transform/op_helper.py
new file mode 100644
index 0000000..6c40030
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/transform/op_helper.py
@@ -0,0 +1,494 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# this file contains helper methods for BBOX processing
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import random
+import math
+import cv2
+
+
+def meet_emit_constraint(src_bbox, sample_bbox):
+    """
+    Tell whether the center of `src_bbox` lies inside `sample_bbox`
+    (borders included). Boxes are [xmin, ymin, xmax, ymax].
+    """
+    center_x = (src_bbox[2] + src_bbox[0]) / 2
+    center_y = (src_bbox[3] + src_bbox[1]) / 2
+    inside_x = sample_bbox[0] <= center_x <= sample_bbox[2]
+    return bool(inside_x and sample_bbox[1] <= center_y <= sample_bbox[3])
+
+
+def clip_bbox(src_bbox):
+    """Clamp the four coordinates of `src_bbox` to [0, 1] in place; return it."""
+    for idx in range(4):
+        # upper bound first, then lower bound
+        src_bbox[idx] = max(min(src_bbox[idx], 1.0), 0.0)
+    return src_bbox
+
+
+def bbox_area(src_bbox):
+    """
+    Area of an [xmin, ymin, xmax, ymax] box; 0. when xmax < xmin or ymax < ymin.
+    """
+    if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]:
+        return 0.
+    return (src_bbox[2] - src_bbox[0]) * (src_bbox[3] - src_bbox[1])
+
+
+def is_overlap(object_bbox, sample_bbox):
+    """
+    Tell whether two [xmin, ymin, xmax, ymax] boxes overlap; boxes that
+    only touch along a border do not count as overlapping.
+    """
+    apart_x = object_bbox[0] >= sample_bbox[2] or object_bbox[2] <= sample_bbox[0]
+    apart_y = object_bbox[1] >= sample_bbox[3] or object_bbox[3] <= sample_bbox[1]
+    return not (apart_x or apart_y)
+
+
+def filter_and_process(sample_bbox, bboxes, labels, scores=None,
+                       keypoints=None):
+    """
+    Keep the boxes whose center lies in `sample_bbox` and re-express them,
+    with their keypoints, relative to that window (clipped to [0, 1]).
+    `keypoints`, an optional (coords, ignore) pair, is never modified.
+    Returns (bboxes, labels, scores[, (keypoints, ignore)]) as ndarrays.
+    """
+    new_bboxes, new_labels, new_scores = [], [], []
+    new_keypoints, new_kp_ignore = [], []
+    for i, bbox in enumerate(bboxes):
+        obj_bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
+        if not (meet_emit_constraint(obj_bbox, sample_bbox) and
+                is_overlap(obj_bbox, sample_bbox)):
+            continue
+        sample_width = sample_bbox[2] - sample_bbox[0]
+        sample_height = sample_bbox[3] - sample_bbox[1]
+        new_bbox = clip_bbox([
+            (obj_bbox[0] - sample_bbox[0]) / sample_width,
+            (obj_bbox[1] - sample_bbox[1]) / sample_height,
+            (obj_bbox[2] - sample_bbox[0]) / sample_width,
+            (obj_bbox[3] - sample_bbox[1]) / sample_height
+        ])
+        if not bbox_area(new_bbox) > 0:
+            continue
+        new_bboxes.append(new_bbox)
+        new_labels.append([labels[i][0]])
+        if scores is not None:
+            new_scores.append([scores[i][0]])
+        if keypoints is not None:
+            # Copy first: rescaling `keypoints[0][i]` itself would corrupt the
+            # caller's data for any later call on the same annotations.
+            sample_keypoint = keypoints[0][i].copy()
+            for j in range(len(sample_keypoint)):
+                # even positions are x (scaled by width), odd ones y (height)
+                kp_len = sample_height if j % 2 else sample_width
+                sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0]
+                sample_keypoint[j] = (sample_keypoint[j] - sample_coord) / kp_len
+                sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0)
+            new_keypoints.append(sample_keypoint)
+            new_kp_ignore.append(keypoints[1][i])
+    result = np.array(new_bboxes), np.array(new_labels), np.array(new_scores)
+    if keypoints is None:
+        return result
+    return result + ((np.array(new_keypoints), np.array(new_kp_ignore)), )
+
+
+def bbox_area_sampling(bboxes, labels, scores, target_size, min_size):
+    """
+    Drop the boxes whose area, once scaled by `target_size`, is below
+    `min_size * min_size`; labels and (non-empty) scores follow along.
+    Returns the surviving (bboxes, labels, scores) as ndarrays.
+    """
+    kept_bboxes, kept_labels, kept_scores = [], [], []
+    for idx, box in enumerate(bboxes):
+        scaled_w = float((box[2] - box[0]) * target_size)
+        scaled_h = float((box[3] - box[1]) * target_size)
+        if scaled_w * scaled_h < float(min_size * min_size):
+            continue  # too small
+        kept_bboxes.append(box)
+        kept_labels.append(labels[idx])
+        if scores is not None and scores.size != 0:
+            kept_scores.append(scores[idx])
+    return (np.array(kept_bboxes), np.array(kept_labels),
+            np.array(kept_scores))
+
+
+def generate_sample_bbox(sampler):
+    """
+    Draw a random box [xmin, ymin, xmax, ymax]; its scale is drawn from
+    sampler[2:4] and its aspect ratio from sampler[4:6].
+    """
+    scale = np.random.uniform(sampler[2], sampler[3])
+    ratio = np.random.uniform(sampler[4], sampler[5])
+    # keep the ratio within [scale^2, 1 / scale^2]
+    ratio = min(max(ratio, (scale**2.0)), 1 / (scale**2.0))
+    root = ratio**0.5
+    box_w, box_h = scale * root, scale / root
+    # top-left corner, drawn from [0, 1 - side)
+    xmin = np.random.uniform(0, 1 - box_w)
+    ymin = np.random.uniform(0, 1 - box_h)
+    return [xmin, ymin, xmin + box_w, ymin + box_h]
+
+
+def generate_sample_bbox_square(sampler, image_width, image_height):
+    """
+    Draw a random normalized box [xmin, ymin, xmax, ymax] that is square in
+    pixels: the side along the longer image dimension is shrunk by the
+    image aspect ratio. Scale comes from sampler[2:4], ratio from sampler[4:6].
+    """
+    scale = np.random.uniform(sampler[2], sampler[3])
+    ratio = np.random.uniform(sampler[4], sampler[5])
+    ratio = min(max(ratio, (scale**2.0)), 1 / (scale**2.0))
+    root = ratio**0.5
+    box_w, box_h = scale * root, scale / root
+    if image_height < image_width:
+        box_w = box_h * image_height / image_width
+    else:
+        box_h = box_w * image_width / image_height
+    # top-left corner, then the opposite corner
+    xmin = np.random.uniform(0, 1 - box_w)
+    ymin = np.random.uniform(0, 1 - box_h)
+    return [xmin, ymin, xmin + box_w, ymin + box_h]
+
+
+def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
+                         resize_width):
+    """
+    Build a random square sample window around one randomly picked box.
+
+    The window side is `box width * resize_width / s`, where `s` is drawn
+    between half and twice one entry of `scale_array`. That entry is picked
+    at random among the entries up to the one matched to the box area.
+
+    Args:
+        bbox_labels: boxes whose first four items are the normalized
+            xmin, ymin, xmax and ymax.
+        image_width, image_height: image size the boxes are scaled by.
+        scale_array: candidate scales; presumably sorted in ascending
+            order (TODO confirm against the caller).
+        resize_width: factor of the window side, see above.
+
+    Returns:
+        list: window [xmin, ymin, xmax, ymax] normalized by the image size,
+        or 0 when `bbox_labels` is empty.
+    """
+    num_gt = len(bbox_labels)
+    if num_gt == 0:
+        return 0
+
+    # np.random.randint range: [low, high)
+    picked = bbox_labels[np.random.randint(0, num_gt)]
+    norm_xmin, norm_ymin = picked[0], picked[1]
+    norm_xmax, norm_ymax = picked[2], picked[3]
+
+    # the picked box in pixels
+    xmin = norm_xmin * image_width
+    ymin = norm_ymin * image_height
+    wid = image_width * (norm_xmax - norm_xmin)
+    hei = image_height * (norm_ymax - norm_ymin)
+    area = wid * hei
+
+    # match the box area to a pair of neighbouring scales
+    range_size = 0
+    for scale_ind in range(0, len(scale_array) - 1):
+        lower, upper = scale_array[scale_ind], scale_array[scale_ind + 1]
+        if area > lower**2 and area < upper**2:
+            range_size = scale_ind + 1
+            break
+    if area > scale_array[len(scale_array) - 2]**2:
+        range_size = len(scale_array) - 2
+
+    if range_size == 0:
+        rand_idx_size = 0
+    else:
+        # np.random.randint range: [low, high)
+        rand_idx_size = np.random.randint(0, range_size + 1) % (range_size + 1)
+
+    anchor = scale_array[rand_idx_size]
+    min_resize_val = anchor / 2.0
+    max_resize_val = 2.0 * anchor
+    if rand_idx_size == range_size:
+        # the matched entry itself: also cap the upper bound by the box size
+        max_resize_val = min(max_resize_val, 2 * math.sqrt(wid * hei))
+    scale_choose = random.uniform(min_resize_val, max_resize_val)
+    sample_bbox_size = wid * resize_width / scale_choose
+
+    if sample_bbox_size < max(image_height, image_width):
+        # window smaller than the longer image side: place it relative to the box
+        x_far = xmin + wid - sample_bbox_size
+        y_far = ymin + hei - sample_bbox_size
+        if wid <= sample_bbox_size:
+            w_off_orig = np.random.uniform(x_far, xmin)
+        else:
+            w_off_orig = np.random.uniform(xmin, x_far)
+        if hei <= sample_bbox_size:
+            h_off_orig = np.random.uniform(y_far, ymin)
+        else:
+            h_off_orig = np.random.uniform(ymin, y_far)
+    else:
+        w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0)
+        h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0)
+
+    # top-left corner: snapped down to whole pixels, then normalized
+    w_off = float(math.floor(w_off_orig) / image_width)
+    h_off = float(math.floor(h_off_orig) / image_height)
+    return [w_off, h_off, w_off + float(sample_bbox_size / image_width),
+            h_off + float(sample_bbox_size / image_height)]
+
+
+def jaccard_overlap(sample_bbox, object_bbox):
+    """
+    Intersection-over-union of two [xmin, ymin, xmax, ymax] boxes.
+    Returns 0 for boxes that are apart or only touch.
+    """
+    sb, ob = sample_bbox, object_bbox
+    if sb[0] >= ob[2] or sb[2] <= ob[0] or \
+        sb[1] >= ob[3] or sb[3] <= ob[1]:
+        return 0
+
+    # side lengths of the intersection rectangle
+    inter_w = min(sb[2], ob[2]) - max(sb[0], ob[0])
+    inter_h = min(sb[3], ob[3]) - max(sb[1], ob[1])
+    intersect_size = inter_w * inter_h
+    # union = both areas minus the part counted twice
+    union_size = bbox_area(sb) + bbox_area(ob) - intersect_size
+    return intersect_size / union_size
+
+
+def intersect_bbox(bbox1, bbox2):
+    """
+    Intersection of two [xmin, ymin, xmax, ymax] boxes, or
+    [0.0, 0.0, 0.0, 0.0] when the boxes are apart.
+    """
+    if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \
+        bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]:
+        return [0.0, 0.0, 0.0, 0.0]
+    lows = [max(bbox1[k], bbox2[k]) for k in (0, 1)]
+    return lows + [min(bbox1[k], bbox2[k]) for k in (2, 3)]
+
+
+def bbox_coverage(bbox1, bbox2):
+    """
+    Fraction of `bbox1`'s area that is covered by `bbox2`; 0. when the
+    intersection has no area.
+    """
+    intersect_size = bbox_area(intersect_bbox(bbox1, bbox2))
+    if not intersect_size > 0:
+        return 0.
+    return intersect_size / bbox_area(bbox1)
+
+
+def satisfy_sample_constraint(sampler,
+                              sample_bbox,
+                              gt_bboxes,
+                              satisfy_all=False):
+    """
+    Check `sample_bbox` against the IoU limits of `sampler`: sampler[6] is
+    the minimum and sampler[7] the maximum; a limit of 0 is disabled.
+
+    With `satisfy_all=False` one complying gt box is enough, otherwise every
+    gt box has to comply. Always True when both limits are 0.
+    """
+    if sampler[6] == 0 and sampler[7] == 0:
+        return True
+    satisfied = []
+    for gt_bbox in gt_bboxes:
+        object_bbox = [gt_bbox[0], gt_bbox[1], gt_bbox[2], gt_bbox[3]]
+        overlap = jaccard_overlap(sample_bbox, object_bbox)
+        too_low = sampler[6] != 0 and overlap < sampler[6]
+        too_high = not too_low and sampler[7] != 0 and overlap > sampler[7]
+        complies = not (too_low or too_high)
+        satisfied.append(complies)
+        if complies and not satisfy_all:
+            # a single complying box settles the "any" mode
+            return True
+
+    if satisfy_all:
+        return np.all(satisfied)
+    return False
+
+
+def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):
+    """
+    Check `sample_bbox` against the IoU limits (sampler[6] min, sampler[7]
+    max) and the object-coverage limits (sampler[8] min, sampler[9] max) of
+    `sampler`. A limit of 0 is disabled; a pair of zeros disables the whole
+    test it belongs to, and with both tests disabled the result is True.
+
+    Returns:
+        bool: whether some gt box was accepted.
+        NOTE(review): once a gt box passes the IoU test the result stays
+        True even if its coverage test fails -- confirm this is intended.
+    """
+    has_jaccard_overlap = not (sampler[6] == 0 and sampler[7] == 0)
+    has_object_coverage = not (sampler[8] == 0 and sampler[9] == 0)
+    if not has_jaccard_overlap and not has_object_coverage:
+        return True
+
+    found = False
+    for gt_bbox in gt_bboxes:
+        object_bbox = [gt_bbox[0], gt_bbox[1], gt_bbox[2], gt_bbox[3]]
+        if has_jaccard_overlap:
+            overlap = jaccard_overlap(sample_bbox, object_bbox)
+            # outside an enabled IoU limit: move on to the next gt box
+            if (sampler[6] != 0 and overlap < sampler[6]) or \
+                    (sampler[7] != 0 and overlap > sampler[7]):
+                continue
+            found = True
+        if has_object_coverage:
+            object_coverage = bbox_coverage(object_bbox, sample_bbox)
+            # outside an enabled coverage limit: move on to the next gt box
+            if (sampler[8] != 0 and object_coverage < sampler[8]) or \
+                    (sampler[9] != 0 and object_coverage > sampler[9]):
+                continue
+            found = True
+        # reached only when this gt box passed every enabled test
+        if found:
+            return True
+    return found
+
+
+def crop_image_sampling(img, sample_bbox, image_width, image_height,
+                        target_size):
+    """
+    Cut the window `sample_bbox` out of `img` and resize the cut-out to
+    `target_size` x `target_size` with INTER_AREA interpolation.
+
+    The window holds normalized [xmin, ymin, xmax, ymax] and is not
+    clipped: window parts outside the image stay zero in the cut-out.
+
+    NOTE(review): the cut-out buffer is (height, width, 3), so `img` is
+    presumably an HWC image with 3 channels -- confirm against the caller.
+    """
+    # window in pixels (no clipping here)
+    xmin = int(sample_bbox[0] * image_width)
+    xmax = int(sample_bbox[2] * image_width)
+    ymin = int(sample_bbox[1] * image_height)
+    ymax = int(sample_bbox[3] * image_height)
+    width, height = xmax - xmin, ymax - ymin
+
+    # "cross": the part of the window that lies on the image
+    cross_xmin = max(0.0, float(xmin))
+    cross_ymin = max(0.0, float(ymin))
+    cross_xmax = min(float(xmin + width - 1.0), float(image_width))
+    cross_ymax = min(float(ymin + height - 1.0), float(image_height))
+    cross_width = cross_xmax - cross_xmin
+    cross_height = cross_ymax - cross_ymin
+    cross_x1, cross_y1 = int(cross_xmin), int(cross_ymin)
+    cross_x2 = int(cross_xmin + cross_width)
+    cross_y2 = int(cross_ymin + cross_height)
+
+    # "roi": where that part sits inside the cut-out
+    roi_x1 = int(0 if xmin >= 0 else abs(xmin))
+    roi_y1 = int(0 if ymin >= 0 else abs(ymin))
+    roi_x2 = int(roi_x1 + cross_width)
+    roi_y2 = int(roi_y1 + cross_height)
+
+    sample_img = np.zeros((height, width, 3))
+    sample_img[roi_y1:roi_y2, roi_x1:roi_x2] = \
+        img[cross_y1:cross_y2, cross_x1:cross_x2]
+
+    return cv2.resize(
+        sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA)
+
+
+def is_poly(segm):  # True for list-type segm, False for dict-type; others assert
+    is_list = isinstance(segm, list)
+    assert is_list or isinstance(segm, dict), "Invalid segm type: {}".format(type(segm))
+    return is_list
+
+
+def gaussian_radius(bbox_size, min_overlap):
+    """
+    Smallest of three candidate radii for a box of size (height, width);
+    each candidate stems from one quadratic in the box size and `min_overlap`.
+    """
+    height, width = bbox_size
+
+    def _disc_root(a, b, c):
+        # square root of the discriminant b^2 - 4ac
+        return np.sqrt(b**2 - 4 * a * c)
+
+    b1 = (height + width)
+    c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
+    b2 = 2 * (height + width)
+    c2 = (1 - min_overlap) * width * height
+    a3 = 4 * min_overlap
+    b3 = -2 * min_overlap * (height + width)
+    c3 = (min_overlap - 1) * width * height
+    return min((b1 + _disc_root(1, b1, c1)) / (2 * 1),
+               (b2 + _disc_root(4, b2, c2)) / 2,
+               (b3 + _disc_root(a3, b3, c3)) / 2)
+
+
+def draw_gaussian(heatmap, center, radius, k=1, delte=6):
+    """Max-merge a gaussian blob centred at `center` (x, y) into `heatmap` in place."""
+    diameter = 2 * radius + 1
+    sigma = diameter / delte
+    blob = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)
+    x, y = center
+    height, width = heatmap.shape[0:2]
+    # reach of the blob on each side, cut off at the heatmap border
+    left = min(x, radius)
+    right = min(width - x, radius + 1)
+    top = min(y, radius)
+    bottom = min(height - y, radius + 1)
+    target = heatmap[y - top:y + bottom, x - left:x + right]
+    visible = blob[radius - top:radius + bottom,
+                   radius - left:radius + right]
+    np.maximum(target, visible * k, out=target)
+
+
+def gaussian2D(shape, sigma_x=1, sigma_y=1):
+    """2D gaussian sampled on a `shape`-sized grid centred at 0; tiny values are zeroed."""
+    half_rows, half_cols = [(side - 1.) / 2. for side in shape]
+    y, x = np.ogrid[-half_rows:half_rows + 1, -half_cols:half_cols + 1]
+    exponent = x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y)
+    h = np.exp(-exponent)
+    h[h < np.finfo(h.dtype).eps * h.max()] = 0  # drop values below eps * max
+    return h
+
+
+def draw_umich_gaussian(heatmap, center, radius, k=1):
+    """
+    draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126
+
+    Max-merges a gaussian blob (sigma = diameter / 6) centred at the integer
+    part of `center` (x, y) into `heatmap` in place, then returns `heatmap`.
+    """
+    diameter = 2 * radius + 1
+    sigma = diameter / 6
+    blob = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)
+    x, y = int(center[0]), int(center[1])
+    height, width = heatmap.shape[0:2]
+    # reach of the blob on each side, cut off at the heatmap border
+    left, right = min(x, radius), min(width - x, radius + 1)
+    top, bottom = min(y, radius), min(height - y, radius + 1)
+    target = heatmap[y - top:y + bottom, x - left:x + right]
+    visible = blob[radius - top:radius + bottom,
+                   radius - left:radius + right]
+    if min(visible.shape) > 0 and min(target.shape) > 0:
+        np.maximum(target, visible * k, out=target)
+    return heatmap
+
+
+def get_border(border, size):  # halve `border` until `size` exceeds twice the result
+    divisor = 1
+    while size - border // divisor <= border // divisor:
+        divisor *= 2
+    return border // divisor
diff --git a/rtdetr_paddle/ppdet/data/transform/operators.py b/rtdetr_paddle/ppdet/data/transform/operators.py
new file mode 100644
index 0000000..b64f032
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/transform/operators.py
@@ -0,0 +1,3797 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# function:
+#    operators to process sample,
+#    eg: decode/resize/crop image
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+try:
+    from collections.abc import Sequence
+except Exception:
+    from collections import Sequence
+
+from numbers import Number, Integral
+
+import uuid
+import random
+import math
+import numpy as np
+import os
+import copy
+import logging
+import cv2
+from PIL import Image, ImageDraw
+import pickle
+import threading
+MUTEX = threading.Lock()
+
+from ppdet.core.workspace import serializable
+from ..reader import Compose
+
+from .op_helper import (satisfy_sample_constraint, filter_and_process,
+                        generate_sample_bbox, clip_bbox, data_anchor_sampling,
+                        satisfy_sample_constraint_coverage, crop_image_sampling,
+                        generate_sample_bbox_square, bbox_area_sampling, is_poly)
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+registered_ops = []
+
+
+def register_op(cls):
+    """Class decorator that records an operator and exposes it on BaseOperator.
+
+    The class name is appended to `registered_ops`, the class is attached to
+    `BaseOperator` as an attribute of the same name, and the result of
+    `serializable(cls)` is returned.
+
+    Raises:
+        KeyError: if `BaseOperator` already has an attribute with that name.
+            The name has already been appended to `registered_ops` by then.
+    """
+    op_name = cls.__name__
+    registered_ops.append(op_name)
+    if hasattr(BaseOperator, op_name):
+        raise KeyError("The {} class has been registered.".format(op_name))
+    setattr(BaseOperator, op_name, cls)
+    return serializable(cls)
+
+
+class BboxError(ValueError):
+    """ValueError subclass; presumably signals an invalid bounding box.
+
+    NOTE(review): not raised anywhere in this part of the file - confirm usage.
+    """
+    pass
+
+
+class ImageError(ValueError):
+    """ValueError subclass raised by operators for unusable images,
+    e.g. when an image is not 3-dimensional."""
+    pass
+
+
+class BaseOperator(object):
+    """Base class of the sample operators.
+
+    Subclasses override `apply`; calling the operator dispatches to `apply`
+    for a single sample or for every element of a sequence of samples.
+    """
+
+    def __init__(self, name=None):
+        """
+        Args:
+            name (str): prefix of the operator id; the class name is used
+                when it is None. Six characters of a random UUID are appended.
+        """
+        prefix = self.__class__.__name__ if name is None else name
+        suffix = str(uuid.uuid4())[-6:]
+        self._id = prefix + '_' + suffix
+
+    def apply(self, sample, context=None):
+        """ Process a sample. The base implementation returns it unchanged.
+        Args:
+            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
+            context (dict): info about this sample processing
+        Returns:
+            result (dict): a processed sample
+        """
+        return sample
+
+    def __call__(self, sample, context=None):
+        """ Process one sample, or each sample of a sequence in place.
+        Args:
+            sample (dict|Sequence): a sample dict, eg:
+                {'image':xx, 'label': xxx}, or a mutable sequence of them
+            context (dict): info about this sample processing
+        Returns:
+            result (dict|Sequence): the processed sample; for a sequence,
+                the same sequence object with every element replaced
+        """
+        if not isinstance(sample, Sequence):
+            return self.apply(sample, context)
+
+        for idx, item in enumerate(sample):
+            sample[idx] = self.apply(item, context)
+        return sample
+
+    def __str__(self):
+        """Return the operator id as a string."""
+        return str(self._id)
+
+
+@register_op
+class Decode(BaseOperator):
+    def __init__(self):
+        """ Transform the image data to numpy format following the rgb format
+        """
+        super(Decode, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Decode sample['image'] into an RGB array and record its size.
+
+        If 'image' is missing, the raw bytes are read from sample['im_file']
+        and the 'im_file' key is removed. When decoding fails, the value
+        already stored in sample['image'] is used unchanged, so an array
+        that is already decoded passes through.
+
+        Args:
+            sample (dict): must contain 'image' or 'im_file'; the optional
+                keys 'keep_ori_im', 'h' and 'w' are honoured.
+            context (dict): unused.
+        Returns:
+            dict: the same sample with 'image', 'h', 'w', 'im_shape' and
+                'scale_factor' set. 'ori_image' may also be set (to the
+                BGR decode result) when 'keep_ori_im' is truthy.
+        Raises:
+            ImageError: if neither decoding nor the fallback yields a
+                numpy array.
+        """
+        if 'image' not in sample:
+            with open(sample['im_file'], 'rb') as f:
+                sample['image'] = f.read()
+            sample.pop('im_file')
+
+        raw = sample['image']
+        try:
+            data = np.frombuffer(raw, dtype='uint8')
+            im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode
+            if sample.get('keep_ori_im'):
+                sample['ori_image'] = im
+            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+        except Exception:
+            # Best-effort fallback: keep whatever was stored in 'image'.
+            # `Exception` (not a bare except) lets KeyboardInterrupt and
+            # SystemExit propagate.
+            im = raw
+
+        if not isinstance(im, np.ndarray):
+            # Without this check the failure would surface below as an
+            # AttributeError on `.shape`, hiding the real cause.
+            raise ImageError(
+                "{}: failed to decode the image and the input is not a "
+                "numpy array.".format(self))
+
+        sample['image'] = im
+        # Reconcile the annotated size with the decoded size.
+        for key, axis, label in (('h', 0, 'height'), ('w', 1, 'width')):
+            actual = im.shape[axis]
+            if key in sample and sample[key] != actual:
+                logger.warning(
+                    "The actual image {0}: {1} is not equal to the "
+                    "{0}: {2} in annotation, and update sample['{3}'] by "
+                    "actual image {0}.".format(label, actual, sample[key],
+                                               key))
+            sample[key] = actual
+
+        sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
+        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
+        return sample
+
+
+def _make_dirs(dirname):
+    """Create directory `dirname`, including missing parents.
+
+    Does nothing when the directory already exists.
+
+    Args:
+        dirname (str): path of the directory to create.
+    """
+    try:
+        from pathlib import Path
+    except ImportError:
+        from pathlib2 import Path
+    # parents=True so that a nested path such as "cache/train" works even
+    # when "cache" does not exist yet.
+    Path(dirname).mkdir(parents=True, exist_ok=True)
+
+
+@register_op
+class DecodeCache(BaseOperator):
+    def __init__(self, cache_root=None):
+        '''decode image and caching
+
+        Args:
+            cache_root (str): directory that holds the cached images; the
+                directory is created if needed. None disables caching.
+        '''
+        super(DecodeCache, self).__init__()
+
+        self.use_cache = cache_root is not None
+        self.cache_root = cache_root
+
+        if cache_root is not None:
+            _make_dirs(cache_root)
+
+    def apply(self, sample, context=None):
+        """Decode the sample image to RGB, going through the cache if enabled.
+
+        Args:
+            sample (dict): must contain 'im_file' when caching is enabled or
+                when 'image' is absent; 'keep_ori_im' is honoured.
+            context (dict): unused.
+        Returns:
+            dict: the same sample with 'image', 'h', 'w', 'im_shape' and
+                'scale_factor' set and 'im_file' removed.
+        """
+        cache_file = None
+        if self.use_cache:
+            cache_file = self.cache_path(self.cache_root, sample['im_file'])
+
+        if cache_file is not None and os.path.exists(cache_file):
+            im = self.load(cache_file)
+            if sample.get('keep_ori_im'):
+                # The cache stores RGB; swap the channels back so that
+                # 'ori_image' is BGR like on the decode path below.
+                sample['ori_image'] = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
+        else:
+            if 'image' not in sample:
+                with open(sample['im_file'], 'rb') as f:
+                    sample['image'] = f.read()
+
+            data = np.frombuffer(sample['image'], dtype='uint8')
+            im = cv2.imdecode(data, 1)  # BGR mode, but need RGB mode
+            if sample.get('keep_ori_im'):
+                sample['ori_image'] = im
+            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+
+            # Skip the write if the cache file appeared in the meantime.
+            if cache_file is not None and not os.path.exists(cache_file):
+                self.dump(im, cache_file)
+
+        sample['image'] = im
+        sample['h'] = im.shape[0]
+        sample['w'] = im.shape[1]
+
+        sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
+        sample['scale_factor'] = np.array([1., 1.], dtype=np.float32)
+
+        # A sample that only carries 'image' has no 'im_file' to remove.
+        sample.pop('im_file', None)
+
+        return sample
+
+    @staticmethod
+    def cache_path(dir_oot, im_file):
+        """Return the cache file path for `im_file` inside `dir_oot`.
+
+        Only the base name of `im_file` is used, so two images with the same
+        file name in different directories map to the same cache file.
+        """
+        return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl')
+
+    @staticmethod
+    def load(path):
+        """Unpickle and return the object stored at `path`.
+
+        NOTE(review): pickle.load executes arbitrary code from the file;
+        the cache directory must only contain files written by `dump`.
+        """
+        with open(path, 'rb') as f:
+            im = pickle.load(f)
+        return im
+
+    @staticmethod
+    def dump(obj, path):
+        """Pickle `obj` to `path`; failures are logged, not raised.
+
+        The data is written to a temporary sibling file and moved into place
+        with os.replace, so a concurrent reader never sees a partial file.
+        """
+        tmp_path = '{}.{}.tmp'.format(path, uuid.uuid4().hex)
+        with MUTEX:
+            try:
+                with open(tmp_path, 'wb') as f:
+                    pickle.dump(obj, f)
+                os.replace(tmp_path, path)
+            except Exception as e:
+                logger.warning('dump {} occurs exception {}'.format(path, str(
+                    e)))
+                # Do not leave a half-written temporary file behind.
+                try:
+                    if os.path.exists(tmp_path):
+                        os.remove(tmp_path)
+                except OSError:
+                    pass
+
+
+@register_op
+class Permute(BaseOperator):
+    def __init__(self):
+        """
+        Change the channel to be (C, H, W)
+        """
+        super(Permute, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Move the last axis of 'image' (and 'pre_image', if present) first.
+
+        Args:
+            sample (dict): must contain 'image'; 'pre_image' is optional.
+                Both are transposed with axes (2, 0, 1).
+            context (dict): unused.
+        Returns:
+            dict: the same sample with the transposed arrays stored back.
+        """
+        chw_axes = (2, 0, 1)
+        sample['image'] = sample['image'].transpose(chw_axes)
+
+        if 'pre_image' in sample:
+            sample['pre_image'] = sample['pre_image'].transpose(chw_axes)
+        return sample
+
+
+@register_op
+class Lighting(BaseOperator):
+    """
+    Lighting the image by eigenvalues and eigenvectors
+    Args:
+        eigval (list): eigenvalues
+        eigvec (list): eigenvectors
+        alphastd (float): random weight of lighting, 0.1 by default
+    """
+
+    def __init__(self, eigval, eigvec, alphastd=0.1):
+        super(Lighting, self).__init__()
+        self.alphastd = alphastd
+        # Both are stored as float32 arrays.
+        self.eigval = np.array(eigval).astype('float32')
+        self.eigvec = np.array(eigvec).astype('float32')
+
+    def apply(self, sample, context=None):
+        """Add one random lighting offset to 'image' (and 'pre_image').
+
+        Three weights are drawn from a normal distribution with standard
+        deviation `alphastd`; the offset is eigvec . (eigval * weights) and
+        is added in place. The same offset is used for both images.
+        """
+        weights = np.random.normal(scale=self.alphastd, size=(3, ))
+        offset = np.dot(self.eigvec, self.eigval * weights)
+
+        sample['image'] += offset
+        if 'pre_image' in sample:
+            sample['pre_image'] += offset
+        return sample
+
+
+@register_op
+class RandomErasingImage(BaseOperator):
+    def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3):
+        """
+        Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896
+        Args:
+            prob (float): probability to carry out random erasing
+            lower (float): lower limit of the erasing area ratio
+            higher (float): upper limit of the erasing area ratio
+            aspect_ratio (float): lower bound of the erased region's aspect
+                ratio; the upper bound is its reciprocal
+        """
+        super(RandomErasingImage, self).__init__()
+        self.prob = prob
+        self.lower = lower
+        self.higher = higher
+        self.aspect_ratio = aspect_ratio
+
+    def apply(self, sample, context=None):
+        """Zero a random rectangle inside each ground-truth box.
+
+        Every box is processed independently with probability `prob`. The
+        rectangle is only erased when it is strictly smaller than the box in
+        both dimensions. The image is modified in place.
+
+        Raises:
+            TypeError: if the image is not a numpy array.
+            ImageError: if the image is not 3-dimensional.
+        """
+        boxes = sample['gt_bbox']
+        im = sample['image']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image is not a numpy array.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError("{}: image is not 3-dimensional.".format(self))
+
+        for idx in range(boxes.shape[0]):
+            if np.random.rand() >= self.prob:
+                continue
+
+            x1, y1, x2, y2 = boxes[idx, :]
+            box_w = x2 - x1
+            box_h = y2 - y1
+
+            erase_area = random.uniform(self.lower, self.higher) * (box_w *
+                                                                    box_h)
+            ratio = random.uniform(self.aspect_ratio, 1 / self.aspect_ratio)
+
+            erase_h = int(round(math.sqrt(erase_area * ratio)))
+            erase_w = int(round(math.sqrt(erase_area / ratio)))
+
+            if not (erase_w < box_w and erase_h < box_h):
+                continue
+
+            # Random offset of the rectangle inside the box (y first, then x).
+            dy = random.randint(0, int(box_h - erase_h))
+            dx = random.randint(0, int(box_w - erase_w))
+            top, bottom = int(y1 + dy), int(y1 + dy + erase_h)
+            left, right = int(x1 + dx), int(x1 + dx + erase_w)
+            im[top:bottom, left:right, :] = 0
+
+        sample['image'] = im
+        return sample
+
+
+@register_op
+class NormalizeImage(BaseOperator):
+    def __init__(self,
+                 mean=[0.485, 0.456, 0.406],
+                 std=[0.229, 0.224, 0.225],
+                 is_scale=True,
+                 norm_type='mean_std'):
+        """
+        Args:
+            mean (list): per-channel value subtracted from the pixels
+            std (list): per-channel value the pixels are divided by
+                (a standard deviation, not a variance)
+            is_scale (bool): multiply the pixels by 1/255 first
+            norm_type (str): type in ['mean_std', 'none']
+        Raises:
+            TypeError: if mean/std are not lists, is_scale is not a bool or
+                norm_type is not one of the supported values.
+            ValueError: if the product of the std values is 0.
+        """
+        super(NormalizeImage, self).__init__()
+        self.mean = mean
+        self.std = std
+        self.is_scale = is_scale
+        self.norm_type = norm_type
+
+        args_ok = (isinstance(self.mean, list) and
+                   isinstance(self.std, list) and
+                   isinstance(self.is_scale, bool) and
+                   self.norm_type in ['mean_std', 'none'])
+        if not args_ok:
+            raise TypeError("{}: input type is invalid.".format(self))
+
+        from functools import reduce
+        std_product = reduce(lambda acc, value: acc * value, self.std)
+        if std_product == 0:
+            raise ValueError('{}: std is invalid!'.format(self))
+
+    def _normalize(self, im):
+        """Return `im` as float32, optionally scaled and mean/std normalized.
+
+        An input that is already float32 is modified in place, because
+        astype is called with copy=False.
+        """
+        im = im.astype(np.float32, copy=False)
+        if self.is_scale:
+            im *= 1.0 / 255.0
+
+        if self.norm_type == 'mean_std':
+            # Shape (1, 1, C) so the values broadcast over the last axis.
+            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
+            std = np.array(self.std)[np.newaxis, np.newaxis, :]
+            im -= mean
+            im /= std
+        return im
+
+    def apply(self, sample, context=None):
+        """Normalize the image.
+        Operators:
+            1.(optional) Scale the pixel to [0,1]
+            2.(optional) Each pixel minus mean and is divided by std
+        The same steps are applied to 'pre_image' when it is present.
+        """
+        sample['image'] = self._normalize(sample['image'])
+
+        if 'pre_image' in sample:
+            sample['pre_image'] = self._normalize(sample['pre_image'])
+
+        return sample
+
+
+@register_op
+class RandomDistort(BaseOperator):
+    """Random color distortion.
+    Args:
+        hue (list): hue settings. in [lower, upper, probability] format.
+        saturation (list): saturation settings. in [lower, upper, probability] format.
+        contrast (list): contrast settings. in [lower, upper, probability] format.
+        brightness (list): brightness settings. in [lower, upper, probability] format.
+        random_apply (bool): whether to apply in random (yolo) or fixed (SSD)
+            order.
+        count (int): the number of distortions to apply when random_apply
+            is True
+        random_channel (bool): whether to swap channels randomly; only used
+            in the fixed-order branch
+        prob (float): probability of distorting a sample at all
+
+    The third entry of each setting is the probability that the individual
+    distortion is APPLIED. Earlier code skipped the distortion with that
+    probability instead; the two agree for the default value 0.5.
+    """
+
+    def __init__(self,
+                 hue=[-18, 18, 0.5],
+                 saturation=[0.5, 1.5, 0.5],
+                 contrast=[0.5, 1.5, 0.5],
+                 brightness=[0.5, 1.5, 0.5],
+                 random_apply=True,
+                 count=4,
+                 random_channel=False,
+                 prob=1.0):
+        super(RandomDistort, self).__init__()
+        self.hue = hue
+        self.saturation = saturation
+        self.contrast = contrast
+        self.brightness = brightness
+        self.random_apply = random_apply
+        self.count = count
+        self.random_channel = random_channel
+        self.prob = prob
+
+    @staticmethod
+    def _should_apply(prob):
+        """Return True with probability `prob` (one uniform draw)."""
+        return np.random.uniform(0., 1.) < prob
+
+    def apply_hue(self, img):
+        """Rotate the chroma of `img` in YIQ space by a random angle."""
+        low, high, prob = self.hue
+        if not self._should_apply(prob):
+            return img
+
+        img = img.astype(np.float32)
+        # it works, but result differ from HSV version
+        # NOTE(review): delta is multiplied by pi directly, so the default
+        # range of +/-18 is not a rotation by +/-18 degrees - confirm the
+        # intended unit before changing it.
+        delta = np.random.uniform(low, high)
+        u = np.cos(delta * np.pi)
+        w = np.sin(delta * np.pi)
+        bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]])
+        tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321],
+                         [0.211, -0.523, 0.311]])
+        ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647],
+                          [1.0, -1.107, 1.705]])
+        t = np.dot(np.dot(ityiq, bt), tyiq).T
+        img = np.dot(img, t)
+        return img
+
+    def apply_saturation(self, img):
+        """Blend `img` with its gray version: img * delta + gray * (1 - delta)."""
+        low, high, prob = self.saturation
+        if not self._should_apply(prob):
+            return img
+        delta = np.random.uniform(low, high)
+        img = img.astype(np.float32)
+        # it works, but result differ from HSV version
+        gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32)
+        gray = gray.sum(axis=2, keepdims=True)
+        gray *= (1.0 - delta)
+        img *= delta
+        img += gray
+        return img
+
+    def apply_contrast(self, img):
+        """Multiply `img` by a random factor from the contrast range."""
+        low, high, prob = self.contrast
+        if not self._should_apply(prob):
+            return img
+        delta = np.random.uniform(low, high)
+        img = img.astype(np.float32)
+        img *= delta
+        return img
+
+    def apply_brightness(self, img):
+        """Add a random offset from the brightness range to `img`."""
+        low, high, prob = self.brightness
+        if not self._should_apply(prob):
+            return img
+        delta = np.random.uniform(low, high)
+        img = img.astype(np.float32)
+        # NOTE(review): the offset is added, not multiplied; with the
+        # default range [0.5, 1.5] this shifts pixel values by about 1.
+        img += delta
+        return img
+
+    def apply(self, sample, context=None):
+        """Distort sample['image'] and store the result back.
+
+        With probability 1 - prob the sample is returned untouched. In
+        random order mode, `count` of the four distortions are applied in a
+        random permutation. Otherwise the order is brightness, contrast
+        (before or after saturation/hue, chosen at random), saturation, hue,
+        followed by an optional random channel permutation.
+        """
+        if random.random() > self.prob:
+            return sample
+        img = sample['image']
+        if self.random_apply:
+            functions = [
+                self.apply_brightness, self.apply_contrast,
+                self.apply_saturation, self.apply_hue
+            ]
+            distortions = np.random.permutation(functions)[:self.count]
+            for func in distortions:
+                img = func(img)
+            sample['image'] = img
+            return sample
+
+        img = self.apply_brightness(img)
+        mode = np.random.randint(0, 2)
+
+        if mode:
+            img = self.apply_contrast(img)
+
+        img = self.apply_saturation(img)
+        img = self.apply_hue(img)
+
+        if not mode:
+            img = self.apply_contrast(img)
+
+        if self.random_channel:
+            if np.random.randint(0, 2):
+                img = img[..., np.random.permutation(3)]
+        sample['image'] = img
+        return sample
+
+
+@register_op
+class PhotoMetricDistortion(BaseOperator):
+    """Apply photometric distortion to image sequentially, every transformation
+    is applied with a probability of 0.5. The position of random contrast is in
+    second or second to last.
+
+    1. random brightness
+    2. random contrast (mode 1)
+    3. convert color from BGR to HSV
+    4. random saturation
+    5. random hue
+    6. convert color from HSV to BGR
+    7. random contrast (mode 0)
+    8. randomly swap channels
+
+    Args:
+        brightness_delta (int): delta of brightness.
+        contrast_range (tuple): range of contrast.
+        saturation_range (tuple): range of saturation.
+        hue_delta (int): delta of hue.
+    """
+
+    def __init__(self,
+                 brightness_delta=32,
+                 contrast_range=(0.5, 1.5),
+                 saturation_range=(0.5, 1.5),
+                 hue_delta=18):
+        super(PhotoMetricDistortion, self).__init__()
+        self.brightness_delta = brightness_delta
+        self.contrast_lower, self.contrast_upper = contrast_range
+        self.saturation_lower, self.saturation_upper = saturation_range
+        self.hue_delta = hue_delta
+
+    def _random_contrast(self, img):
+        """With probability 0.5, scale `img` in place by a random factor."""
+        if np.random.randint(2):
+            factor = np.random.uniform(self.contrast_lower,
+                                       self.contrast_upper)
+            img *= factor
+        return img
+
+    def apply(self, results, context=None):
+        """Call function to perform photometric distortion on images.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images distorted.
+        """
+        img = results['image'].astype(np.float32)
+
+        # random brightness
+        if np.random.randint(2):
+            img += np.random.uniform(-self.brightness_delta,
+                                     self.brightness_delta)
+
+        # mode == 1 --> do random contrast first
+        # mode == 0 --> do random contrast last
+        contrast_first = np.random.randint(2) == 1
+        if contrast_first:
+            img = self._random_contrast(img)
+
+        # convert color from BGR to HSV
+        # NOTE(review): Decode in this file stores RGB images, while the
+        # conversion constants here are the BGR ones - confirm channel order.
+        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+
+        # random saturation
+        if np.random.randint(2):
+            hsv[..., 1] *= np.random.uniform(self.saturation_lower,
+                                             self.saturation_upper)
+
+        # random hue, wrapped back into [0, 360]
+        if np.random.randint(2):
+            hue = hsv[..., 0]
+            hue += np.random.uniform(-self.hue_delta, self.hue_delta)
+            hue[hue > 360] -= 360
+            hue[hue < 0] += 360
+
+        # convert color from HSV to BGR
+        img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
+
+        # random contrast
+        if not contrast_first:
+            img = self._random_contrast(img)
+
+        # randomly swap channels
+        if np.random.randint(2):
+            img = img[..., np.random.permutation(3)]
+
+        results['image'] = img
+        return results
+
+    def __repr__(self):
+        """Return the class name followed by the configured ranges."""
+        contrast = (self.contrast_lower, self.contrast_upper)
+        saturation = (self.saturation_lower, self.saturation_upper)
+        pieces = [
+            self.__class__.__name__,
+            f'(\nbrightness_delta={self.brightness_delta},\n',
+            f'contrast_range={contrast},\n',
+            f'saturation_range={saturation},\n',
+            f'hue_delta={self.hue_delta})',
+        ]
+        return ''.join(pieces)
+
+
+@register_op
+class AutoAugment(BaseOperator):
+    def __init__(self, autoaug_type="v1"):
+        """
+        Args:
+            autoaug_type (str): autoaug type, support v0, v1, v2, v3, test
+        """
+        super(AutoAugment, self).__init__()
+        self.autoaug_type = autoaug_type
+
+    def apply(self, sample, context=None):
+        """
+        Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172
+
+        The boxes are handed to the augmentation with columns 0/1 and 2/3
+        swapped and divided by the image height/width, then converted back
+        and written into sample['gt_bbox'] in place. Samples without boxes
+        are returned unchanged.
+
+        Raises:
+            TypeError: if the image is not a numpy array.
+            ImageError: if the image is not 3-dimensional.
+        """
+        im = sample['image']
+        gt_bbox = sample['gt_bbox']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image is not a numpy array.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError("{}: image is not 3-dimensional.".format(self))
+        if len(gt_bbox) == 0:
+            return sample
+
+        height, width, _ = im.shape
+        # Column i of one layout comes from column swapped[i] of the other.
+        swapped = (1, 0, 3, 2)
+
+        norm_scale = (float(height), float(width))
+        norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32)
+        for dst, src in enumerate(swapped):
+            norm_gt_bbox[:, dst] = gt_bbox[:, src] / norm_scale[dst % 2]
+
+        from .autoaugment_utils import distort_image_with_autoaugment
+        im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox,
+                                                          self.autoaug_type)
+
+        pixel_scale = (float(width), float(height))
+        for dst, src in enumerate(swapped):
+            gt_bbox[:, dst] = norm_gt_bbox[:, src] * pixel_scale[dst % 2]
+
+        sample['image'] = im
+        sample['gt_bbox'] = gt_bbox
+        return sample
+
+
+@register_op
+class RandomFlip(BaseOperator):
+    def __init__(self, prob=0.5):
+        """
+        Args:
+            prob (float|int): the probability of flipping image
+        Raises:
+            TypeError: if prob is neither a float nor an integer.
+        """
+        super(RandomFlip, self).__init__()
+        self.prob = prob
+        # Integers are accepted too, so that prob=0 / prob=1 work.
+        if not isinstance(self.prob, (float, Integral)):
+            raise TypeError("{}: input type is invalid.".format(self))
+
+    def apply_segm(self, segms, height, width):
+        """Flip polygon or RLE segmentations horizontally.
+
+        Args:
+            segms (list): one entry per instance, either a list of polygons
+                (flat x, y, x, y, ... sequences) or an RLE dict.
+            height (int): image height, needed to decode uncompressed RLE.
+            width (int): image width.
+        Returns:
+            list: the flipped segmentations, in the same order.
+        """
+
+        def _flip_poly(poly, width):
+            flipped_poly = np.array(poly)
+            # Even positions hold the x coordinates.
+            flipped_poly[0::2] = width - np.array(poly[0::2])
+            return flipped_poly.tolist()
+
+        def _flip_rle(rle, height, width):
+            # Imported lazily: pycocotools is only needed for RLE input.
+            import pycocotools.mask as mask_util
+            if 'counts' in rle and isinstance(rle['counts'], list):
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[:, ::-1]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        flipped_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                flipped_segms.append([_flip_poly(poly, width) for poly in segm])
+            else:
+                # RLE format
+                flipped_segms.append(_flip_rle(segm, height, width))
+        return flipped_segms
+
+    def apply_keypoint(self, gt_keypoint, width):
+        """Mirror the even columns (x coordinates) of `gt_keypoint` in place."""
+        gt_keypoint[:, 0::2] = width - gt_keypoint[:, 0::2]
+        return gt_keypoint
+
+    def apply_image(self, image):
+        """Return `image` with its second axis reversed (a view, not a copy)."""
+        return image[:, ::-1, :]
+
+    def apply_bbox(self, bbox, width):
+        """Mirror columns 0 and 2 (x1, x2) of `bbox` in place."""
+        oldx1 = bbox[:, 0].copy()
+        oldx2 = bbox[:, 2].copy()
+        bbox[:, 0] = width - oldx2
+        bbox[:, 2] = width - oldx1
+        return bbox
+
+    def apply(self, sample, context=None):
+        """Filp the image and bounding box.
+        Operators:
+            1. Flip the image numpy.
+            2. Transform the bboxes' x coordinates.
+              (Must judge whether the coordinates are normalized!)
+            3. Transform the segmentations' x coordinates.
+              (Must judge whether the coordinates are normalized!)
+        Output:
+            sample: the image, bounding box and segmentation part
+                    in sample are flipped, and sample['flipped'] is set to
+                    True. The sample is returned untouched when no flip
+                    is drawn.
+        """
+        if np.random.uniform(0, 1) < self.prob:
+            im = sample['image']
+            height, width = im.shape[:2]
+            im = self.apply_image(im)
+            if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+                sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width)
+            if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+                sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height,
+                                                    width)
+            if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
+                sample['gt_keypoint'] = self.apply_keypoint(
+                    sample['gt_keypoint'], width)
+
+            # Testing the array itself for truth raises ValueError for any
+            # array with more than one element, so check type and size.
+            semantic = sample.get('semantic')
+            if isinstance(semantic, np.ndarray) and semantic.size > 0:
+                sample['semantic'] = semantic[:, ::-1]
+
+            if 'gt_segm' in sample and sample['gt_segm'].any():
+                sample['gt_segm'] = sample['gt_segm'][:, :, ::-1]
+
+            sample['flipped'] = True
+            sample['image'] = im
+        return sample
+
+
+@register_op
+class Resize(BaseOperator):
+    def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
+        """
+        Resize image to target size. If keep_ratio is True, the image is
+        scaled uniformly so that it fits within target_size; if keep_ratio
+        is False, the image is resized to exactly target_size (h, w).
+        Args:
+            target_size (int|list): image target size; an int is expanded
+                to [target_size, target_size]
+            keep_ratio (bool): whether to keep the aspect ratio (required,
+                there is no default)
+            interp (int): the interpolation method
+        Raises:
+            TypeError: if target_size is neither an integer nor a sequence.
+        """
+        super(Resize, self).__init__()
+        self.keep_ratio = keep_ratio
+        self.interp = interp
+
+        if isinstance(target_size, Integral):
+            target_size = [target_size, target_size]
+        elif not isinstance(target_size, Sequence):
+            raise TypeError(
+                "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
+                format(type(target_size)))
+        self.target_size = target_size
+
+    def apply_image(self, image, scale):
+        """Resize `image` by the factors scale = (x, y) using self.interp."""
+        factor_x, factor_y = scale
+        resize_kwargs = dict(
+            fx=factor_x, fy=factor_y, interpolation=self.interp)
+        # No explicit output size: it is derived from the two factors.
+        return cv2.resize(image, None, None, **resize_kwargs)
+
+    def apply_bbox(self, bbox, scale, size):
+        """Scale the boxes in place and clip them to the resized image.
+
+        Args:
+            bbox (np.ndarray): boxes with x values in the even columns and
+                y values in the odd columns; modified in place.
+            scale (sequence): (x, y) scale factors.
+            size (sequence): (width, height) used as the upper clip bounds.
+        Returns:
+            np.ndarray: the same `bbox` array.
+        """
+        factor_x, factor_y = scale
+        max_x, max_y = size
+        x_cols = (slice(None), slice(0, None, 2))
+        y_cols = (slice(None), slice(1, None, 2))
+
+        bbox[x_cols] *= factor_x
+        bbox[y_cols] *= factor_y
+        bbox[x_cols] = np.clip(bbox[x_cols], 0, max_x)
+        bbox[y_cols] = np.clip(bbox[y_cols], 0, max_y)
+        return bbox
+
+    def apply_area(self, area, scale):
+        """Return `area` multiplied by both scale factors, scale = (x, y)."""
+        factor_x, factor_y = scale
+        scaled = area * factor_x
+        return scaled * factor_y
+
+    def apply_joints(self, joints, scale, size):
+        """Scale the joints in place and clip them to the resized image.
+
+        Args:
+            joints (np.ndarray): array whose last axis starts with (x, y);
+                modified in place, further entries are left untouched.
+            scale (sequence): (x, y) scale factors.
+            size (sequence): (width, height) used as the upper clip bounds.
+        Returns:
+            np.ndarray: the same `joints` array.
+        """
+        factor_x, factor_y = scale
+        max_x, max_y = size
+
+        joints[..., 0] *= factor_x
+        joints[..., 1] *= factor_y
+        for axis, upper in ((0, max_x), (1, max_y)):
+            joints[..., axis] = np.clip(joints[..., axis], 0, upper)
+        return joints
+
+    def apply_segm(self, segms, im_size, scale):
+        """Resize polygon or RLE segmentations by scale = (x, y).
+
+        Args:
+            segms (list): one entry per instance, either a list of polygons
+                (flat x, y, x, y, ... sequences) or an RLE dict.
+            im_size (sequence): (height, width) of the image before resizing,
+                needed to decode uncompressed RLE.
+            scale (sequence): (x, y) scale factors.
+        Returns:
+            list: the resized segmentations, in the same order.
+        """
+        src_h, src_w = im_size
+        factor_x, factor_y = scale
+
+        def _scale_polygon(poly):
+            coords = np.array(poly).astype('float32')
+            # Even positions are x values, odd positions are y values.
+            coords[0::2] *= factor_x
+            coords[1::2] *= factor_y
+            return coords.tolist()
+
+        def _scale_rle(rle):
+            # Imported lazily: pycocotools is only needed for RLE input.
+            import pycocotools.mask as mask_util
+            if 'counts' in rle and type(rle['counts']) is list:
+                rle = mask_util.frPyObjects(rle, src_h, src_w)
+
+            decoded = mask_util.decode(rle)
+            resized = cv2.resize(
+                decoded,
+                None,
+                None,
+                fx=factor_x,
+                fy=factor_y,
+                interpolation=self.interp)
+            return mask_util.encode(
+                np.array(resized, order='F', dtype=np.uint8))
+
+        resized_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                resized_segms.append([_scale_polygon(poly) for poly in segm])
+            else:
+                # RLE format
+                resized_segms.append(_scale_rle(segm))
+
+        return resized_segms
+
+    def apply(self, sample, context=None):
+        """ Resize the image numpy.
+        """
+        im = sample['image']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+
+        # apply image
+        if len(im.shape) == 3:
+            im_shape = im.shape
+        else:
+            im_shape = im[0].shape
+
+        if self.keep_ratio:
+            im_size_min = np.min(im_shape[0:2])
+            im_size_max = np.max(im_shape[0:2])
+
+            target_size_min = np.min(self.target_size)
+            target_size_max = np.max(self.target_size)
+
+            im_scale = min(target_size_min / im_size_min,
+                           target_size_max / im_size_max)
+
+            resize_h = int(im_scale * float(im_shape[0]) + 0.5)
+            resize_w = int(im_scale * float(im_shape[1]) + 0.5)
+
+            im_scale_x = im_scale
+            im_scale_y = im_scale
+        else:
+            resize_h, resize_w = self.target_size
+            im_scale_y = resize_h / im_shape[0]
+            im_scale_x = resize_w / im_shape[1]
+
+        if len(im.shape) == 3:
+            im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
+            sample['image'] = im.astype(np.float32)
+        else:
+            resized_images = []
+            for one_im in im:
+                applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y])
+                resized_images.append(applied_im)
+
+            sample['image'] = np.array(resized_images)
+
+        # 2d keypoints resize
+        if 'kps2d' in sample.keys():
+            kps2d = sample['kps2d']
+            kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x
+            kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y
+
+            sample['kps2d'] = kps2d
+
+        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
+        if 'scale_factor' in sample:
+            scale_factor = sample['scale_factor']
+            sample['scale_factor'] = np.asarray(
+                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
+                dtype=np.float32)
+        else:
+            sample['scale_factor'] = np.asarray(
+                [im_scale_y, im_scale_x], dtype=np.float32)
+
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
+                                                [im_scale_x, im_scale_y],
+                                                [resize_w, resize_h])
+
+        # apply areas
+        if 'gt_areas' in sample:
+            sample['gt_areas'] = self.apply_area(sample['gt_areas'],
+                                                 [im_scale_x, im_scale_y])
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
+                                                [im_scale_x, im_scale_y])
+
+        # apply semantic
+        if 'semantic' in sample and sample['semantic']:
+            semantic = sample['semantic']
+            semantic = cv2.resize(
+                semantic.astype('float32'),
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            semantic = np.asarray(semantic).astype('int32')
+            semantic = np.expand_dims(semantic, 0)
+            sample['semantic'] = semantic
+
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            masks = [
+                cv2.resize(
+                    gt_segm,
+                    None,
+                    None,
+                    fx=im_scale_x,
+                    fy=im_scale_y,
+                    interpolation=cv2.INTER_NEAREST)
+                for gt_segm in sample['gt_segm']
+            ]
+            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+
+        if 'gt_joints' in sample:
+            sample['gt_joints'] = self.apply_joints(sample['gt_joints'],
+                                                    [im_scale_x, im_scale_y],
+                                                    [resize_w, resize_h])
+
+        return sample
+
+
+@register_op
+class RandomResize(BaseOperator):
+    def __init__(self,
+                 target_size,
+                 keep_ratio=True,
+                 interp=cv2.INTER_LINEAR,
+                 random_range=False,
+                 random_size=True,
+                 random_interp=False):
+        """
+        Resize the image to a randomly drawn target size, optionally with a
+        randomly drawn interpolation method.
+        Args:
+            target_size (int, list, tuple): image target size; must be a list or
+                tuple when random_size or random_range is True
+            keep_ratio (bool): forwarded to Resize as its keep_ratio argument,
+                default True
+            interp (int): interpolation method used when random_interp is False
+            random_range (bool): whether to draw the short edge from a range; the
+                target_size must then be
+                [[min_short_edge, long_edge], [max_short_edge, long_edge]]
+            random_size (bool): whether to pick one entry of target_size at random
+            random_interp (bool): whether to pick the interpolation method at random
+        """
+        super(RandomResize, self).__init__()
+        assert isinstance(target_size, (
+            Integral, Sequence)), "target_size must be Integer, List or Tuple"
+        needs_sequence = random_range or random_size
+        if needs_sequence and not isinstance(target_size, Sequence):
+            raise TypeError(
+                "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}".
+                format(type(target_size)))
+        if random_range and len(target_size) != 2:
+            raise TypeError(
+                "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True."
+            )
+
+        self.target_size = target_size
+        self.keep_ratio = keep_ratio
+        self.interp = interp
+        # candidates used when random_interp is enabled
+        self.interps = [
+            cv2.INTER_NEAREST,
+            cv2.INTER_LINEAR,
+            cv2.INTER_AREA,
+            cv2.INTER_CUBIC,
+            cv2.INTER_LANCZOS4,
+        ]
+        self.random_range = random_range
+        self.random_size = random_size
+        self.random_interp = random_interp
+
+    def apply(self, sample, context=None):
+        """ Draw a target size and interpolation, then delegate to Resize.
+        """
+        if self.random_range:
+            lower, upper = self.target_size[0], self.target_size[1]
+            # randint excludes its upper bound, hence the + 1 on the short edge
+            short_edge = np.random.randint(lower[0], upper[0] + 1)
+            # NOTE(review): the + 1 on the second long edge mirrors the original
+            # code; it looks unintentional -- confirm before changing.
+            long_edge = max(lower[1], upper[1] + 1)
+            target_size = [short_edge, long_edge]
+        elif self.random_size:
+            target_size = random.choice(self.target_size)
+        else:
+            target_size = self.target_size
+
+        interp = random.choice(
+            self.interps) if self.random_interp else self.interp
+
+        return Resize(target_size, self.keep_ratio, interp)(sample,
+                                                            context=context)
+
+
+@register_op
+class RandomExpand(BaseOperator):
+    """Random expand the canvas.
+    Args:
+        ratio (float): maximum expansion ratio.
+        prob (float): probability to expand.
+        fill_value (list): color value used to fill the canvas. in RGB order.
+    """
+
+    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
+        super(RandomExpand, self).__init__()
+        assert ratio > 1.01, "expand ratio must be larger than 1.01"
+        self.ratio = ratio
+        self.prob = prob
+        assert isinstance(fill_value, (Number, Sequence)), \
+            "fill value must be either float or sequence"
+        # normalise fill_value to a tuple; a scalar is repeated for 3 channels
+        if isinstance(fill_value, Number):
+            fill_value = (fill_value, ) * 3
+        if not isinstance(fill_value, tuple):
+            fill_value = tuple(fill_value)
+        self.fill_value = fill_value
+
+    def apply(self, sample, context=None):
+        """Pad the sample onto a larger canvas with probability `prob`.
+
+        The canvas is `ratio` times the image size, with `ratio` drawn
+        uniformly from [1, self.ratio), and the image is placed at a random
+        offset inside it. The padding itself is delegated to `Pad`.
+        """
+        # `prob` is the probability to expand, so skip when the draw is >= prob
+        # (the previous `< prob` test expanded with probability 1 - prob).
+        if np.random.uniform(0., 1.) >= self.prob:
+            return sample
+
+        im = sample['image']
+        height, width = im.shape[:2]
+        ratio = np.random.uniform(1., self.ratio)
+        h = int(height * ratio)
+        w = int(width * ratio)
+        # truncation can leave the canvas no larger than the image; nothing
+        # to expand in that case (and randint below needs a non-empty range)
+        if h <= height or w <= width:
+            return sample
+        y = np.random.randint(0, h - height)
+        x = np.random.randint(0, w - width)
+        offsets, size = [x, y], [h, w]
+
+        pad = Pad(size,
+                  pad_mode=-1,
+                  offsets=offsets,
+                  fill_value=self.fill_value)
+
+        return pad(sample, context=context)
+
+
+@register_op
+class CropWithSampling(BaseOperator):
+    def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):
+        """
+        Args:
+            batch_sampler (list): Multiple sets of different
+                                  parameters for cropping.
+            satisfy_all (bool): whether all boxes must satisfy.
+            e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
+                 [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
+           [max sample, max trial, min scale, max scale,
+            min aspect ratio, max aspect ratio,
+            min overlap, max overlap]
+            avoid_no_bbox (bool): whether to avoid the
+                                  situation where the box does not appear.
+        """
+        super(CropWithSampling, self).__init__()
+        self.batch_sampler = batch_sampler
+        self.satisfy_all = satisfy_all
+        self.avoid_no_bbox = avoid_no_bbox
+
+    def apply(self, sample, context=None):
+        """
+        Crop the image and modify bounding box.
+        Operators:
+            1. Scale the image width and height.
+            2. Crop the image according to a random sample.
+            3. Rescale the bounding box.
+            4. Determine if the new bbox is satisfied in the new image.
+        Args:
+            sample (dict): must hold 'image', 'gt_bbox' and 'gt_class';
+                'gt_score' is optional.
+            context: unused here; defaults to None like the other operators.
+        Returns:
+            sample: the image, bounding box are replaced. The sample is
+                returned unchanged when no candidate crop is accepted.
+        """
+        assert 'image' in sample, "image data not found"
+        im = sample['image']
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+        im_height, im_width = im.shape[:2]
+        gt_score = None
+        if 'gt_score' in sample:
+            gt_score = sample['gt_score']
+        sampled_bbox = []
+        # NOTE(review): candidate boxes are scaled by the image size below, so
+        # gt_bbox is presumably expected in normalized coordinates -- confirm.
+        gt_bbox = gt_bbox.tolist()
+        for sampler in self.batch_sampler:
+            # sampler[0] = max samples to keep, sampler[1] = max trials
+            found = 0
+            for _ in range(sampler[1]):
+                if found >= sampler[0]:
+                    break
+                sample_bbox = generate_sample_bbox(sampler)
+                if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox,
+                                             self.satisfy_all):
+                    sampled_bbox.append(sample_bbox)
+                    found = found + 1
+        im = np.array(im)
+        # try the candidates in random order; the first acceptable one wins
+        while sampled_bbox:
+            idx = int(np.random.uniform(0, len(sampled_bbox)))
+            sample_bbox = sampled_bbox.pop(idx)
+            sample_bbox = clip_bbox(sample_bbox)
+            crop_bbox, crop_class, crop_score = \
+                filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score)
+            if self.avoid_no_bbox and len(crop_bbox) < 1:
+                continue
+            xmin = int(sample_bbox[0] * im_width)
+            xmax = int(sample_bbox[2] * im_width)
+            ymin = int(sample_bbox[1] * im_height)
+            ymax = int(sample_bbox[3] * im_height)
+            im = im[ymin:ymax, xmin:xmax]
+            sample['image'] = im
+            sample['gt_bbox'] = crop_bbox
+            sample['gt_class'] = crop_class
+            # only write scores back when the sample carried them, so a
+            # score-less sample does not gain a placeholder 'gt_score' entry
+            if 'gt_score' in sample:
+                sample['gt_score'] = crop_score
+            return sample
+        return sample
+
+
+@register_op
+class CropWithDataAchorSampling(BaseOperator):
+    def __init__(self,
+                 batch_sampler,
+                 anchor_sampler=None,
+                 target_size=None,
+                 das_anchor_scales=[16, 32, 64, 128],
+                 sampling_prob=0.5,
+                 min_size=8.,
+                 avoid_no_bbox=True):
+        """
+        Args:
+            anchor_sampler (list): anchor_sampling sets of different
+                                  parameters for cropping.
+            batch_sampler (list): Multiple sets of different
+                                  parameters for cropping.
+              e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]]
+                  [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
+                   [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]]
+              [max sample, max trial, min scale, max scale,
+               min aspect ratio, max aspect ratio,
+               min overlap, max overlap, min coverage, max coverage]
+            target_size (int): target image size.
+            das_anchor_scales (list[float]): a list of anchor scales in data
+                anchor sampling.
+            sampling_prob (float): threshold on a uniform draw; anchor
+                sampling is used when the draw exceeds it, batch sampling
+                otherwise.
+            min_size (float): minimum size of sampled bbox.
+            avoid_no_bbox (bool): whether to avoid the
+                                  situation where the box does not appear.
+        """
+        super(CropWithDataAchorSampling, self).__init__()
+        self.anchor_sampler = anchor_sampler
+        self.batch_sampler = batch_sampler
+        self.target_size = target_size
+        self.sampling_prob = sampling_prob
+        self.min_size = min_size
+        self.avoid_no_bbox = avoid_no_bbox
+        # copied into an array, so the list default is never mutated
+        self.das_anchor_scales = np.array(das_anchor_scales)
+
+    def _filter_gt(self, sample, sample_bbox, gt_bbox, gt_class, gt_score):
+        """Filter the ground truth against sample_bbox, then by box area.
+
+        Returns (crop_bbox, crop_class, crop_score, gt_keypoints); the last
+        item is None when the sample has no 'gt_keypoint' entry.
+        """
+        gt_keypoints = None
+        if 'gt_keypoint' in sample.keys():
+            keypoints = (sample['gt_keypoint'], sample['keypoint_ignore'])
+            crop_bbox, crop_class, crop_score, gt_keypoints = \
+                filter_and_process(sample_bbox, gt_bbox, gt_class,
+                                   scores=gt_score,
+                                   keypoints=keypoints)
+        else:
+            crop_bbox, crop_class, crop_score = filter_and_process(
+                sample_bbox, gt_bbox, gt_class, scores=gt_score)
+        # sampling bbox according the bbox area
+        crop_bbox, crop_class, crop_score = bbox_area_sampling(
+            crop_bbox, crop_class, crop_score, self.target_size,
+            self.min_size)
+        return crop_bbox, crop_class, crop_score, gt_keypoints
+
+    def _write_back(self, sample, im, crop_bbox, crop_class, crop_score,
+                    gt_keypoints):
+        """Scale crop_bbox to the cropped image's pixels and store the crop."""
+        height, width = im.shape[:2]
+        crop_bbox[:, 0] *= width
+        crop_bbox[:, 1] *= height
+        crop_bbox[:, 2] *= width
+        crop_bbox[:, 3] *= height
+        sample['image'] = im
+        sample['gt_bbox'] = crop_bbox
+        sample['gt_class'] = crop_class
+        if 'gt_score' in sample:
+            sample['gt_score'] = crop_score
+        if 'gt_keypoint' in sample.keys():
+            sample['gt_keypoint'] = gt_keypoints[0]
+            sample['keypoint_ignore'] = gt_keypoints[1]
+        return sample
+
+    def apply(self, sample, context=None):
+        """
+        Crop the image and modify bounding box.
+        Operators:
+            1. Scale the image width and height.
+            2. Crop the image according to a random sample.
+            3. Rescale the bounding box.
+            4. Determine if the new bbox is satisfied in the new image.
+        Args:
+            sample (dict): must hold 'image', 'gt_bbox' and 'gt_class';
+                'gt_score' and 'gt_keypoint'/'keypoint_ignore' are optional.
+            context: unused here; defaults to None like the other operators.
+        Returns:
+            sample: the image, bounding box are replaced. The sample is
+                returned untouched when no candidate crop is accepted.
+        """
+        assert 'image' in sample, "image data not found"
+        im = sample['image']
+        gt_class = sample['gt_class']
+        image_height, image_width = im.shape[:2]
+        # Normalize a copy: dividing sample['gt_bbox'] in place would leave
+        # normalized boxes in the sample whenever no crop is accepted.
+        gt_bbox = sample['gt_bbox'].copy()
+        gt_bbox[:, 0] /= image_width
+        gt_bbox[:, 1] /= image_height
+        gt_bbox[:, 2] /= image_width
+        gt_bbox[:, 3] /= image_height
+        gt_score = None
+        if 'gt_score' in sample:
+            gt_score = sample['gt_score']
+        sampled_bbox = []
+        gt_bbox = gt_bbox.tolist()
+
+        prob = np.random.uniform(0., 1.)
+        use_anchor_sampling = prob > self.sampling_prob
+        if use_anchor_sampling:
+            assert self.anchor_sampler
+            for sampler in self.anchor_sampler:
+                # sampler[0] = max samples to keep, sampler[1] = max trials
+                found = 0
+                for _ in range(sampler[1]):
+                    if found >= sampler[0]:
+                        break
+                    sample_bbox = data_anchor_sampling(
+                        gt_bbox, image_width, image_height,
+                        self.das_anchor_scales, self.target_size)
+                    # a result equal to 0 ends the trials for this sampler
+                    if sample_bbox == 0:
+                        break
+                    if satisfy_sample_constraint_coverage(sampler, sample_bbox,
+                                                          gt_bbox):
+                        sampled_bbox.append(sample_bbox)
+                        found = found + 1
+        else:
+            for sampler in self.batch_sampler:
+                found = 0
+                for _ in range(sampler[1]):
+                    if found >= sampler[0]:
+                        break
+                    sample_bbox = generate_sample_bbox_square(
+                        sampler, image_width, image_height)
+                    if satisfy_sample_constraint_coverage(sampler, sample_bbox,
+                                                          gt_bbox):
+                        sampled_bbox.append(sample_bbox)
+                        found = found + 1
+
+        im = np.array(im)
+        # try the candidates in random order; the first acceptable one wins
+        while sampled_bbox:
+            idx = int(np.random.uniform(0, len(sampled_bbox)))
+            sample_bbox = sampled_bbox.pop(idx)
+            if not use_anchor_sampling:
+                # only batch-sampled boxes are clipped before use
+                sample_bbox = clip_bbox(sample_bbox)
+
+            crop_bbox, crop_class, crop_score, gt_keypoints = self._filter_gt(
+                sample, sample_bbox, gt_bbox, gt_class, gt_score)
+
+            if self.avoid_no_bbox and len(crop_bbox) < 1:
+                continue
+
+            if use_anchor_sampling:
+                im = crop_image_sampling(im, sample_bbox, image_width,
+                                         image_height, self.target_size)
+            else:
+                xmin = int(sample_bbox[0] * image_width)
+                xmax = int(sample_bbox[2] * image_width)
+                ymin = int(sample_bbox[1] * image_height)
+                ymax = int(sample_bbox[3] * image_height)
+                im = im[ymin:ymax, xmin:xmax]
+
+            return self._write_back(sample, im, crop_bbox, crop_class,
+                                    crop_score, gt_keypoints)
+        return sample
+
+
+@register_op
+class RandomCrop(BaseOperator):
+    """Random crop image and bboxes.
+    Args:
+        aspect_ratio (list): aspect ratio of cropped region.
+            in [min, max] format.
+        thresholds (list): iou thresholds for decide a valid bbox crop.
+        scaling (list): ratio between a cropped region and the original image.
+             in [min, max] format.
+        num_attempts (int): number of tries before giving up.
+        allow_no_crop (bool): allow return without actually cropping them.
+        cover_all_box (bool): ensure all bboxes are covered in the final crop.
+        is_mask_crop(bool): whether crop the segmentation.
+        ioumode (str): overlap measure compared against the thresholds:
+            "iou" (intersection over union) or "iof" (intersection over
+            the gt box area).
+        prob (float): probability of applying the operator.
+    """
+
+    def __init__(self,
+                 aspect_ratio=[.5, 2.],
+                 thresholds=[.0, .1, .3, .5, .7, .9],
+                 scaling=[.3, 1.],
+                 num_attempts=50,
+                 allow_no_crop=True,
+                 cover_all_box=False,
+                 is_mask_crop=False,
+                 ioumode="iou",
+                 prob=1.0):
+        super(RandomCrop, self).__init__()
+        # the list defaults are only read, never mutated, by this class
+        self.aspect_ratio = aspect_ratio
+        self.thresholds = thresholds
+        self.scaling = scaling
+        self.num_attempts = num_attempts
+        self.allow_no_crop = allow_no_crop
+        self.cover_all_box = cover_all_box
+        self.is_mask_crop = is_mask_crop
+        self.ioumode = ioumode
+        self.prob = prob
+
+    def crop_segms(self, segms, valid_ids, crop, height, width):
+        """Crop the segmentations selected by valid_ids to the crop window.
+
+        Args:
+            segms (list): per-object segmentations, polygon or RLE format.
+            valid_ids (iterable): indices into segms to process.
+            crop (sequence): crop window as [xmin, ymin, xmax, ymax].
+            height (int): image height, used to decode uncompressed RLE.
+            width (int): image width, used to decode uncompressed RLE.
+        Returns:
+            list: one cropped segmentation per entry of valid_ids; a polygon
+                object that falls outside the window yields an empty list.
+        """
+
+        def _crop_poly(segm, crop):
+            # imported lazily so shapely is only needed for polygon cropping
+            import shapely.ops
+            from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
+            logging.getLogger("shapely").setLevel(logging.WARNING)
+
+            xmin, ymin, xmax, ymax = crop
+            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
+            crop_p = Polygon(np.array(crop_coord).reshape(4, 2))
+
+            crop_segm = list()
+            for poly in segm:
+                poly = np.array(poly).reshape(len(poly) // 2, 2)
+                polygon = Polygon(poly)
+                if not polygon.is_valid:
+                    # rebuild an invalid (e.g. self-intersecting) polygon
+                    # from the pieces of its own exterior ring
+                    exterior = polygon.exterior
+                    multi_lines = exterior.intersection(exterior)
+                    polygons = shapely.ops.polygonize(multi_lines)
+                    polygon = MultiPolygon(list(polygons))
+                # multi-part geometries are not iterable in Shapely 2.x;
+                # `.geoms` works on both 1.x and 2.x
+                if isinstance(polygon, MultiPolygon):
+                    pieces = list(polygon.geoms)
+                else:
+                    pieces = [polygon]
+                for per_polygon in pieces:
+                    inter = per_polygon.intersection(crop_p)
+                    if inter.is_empty:
+                        continue
+                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
+                        parts = inter.geoms
+                    elif isinstance(inter, Polygon):
+                        parts = [inter]
+                    else:
+                        continue
+                    for part in parts:
+                        # a collection may also hold points/lines; skip them
+                        if not isinstance(part, Polygon):
+                            continue
+                        # flatten to [x0, y0, x1, y1, ...] without the
+                        # closing vertex, then shift into crop coordinates
+                        coords = np.array(part.exterior.coords[:-1]).reshape(
+                            -1)
+                        coords[0::2] -= xmin
+                        coords[1::2] -= ymin
+                        crop_segm.append(coords.tolist())
+            return crop_segm
+
+        def _crop_rle(rle, crop, height, width):
+            # imported lazily so pycocotools is only needed for RLE cropping
+            import pycocotools.mask as mask_util
+            if 'counts' in rle and isinstance(rle['counts'], list):
+                # uncompressed RLE must be converted before decoding
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        crop_segms = []
+        for idx in valid_ids:
+            segm = segms[idx]
+            if is_poly(segm):
+                # Polygon format
+                crop_segms.append(_crop_poly(segm, crop))
+            else:
+                # RLE format
+                crop_segms.append(_crop_rle(segm, crop, height, width))
+        return crop_segms
+
+    def set_fake_bboxes(self, sample):
+        """Install placeholder gt_bbox/gt_class so box-less samples can be cropped."""
+        sample['gt_bbox'] = np.array(
+            [
+                [32, 32, 128, 128],
+                [32, 32, 128, 256],
+                [32, 64, 128, 128],
+                [32, 64, 128, 256],
+                [64, 64, 128, 256],
+                [64, 64, 256, 256],
+                [64, 32, 128, 256],
+                [64, 32, 128, 256],
+                [96, 32, 128, 256],
+                [96, 32, 128, 256],
+            ],
+            dtype=np.float32)
+        sample['gt_class'] = np.array(
+            [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32)
+        return sample
+
+    def apply(self, sample, context=None):
+        """Randomly crop the sample with probability `prob`."""
+        if random.random() > self.prob:
+            return sample
+
+        if 'gt_bbox' not in sample:
+            # only used in semi-det as unsup data
+            sample = self.set_fake_bboxes(sample)
+            sample = self.random_crop(sample, fake_bboxes=True)
+            return sample
+
+        # nothing to anchor a crop on when there are no boxes
+        if len(sample['gt_bbox']) == 0:
+            return sample
+        return self.random_crop(sample)
+
+    def random_crop(self, sample, fake_bboxes=False):
+        """Search for a crop that satisfies a randomly chosen threshold.
+
+        Args:
+            sample (dict): must hold 'image' and 'gt_bbox'.
+            fake_bboxes (bool): when True only the image (and gt_poly/gt_segm,
+                if present) is cropped; the box annotations are left as they are.
+        Returns:
+            sample: cropped in place when a valid crop is found, otherwise
+                returned unchanged.
+        Raises:
+            ValueError: if self.ioumode is neither "iou" nor "iof".
+        """
+        h, w = sample['image'].shape[:2]
+        gt_bbox = sample['gt_bbox']
+
+        # NOTE Original method attempts to generate one candidate for each
+        # threshold then randomly sample one from the resulting list.
+        # Here a short circuit approach is taken, i.e., randomly choose a
+        # threshold and attempt to find a valid crop, and simply return the
+        # first one found.
+        # The probability is not exactly the same, kinda resembling the
+        # "Monty Hall" problem. Actually carrying out the attempts will affect
+        # observability (just like opening doors in the "Monty Hall" game).
+        thresholds = list(self.thresholds)
+        if self.allow_no_crop:
+            thresholds.append('no_crop')
+        np.random.shuffle(thresholds)
+
+        for thresh in thresholds:
+            if thresh == 'no_crop':
+                return sample
+
+            found = False
+            for _ in range(self.num_attempts):
+                scale = np.random.uniform(*self.scaling)
+                if self.aspect_ratio is not None:
+                    min_ar, max_ar = self.aspect_ratio
+                    # bound the aspect ratio so neither side exceeds the image
+                    aspect_ratio = np.random.uniform(
+                        max(min_ar, scale**2), min(max_ar, scale**-2))
+                    h_scale = scale / np.sqrt(aspect_ratio)
+                    w_scale = scale * np.sqrt(aspect_ratio)
+                else:
+                    h_scale = np.random.uniform(*self.scaling)
+                    w_scale = np.random.uniform(*self.scaling)
+                crop_h = h * h_scale
+                crop_w = w * w_scale
+                if self.aspect_ratio is None:
+                    if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0:
+                        continue
+
+                crop_h = int(crop_h)
+                crop_w = int(crop_w)
+                # a crop larger than the image cannot be placed
+                if crop_h > h or crop_w > w:
+                    continue
+                # randint needs high > low; a crop spanning the full
+                # height/width can only sit at offset 0
+                crop_y = np.random.randint(0, max(h - crop_h, 1))
+                crop_x = np.random.randint(0, max(w - crop_w, 1))
+                crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
+                crop_arr = np.array([crop_box], dtype=np.float32)
+                if self.ioumode == "iof":
+                    iou = self._gtcropiou_matrix(gt_bbox, crop_arr)
+                elif self.ioumode == "iou":
+                    iou = self._iou_matrix(gt_bbox, crop_arr)
+                else:
+                    raise ValueError(
+                        "ioumode must be 'iou' or 'iof', got {}".format(
+                            self.ioumode))
+                if iou.max() < thresh:
+                    continue
+
+                if self.cover_all_box and iou.min() < thresh:
+                    continue
+
+                cropped_box, valid_ids = self._crop_box_with_center_constraint(
+                    gt_bbox, np.array(
+                        crop_box, dtype=np.float32))
+                if valid_ids.size > 0:
+                    found = True
+                    break
+
+            if not found:
+                continue
+
+            if self.is_mask_crop and 'gt_poly' in sample and len(sample[
+                    'gt_poly']) > 0:
+                crop_polys = self.crop_segms(
+                    sample['gt_poly'],
+                    valid_ids,
+                    np.array(
+                        crop_box, dtype=np.int64),
+                    h,
+                    w)
+                if [] in crop_polys:
+                    # drop objects whose polygon vanished inside the crop
+                    delete_id = list()
+                    valid_polys = list()
+                    for pos, crop_poly in enumerate(crop_polys):
+                        if crop_poly == []:
+                            delete_id.append(pos)
+                        else:
+                            valid_polys.append(crop_poly)
+                    valid_ids = np.delete(valid_ids, delete_id)
+                    if len(valid_polys) == 0:
+                        return sample
+                    sample['gt_poly'] = valid_polys
+                else:
+                    sample['gt_poly'] = crop_polys
+
+            if 'gt_segm' in sample:
+                sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
+                                                    crop_box)
+                sample['gt_segm'] = np.take(
+                    sample['gt_segm'], valid_ids, axis=0)
+
+            sample['image'] = self._crop_image(sample['image'], crop_box)
+            if fake_bboxes:
+                return sample
+
+            sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
+            # per-object annotations are filtered down to the kept boxes
+            for key in ('gt_class', 'gt_score', 'is_crowd', 'difficult'):
+                if key == 'gt_class' or key in sample:
+                    sample[key] = np.take(sample[key], valid_ids, axis=0)
+
+            if 'gt_joints' in sample:
+                sample['gt_joints'] = self._crop_joints(sample['gt_joints'],
+                                                        crop_box)
+
+            return sample
+
+        return sample
+
+    def _iou_matrix(self, a, b):
+        """Pairwise intersection-over-union of boxes a (N, 4) and b (M, 4) -> (N, M)."""
+        tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+        br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+        # the mask zeroes the area of pairs that do not overlap
+        area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
+        area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+        area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+        area_o = (area_a[:, np.newaxis] + area_b - area_i)
+        return area_i / (area_o + 1e-10)
+
+    def _gtcropiou_matrix(self, a, b):
+        """Pairwise intersection over the area of a: boxes a (N, 4), b (M, 4) -> (N, M)."""
+        tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+        br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+        area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
+        area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+        # area_a needs a trailing axis: dividing (N, M) by (N,) would
+        # broadcast against the wrong axis and mix up the boxes
+        return area_i / (area_a[:, np.newaxis] + 1e-10)
+
+    def _crop_box_with_center_constraint(self, box, crop):
+        """Clip boxes to the crop and keep those whose center lies inside it.
+
+        Returns the clipped boxes in crop coordinates (for all inputs) and the
+        indices of the boxes that are kept.
+        """
+        cropped_box = box.copy()
+
+        cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
+        cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
+        cropped_box[:, :2] -= crop[:2]
+        cropped_box[:, 2:] -= crop[:2]
+
+        centers = (box[:, :2] + box[:, 2:]) / 2
+        valid = np.logical_and(crop[:2] <= centers,
+                               centers < crop[2:]).all(axis=1)
+        # also require a non-degenerate box after clipping
+        valid = np.logical_and(
+            valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
+
+        return cropped_box, np.where(valid)[0]
+
+    def _crop_image(self, img, crop):
+        """Slice the crop window [x1, y1, x2, y2] out of an HWC image."""
+        x1, y1, x2, y2 = crop
+        return img[y1:y2, x1:x2, :]
+
+    def _crop_segm(self, segm, crop):
+        """Slice the crop window [x1, y1, x2, y2] out of a stack of masks."""
+        x1, y1, x2, y2 = crop
+        return segm[:, y1:y2, x1:x2]
+
+    def _crop_joints(self, joints, crop):
+        """Zero joints outside the crop and shift the rest; modifies joints in place."""
+        x1, y1, x2, y2 = crop
+        joints[joints[..., 0] > x2, :] = 0
+        joints[joints[..., 1] > y2, :] = 0
+        joints[joints[..., 0] < x1, :] = 0
+        joints[joints[..., 1] < y1, :] = 0
+        joints[..., 0] -= x1
+        joints[..., 1] -= y1
+        return joints
+
+
+@register_op
+class RandomScaledCrop(BaseOperator):
+    """Resize image and bbox based on long side (with optional random scaling),
+       then crop or pad image to target size. Boxes whose remaining area is
+       at most 1 are dropped together with their per-box annotations.
+    Args:
+        target_size (int|list): target size, "hw" format.
+        scale_range (list): random scale range.
+        interp (int): interpolation method, default to `cv2.INTER_LINEAR`.
+        fill_value (float|list|tuple): color value used to fill the canvas,
+            in RGB order.
+    """
+
+    def __init__(self,
+                 target_size=512,
+                 scale_range=[.1, 2.],
+                 interp=cv2.INTER_LINEAR,
+                 fill_value=(123.675, 116.28, 103.53)):
+        super(RandomScaledCrop, self).__init__()
+        assert isinstance(target_size, (
+            Integral, Sequence)), "target_size must be Integer, List or Tuple"
+        if isinstance(target_size, Integral):
+            target_size = [target_size, ] * 2
+
+        self.target_size = target_size
+        self.scale_range = scale_range
+        self.interp = interp
+        assert isinstance(fill_value, (Number, Sequence)), \
+            "fill value must be either float or sequence"
+        if isinstance(fill_value, Number):
+            fill_value = (fill_value, ) * 3
+        self.fill_value = tuple(fill_value)
+
+    def apply_image(self, img, output_size, offset_x, offset_y):
+        """Resize img to output_size and paste its crop on a filled canvas."""
+        th, tw = self.target_size
+        rh, rw = output_size
+        img = cv2.resize(
+            img, (rw, rh), interpolation=self.interp).astype(np.float32)
+        canvas = np.ones([th, tw, 3], dtype=np.float32)
+        canvas *= np.array(self.fill_value, dtype=np.float32)
+        canvas[:min(th, rh), :min(tw, rw)] = \
+            img[offset_y:offset_y + th, offset_x:offset_x + tw]
+        return canvas
+
+    def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y):
+        """Scale, shift and clip boxes; return kept boxes, classes, indices."""
+        th, tw = self.target_size
+        shift_array = np.array([offset_x, offset_y] * 2, dtype=np.float32)
+        boxes = gt_bbox * scale - shift_array
+        boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw)
+        boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th)
+        # filter boxes with no area
+        area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1)
+        valid = (area > 1.).nonzero()[0]
+        return boxes[valid], gt_class[valid], valid
+
+    def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None):
+        """Resize and crop/pad each mask, optionally keeping only `valid`."""
+        th, tw = self.target_size
+        rh, rw = output_size
+        out_segms = []
+        for segm in segms:
+            segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST)
+            segm = segm.astype(np.float32)
+            canvas = np.zeros([th, tw], dtype=segm.dtype)
+            canvas[:min(th, rh), :min(tw, rw)] = \
+                segm[offset_y:offset_y + th, offset_x:offset_x + tw]
+            out_segms.append(canvas)
+        out_segms = np.stack(out_segms)
+        return out_segms if valid is None else out_segms[valid]
+
+    def apply(self, sample, context=None):
+        img = sample['image']
+        h, w = img.shape[:2]
+        random_scale = np.random.uniform(*self.scale_range)
+        target_scale_size = [t * random_scale for t in self.target_size]
+        # Compute actual rescaling applied to image.
+        scale = min(target_scale_size[0] / h, target_scale_size[1] / w)
+        output_size = [int(round(h * scale)), int(round(w * scale))]
+        # random top-left corner of the crop; 0 when the image fits the target
+        offset_x = int(
+            max(0, np.random.uniform(0., output_size[1] - self.target_size[1])))
+        offset_y = int(
+            max(0, np.random.uniform(0., output_size[0] - self.target_size[0])))
+
+        sample['image'] = self.apply_image(img, output_size, offset_x, offset_y)
+
+        valid = None
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            num_boxes = len(sample['gt_bbox'])
+            sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox(
+                sample['gt_bbox'], sample['gt_class'], scale, offset_x,
+                offset_y)
+            # keep the other per-box annotations aligned with the kept boxes
+            for key in ('gt_score', 'is_crowd', 'difficult', 'gt_ide'):
+                if key in sample and len(sample[key]) == num_boxes:
+                    sample[key] = np.asarray(sample[key])[valid]
+
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size,
+                                                offset_x, offset_y, valid)
+
+        sample['im_shape'] = np.asarray(output_size, dtype=np.float32)
+        scale_factor = sample['scale_factor']
+        sample['scale_factor'] = np.asarray(
+            [scale_factor[0] * scale, scale_factor[1] * scale],
+            dtype=np.float32)
+
+        return sample
+
+
+@register_op
+class Cutmix(BaseOperator):
+    def __init__(self, alpha=1.5, beta=1.5):
+        """
+        CutMix: Regularization Strategy to Train Strong Classifiers with
+        Localizable Features, see https://arxiv.org/abs/1905.04899
+        Paste a random rectangle of a second image into the first one and
+        merge their gt_bbox/gt_class/gt_score annotations.
+        Args:
+             alpha (float): alpha parameter of the beta distribution
+             beta (float): beta parameter of the beta distribution
+        Raises:
+             ValueError: if alpha or beta is not positive
+        """
+        super(Cutmix, self).__init__()
+        self.alpha = alpha
+        self.beta = beta
+        if self.alpha <= 0.0:
+            raise ValueError("alpha shold be positive in {}".format(self))
+        if self.beta <= 0.0:
+            raise ValueError("beta shold be positive in {}".format(self))
+
+    def apply_image(self, img1, img2, factor):
+        """Return img1 (zero-padded, float32) with a random box from img2.
+
+        Both images are placed on a canvas as large as the bigger one; the
+        box side lengths are sqrt(1 - factor) times the canvas sides, before
+        clipping to the canvas.
+        """
+        h = max(img1.shape[0], img2.shape[0])
+        w = max(img1.shape[1], img2.shape[1])
+        cut_rat = np.sqrt(1. - factor)
+
+        cut_w = np.int32(w * cut_rat)
+        cut_h = np.int32(h * cut_rat)
+
+        # box center drawn uniformly over the canvas
+        cx = np.random.randint(w)
+        cy = np.random.randint(h)
+
+        bbx1 = np.clip(cx - cut_w // 2, 0, w - 1)
+        bby1 = np.clip(cy - cut_h // 2, 0, h - 1)
+        bbx2 = np.clip(cx + cut_w // 2, 0, w - 1)
+        bby2 = np.clip(cy + cut_h // 2, 0, h - 1)
+
+        img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32')
+        img_1_pad[:img1.shape[0], :img1.shape[1], :] = \
+            img1.astype('float32')
+        img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32')
+        img_2_pad[:img2.shape[0], :img2.shape[1], :] = \
+            img2.astype('float32')
+        img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :]
+        return img_1_pad
+
+    def __call__(self, sample, context=None):
+        """Mix a pair of samples; a non-sequence input is returned as is."""
+        if not isinstance(sample, Sequence):
+            return sample
+
+        assert len(sample) == 2, 'cutmix need two samples'
+
+        factor = np.random.beta(self.alpha, self.beta)
+        factor = max(0.0, min(1.0, factor))
+        if factor >= 1.0:
+            return sample[0]
+        if factor <= 0.0:
+            return sample[1]
+        first, second = sample
+        result = copy.deepcopy(first)
+        result['image'] = self.apply_image(first['image'], second['image'],
+                                           factor)
+        result['gt_bbox'] = np.concatenate(
+            (first['gt_bbox'], second['gt_bbox']), axis=0)
+        # every box is weighted by the mixing share of its own sample; the
+        # float32 cast keeps the dtype in line with what Mixup emits
+        gt_score = np.concatenate(
+            (np.ones_like(first['gt_class']) * factor,
+             np.ones_like(second['gt_class']) * (1. - factor)),
+            axis=0)
+        result['gt_score'] = gt_score.astype('float32')
+        result['gt_class'] = np.concatenate(
+            (first['gt_class'], second['gt_class']), axis=0)
+        for key in ('is_crowd', 'difficult'):
+            if key in first:
+                result[key] = np.concatenate(
+                    (first[key], second[key]), axis=0)
+        return result
+
+
+@register_op
+class Mixup(BaseOperator):
+    def __init__(self, alpha=1.5, beta=1.5):
+        """ Blend two samples and merge their per-box annotations
+        Args:
+            alpha (float): first shape parameter of the beta distribution
+                the blending factor is drawn from
+            beta (float): second shape parameter of that distribution
+        Raises:
+            ValueError: if alpha or beta is not positive
+        """
+        super(Mixup, self).__init__()
+        self.alpha = alpha
+        self.beta = beta
+        if self.alpha <= 0.0:
+            raise ValueError("alpha shold be positive in {}".format(self))
+        if self.beta <= 0.0:
+            raise ValueError("beta shold be positive in {}".format(self))
+
+    def apply_image(self, img1, img2, factor):
+        """Weighted sum of both images on a shared zero canvas, as uint8.
+
+        img1 is weighted by `factor`, img2 by `1 - factor`; both are
+        anchored at the top-left corner of the canvas.
+        """
+        rows = max(img1.shape[0], img2.shape[0])
+        cols = max(img1.shape[1], img2.shape[1])
+        h1, w1 = img1.shape[:2]
+        h2, w2 = img2.shape[:2]
+        blended = np.zeros((rows, cols, img1.shape[2]), 'float32')
+        blended[:h1, :w1, :] = img1.astype('float32') * factor
+        blended[:h2, :w2, :] += img2.astype('float32') * (1.0 - factor)
+        return blended.astype('uint8')
+
+    def __call__(self, sample, context=None):
+        """Mix a pair of samples; a non-sequence input is returned as is.
+
+        The factor is drawn from Beta(alpha, beta); at the extremes 1 and 0
+        the first or the second sample is returned untouched.
+        """
+        if not isinstance(sample, Sequence):
+            return sample
+
+        assert len(sample) == 2, 'mixup need two samples'
+
+        factor = max(0.0, min(1.0, np.random.beta(self.alpha, self.beta)))
+        if factor >= 1.0:
+            return sample[0]
+        if factor <= 0.0:
+            return sample[1]
+
+        first, second = sample[0], sample[1]
+        mixed = self.apply_image(first['image'], second['image'], factor)
+        result = copy.deepcopy(first)
+        result['image'] = mixed
+
+        if 'gt_bbox' in first:
+            result['gt_bbox'] = np.concatenate(
+                (first['gt_bbox'], second['gt_bbox']), axis=0)
+        if 'gt_class' in first:
+            result['gt_class'] = np.concatenate(
+                (first['gt_class'], second['gt_class']), axis=0)
+            # each box is scored with the blending weight of its own image
+            weights = np.concatenate(
+                (np.ones_like(first['gt_class']) * factor,
+                 np.ones_like(second['gt_class']) * (1. - factor)),
+                axis=0)
+            result['gt_score'] = weights.astype('float32')
+
+        # the remaining per-box fields are stacked, first sample first
+        for key in ('is_crowd', 'difficult', 'gt_ide'):
+            if key in first:
+                result[key] = np.concatenate(
+                    (first[key], second[key]), axis=0)
+
+        return result
+
+
+@register_op
+class NormalizeBox(BaseOperator):
+    """Transform the bounding box's coordinates to [0,1].
+
+    x values of `gt_bbox` (and of `gt_keypoint`, when present) are divided
+    by the image width and y values by the image height. The arrays stored
+    in the sample are updated in place.
+    """
+
+    def __init__(self):
+        super(NormalizeBox, self).__init__()
+
+    def apply(self, sample, context=None):
+        height, width, _ = sample['image'].shape
+        gt_bbox = sample['gt_bbox']
+        if gt_bbox.shape[0] > 0:
+            # columns are x1, y1, x2, y2; plain assignment (not `/=`) keeps
+            # the array's own dtype, as the per-element version did
+            gt_bbox[:, 0:4:2] = gt_bbox[:, 0:4:2] / width
+            gt_bbox[:, 1:4:2] = gt_bbox[:, 1:4:2] / height
+        sample['gt_bbox'] = gt_bbox
+
+        if 'gt_keypoint' in sample.keys():
+            gt_keypoint = sample['gt_keypoint']
+            # keypoints are stored flat: even columns are x, odd columns y
+            gt_keypoint[:, 0::2] = gt_keypoint[:, 0::2] / width
+            gt_keypoint[:, 1::2] = gt_keypoint[:, 1::2] / height
+            sample['gt_keypoint'] = gt_keypoint
+        return sample
+
+
+@register_op
+class BboxXYXY2XYWH(BaseOperator):
+    """Rewrite XYXY boxes in place as [center_x, center_y, width, height]."""
+
+    def __init__(self):
+        super(BboxXYXY2XYWH, self).__init__()
+
+    def apply(self, sample, context=None):
+        assert 'gt_bbox' in sample
+        boxes = sample['gt_bbox']
+        corner, extent = boxes[:, :2], boxes[:, 2:4]  # views into `boxes`
+        # order matters: the center is derived from the already-computed size
+        extent[...] = extent - corner  # (x2, y2) becomes (w, h)
+        corner[...] = corner + extent / 2.  # (x1, y1) becomes the center
+        sample['gt_bbox'] = boxes
+        return sample
+
+
+@register_op
+class PadBox(BaseOperator):
+    def __init__(self, num_max_boxes=50):
+        """
+        Bring the per-box annotations to a fixed length of num_max_boxes:
+        shorter arrays are zero-padded, longer ones are truncated.
+        Args:
+            num_max_boxes (int): the max number of bboxes
+        """
+        self.num_max_boxes = num_max_boxes
+        super(PadBox, self).__init__()
+
+    def apply(self, sample, context=None):
+        """Pad/truncate gt_bbox and the per-box fields present in sample.
+
+        gt_bbox becomes a float32 array of shape (num_max_boxes, 4). Each
+        of gt_class, gt_score, difficult, is_crowd and gt_ide that exists
+        is replaced by a 1-D array of length num_max_boxes (float32 for
+        gt_score, int32 otherwise) filled from column 0 of the original.
+        """
+        assert 'gt_bbox' in sample
+        capacity = self.num_max_boxes
+        boxes = sample['gt_bbox']
+        kept = min(capacity, len(boxes))
+
+        # boxes beyond the capacity are silently dropped
+        padded_boxes = np.zeros((capacity, 4), dtype=np.float32)
+        if kept > 0:
+            padded_boxes[:kept, :] = boxes[:kept, :]
+        sample['gt_bbox'] = padded_boxes
+
+        # (sample key, dtype of its padded replacement)
+        flat_fields = (
+            ('gt_class', np.int32),
+            ('gt_score', np.float32),
+            ('difficult', np.int32),
+            ('is_crowd', np.int32),
+            ('gt_ide', np.int32), )
+
+        for key, dtype in flat_fields:
+            if key not in sample:
+                continue
+            padded = np.zeros((capacity, ), dtype=dtype)
+            if kept > 0:
+                # the row count comes from gt_bbox, so every field has to
+                # provide at least that many rows
+                padded[:kept] = sample[key][:kept, 0]
+            sample[key] = padded
+
+        return sample
+
+
+@register_op
+class DebugVisibleImage(BaseOperator):
+    """
+    In debug mode, visualize images according to `gt_box`.
+    (Currently only supported when not cropping and flipping image.)
+    The annotations stored in the sample are never modified.
+    """
+
+    def __init__(self, output_dir='output/debug', is_normalized=False):
+        super(DebugVisibleImage, self).__init__()
+        self.is_normalized = is_normalized
+        self.output_dir = output_dir
+        if not os.path.isdir(output_dir):
+            os.makedirs(output_dir)
+        if not isinstance(self.is_normalized, bool):
+            raise TypeError("{}: input type is invalid.".format(self))
+
+    def apply(self, sample, context=None):
+        image = Image.fromarray(sample['image'].astype(np.uint8))
+        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
+        width = sample['w']
+        height = sample['h']
+        gt_bbox = sample['gt_bbox']
+        gt_class = sample['gt_class']
+        draw = ImageDraw.Draw(image)
+        for i in range(gt_bbox.shape[0]):
+            xmin, ymin, xmax, ymax = gt_bbox[i]
+            if self.is_normalized:
+                # scale local values only; the sample's boxes stay untouched
+                xmin, xmax = xmin * width, xmax * width
+                ymin, ymax = ymin * height, ymax * height
+            draw.line(
+                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
+                 (xmin, ymin)],
+                width=2,
+                fill='green')
+            # draw label
+            text = str(gt_class[i][0])
+            if hasattr(draw, 'textbbox'):  # `textsize` is gone in Pillow 10
+                tw, th = draw.textbbox((0, 0), text)[2:]
+            else:
+                tw, th = draw.textsize(text)
+            draw.rectangle(
+                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
+            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
+
+        if 'gt_keypoint' in sample.keys():
+            # work on a copy so that rescaling does not leak into the sample
+            gt_keypoint = sample['gt_keypoint'].copy()
+            if self.is_normalized:
+                gt_keypoint[:, 0::2] = gt_keypoint[:, 0::2] * width
+                gt_keypoint[:, 1::2] = gt_keypoint[:, 1::2] * height
+            for keypoint in gt_keypoint:
+                for j in range(int(keypoint.shape[0] / 2)):
+                    # round() of a NumPy scalar may be an int without astype
+                    x1 = int(round(float(keypoint[2 * j])))
+                    y1 = int(round(float(keypoint[2 * j + 1])))
+                    draw.ellipse(
+                        (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
+        save_path = os.path.join(self.output_dir, out_file_name)
+        image.save(save_path, quality=95)
+        return sample
+
+
+@register_op
+class Pad(BaseOperator):
+    def __init__(self,
+                 size=None,
+                 size_divisor=32,
+                 pad_mode=0,
+                 offsets=None,
+                 fill_value=(127.5, 127.5, 127.5)):
+        """
+        Pad image to a specified size or multiple of size_divisor.
+        Args:
+            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
+            size_divisor (int): size divisor, default 32
+            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
+                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
+            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
+            fill_value (tuple): rgb value of pad area, default (127.5, 127.5, 127.5)
+        Raises:
+            TypeError: if size is neither None, an int nor a Sequence
+        """
+        super(Pad, self).__init__()
+        # None is allowed: `apply` then pads up to a size_divisor multiple
+        if size is not None and not isinstance(size, (int, Sequence)):
+            raise TypeError(
+                "Type of target_size is invalid when random_size is True. \
+                            Must be List, now is {}".format(type(size)))
+        if isinstance(size, int):
+            size = [size, size]
+
+        assert pad_mode in [-1, 0, 1, 2], \
+            'currently only supports four modes [-1, 0, 1, 2]'
+        if pad_mode == -1:
+            assert offsets, 'if pad_mode is -1, offsets should not be None'
+
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_mode = pad_mode
+        self.fill_value = fill_value
+        self.offsets = offsets
+
+    def apply_segm(self, segms, offsets, im_size, size):
+        """Shift polygon or RLE segmentations by `offsets` = [x, y]."""
+        def _expand_poly(poly, x, y):
+            expanded_poly = np.array(poly)
+            expanded_poly[0::2] += x
+            expanded_poly[1::2] += y
+            return expanded_poly.tolist()
+
+        def _expand_rle(rle, x, y, height, width, h, w):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            expanded_mask = np.full((h, w), 0).astype(mask.dtype)
+            expanded_mask[y:y + height, x:x + width] = mask
+            return mask_util.encode(
+                np.array(expanded_mask, order='F', dtype=np.uint8))
+
+        x, y = offsets
+        height, width = im_size
+        h, w = size
+        expanded_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                expanded_segms.append(
+                    [_expand_poly(poly, x, y) for poly in segm])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                expanded_segms.append(
+                    _expand_rle(segm, x, y, height, width, h, w))
+        return expanded_segms
+
+    def apply_bbox(self, bbox, offsets):
+        return bbox + np.array(offsets * 2, dtype=np.float32)  # [x, y, x, y]
+
+    def apply_keypoint(self, keypoints, offsets):
+        n = len(keypoints[0]) // 2  # number of (x, y) pairs per row
+        return keypoints + np.array(offsets * n, dtype=np.float32)
+
+    def apply_image(self, image, offsets, im_size, size):
+        """Paste `image` at `offsets` on a canvas filled with fill_value."""
+        x, y = offsets
+        im_h, im_w = im_size
+        h, w = size
+        canvas = np.ones((h, w, 3), dtype=np.float32)
+        canvas *= np.array(self.fill_value, dtype=np.float32)
+        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
+        return canvas
+
+    def apply(self, sample, context=None):
+        """Pad the image and, unless pad_mode is 0, shift its annotations."""
+        im = sample['image']
+        im_h, im_w = im.shape[:2]
+        if self.size:
+            h, w = self.size
+            assert im_h <= h and im_w <= w, \
+                '(h, w) of target size should be greater than (im_h, im_w)'
+        else:
+            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
+            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
+
+        if h == im_h and w == im_w:
+            sample['image'] = im.astype(np.float32)
+            return sample
+
+        if self.pad_mode == -1:
+            offset_x, offset_y = self.offsets
+        elif self.pad_mode == 0:
+            offset_y, offset_x = 0, 0
+        elif self.pad_mode == 1:
+            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
+        else:
+            offset_y, offset_x = h - im_h, w - im_w
+
+        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]
+
+        sample['image'] = self.apply_image(im, offsets, im_size, size)
+
+        if self.pad_mode == 0:
+            return sample
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets)
+
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets,
+                                                im_size, size)
+
+        if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
+            sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'],
+                                                        offsets)
+
+        return sample
+
+
+@register_op
+class Poly2Mask(BaseOperator):
+    """
+    Rasterize the `gt_poly` annotations of a sample into `gt_segm` masks.
+    Args:
+        del_poly (bool): Whether to delete poly after generating mask. Default: False.
+    """
+
+    def __init__(self, del_poly=False):
+        super(Poly2Mask, self).__init__()
+        import pycocotools.mask as maskUtils
+        self.maskutils = maskUtils
+        self.del_poly = del_poly
+
+    def _poly2mask(self, mask_ann, img_h, img_w):
+        """Decode one annotation (polygon list or RLE dict) into a mask."""
+        utils = self.maskutils
+        if isinstance(mask_ann, list):
+            # polygon -- the parts of one object are merged into one RLE
+            rle = utils.merge(utils.frPyObjects(mask_ann, img_h, img_w))
+        elif isinstance(mask_ann['counts'], list):
+            # uncompressed RLE
+            rle = utils.frPyObjects(mask_ann, img_h, img_w)
+        else:
+            # rle, used as it is
+            rle = mask_ann
+        return utils.decode(rle)
+
+    def apply(self, sample, context=None):
+        """Store the decoded masks as a uint8 array under `gt_segm`."""
+        assert 'gt_poly' in sample
+        # im_shape is unpacked as (height, width)
+        im_h, im_w = sample['im_shape']
+        masks = []
+        for gt_poly in sample['gt_poly']:
+            masks.append(self._poly2mask(gt_poly, im_h, im_w))
+        sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+        if self.del_poly:
+            del sample['gt_poly']
+
+        return sample
+
+
+@register_op
+class AugmentHSV(BaseOperator):
+    """
+    Randomly jitter an image in HSV space.
+    When `hgain` is given, random additive offsets bounded by the three
+    gains are applied to H, S and V; otherwise S and V are each multiplied
+    by a random factor taken from [1 - fraction, 1 + fraction).
+    Args:
+        fraction (float): the fraction for augment. Default: 0.5.
+        is_bgr (bool): whether the image is BGR mode. Default: True.
+        hgain (float): H channel gains
+        sgain (float): S channel gains
+        vgain (float): V channel gains
+    """
+
+    def __init__(self,
+                 fraction=0.50,
+                 is_bgr=True,
+                 hgain=None,
+                 sgain=None,
+                 vgain=None):
+        super(AugmentHSV, self).__init__()
+        self.fraction = fraction
+        self.is_bgr = is_bgr
+        self.hgain = hgain
+        self.sgain = sgain
+        self.vgain = vgain
+        # the additive mode is switched on by hgain alone
+        self.use_hsvgain = hgain is not None
+
+    def apply(self, sample, context=None):
+        """Jitter sample['image'] and store it back as float32.
+
+        NOTE(review): the cvtColor calls presumably expect an 8-bit image;
+        confirm against the ops that run before this one.
+        """
+        img = sample['image']
+        if self.is_bgr:
+            to_hsv, from_hsv = cv2.COLOR_BGR2HSV, cv2.COLOR_HSV2BGR
+        else:
+            to_hsv, from_hsv = cv2.COLOR_RGB2HSV, cv2.COLOR_HSV2RGB
+        img_hsv = cv2.cvtColor(img, to_hsv)
+
+        if self.use_hsvgain:
+            gains = [self.hgain, self.sgain, self.vgain]
+            hsv_augs = np.random.uniform(-1, 1, 3) * gains
+            # each channel's offset is randomly kept or zeroed
+            hsv_augs *= np.random.randint(0, 2, 3)
+            # hue wraps at 180, saturation and value are clipped to [0, 255]
+            img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
+            img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
+            img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
+        else:
+            for channel in (1, 2):  # S first, then V
+                plane = img_hsv[:, :, channel].astype(np.float32)
+                ratio = (random.random() * 2 - 1) * self.fraction + 1
+                plane *= ratio
+                if ratio > 1:
+                    # only an enlarging ratio can exceed the upper bound
+                    np.clip(plane, a_min=0, a_max=255, out=plane)
+                img_hsv[:, :, channel] = plane.astype(np.uint8)
+
+        # the converted result is written into the input array itself
+        cv2.cvtColor(img_hsv, from_hsv, dst=img)
+        sample['image'] = img.astype(np.float32)
+        return sample
+
+
+@register_op
+class Norm2PixelBbox(BaseOperator):
+    """Scale `gt_bbox` from the [0,1] range to pixel units, in place."""
+
+    def __init__(self):
+        super(Norm2PixelBbox, self).__init__()
+
+    def apply(self, sample, context=None):
+        assert 'gt_bbox' in sample
+        boxes = sample['gt_bbox']
+        img_h, img_w = sample['image'].shape[:2]
+        # strided views: writing through them updates `boxes` itself
+        xs, ys = boxes[:, 0::2], boxes[:, 1::2]
+        xs[...] = xs * img_w  # even columns hold x values
+        ys[...] = ys * img_h  # odd columns hold y values
+        sample['gt_bbox'] = boxes
+        return sample
+
+
+@register_op
+class BboxCXCYWH2XYXY(BaseOperator):
+    """
+    Turn center/size boxes into corner boxes; the input array is copied.
+    [center_x, center_y, width, height] -> [x0, y0, x1, y1]
+    """
+
+    def __init__(self):
+        super(BboxCXCYWH2XYXY, self).__init__()
+
+    def apply(self, sample, context=None):
+        assert 'gt_bbox' in sample
+        src = sample['gt_bbox']
+        center, half = src[:, :2], src[:, 2:4] / 2.
+        dst = src.copy()
+        dst[:, :2] = center - half
+        dst[:, 2:4] = center + half
+        sample['gt_bbox'] = dst
+        return sample
+
+
+@register_op
+class RandomResizeCrop(BaseOperator):
+    """Random resize and crop image and bboxes.
+    Args:
+        resizes (list): resize image to one of resizes. if keep_ratio is True and mode is
+        'long', resize the image's long side to the maximum of target_size, if keep_ratio is
+        True and mode is 'short', resize the image's short side to the minimum of target_size.
+        cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]
+        mode (str): resize mode, `long` or `short`. Details see resizes. 
+        prob (float): probability of this op.
+        keep_ratio (bool): whether keep_ratio or not, default true
+        interp (int): the interpolation method
+        thresholds (list): iou thresholds for decide a valid bbox crop.
+        num_attempts (int): number of tries before giving up.
+        allow_no_crop (bool): allow return without actually cropping them.
+        cover_all_box (bool): ensure all bboxes are covered in the final crop.
+        is_mask_crop(bool): whether crop the segmentation.
+    """
+
+    def __init__(self,
+                 resizes,
+                 cropsizes,
+                 prob=0.5,
+                 mode='short',
+                 keep_ratio=True,
+                 interp=cv2.INTER_LINEAR,
+                 num_attempts=3,
+                 cover_all_box=False,
+                 allow_no_crop=False,
+                 thresholds=[0.3, 0.5, 0.7],
+                 is_mask_crop=False,
+                 ioumode="iou"):
+        super(RandomResizeCrop, self).__init__()
+
+        self.resizes = resizes
+        self.cropsizes = cropsizes
+        self.prob = prob
+        self.mode = mode
+        self.ioumode = ioumode
+
+        self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp)
+        self.croper = RandomCrop(
+            num_attempts=num_attempts,
+            cover_all_box=cover_all_box,
+            thresholds=thresholds,
+            allow_no_crop=allow_no_crop,
+            is_mask_crop=is_mask_crop)
+
+    def _format_size(self, size):
+        if isinstance(size, Integral):
+            size = (size, size)
+        return size
+
+    def apply(self, sample, context=None):
+        if random.random() < self.prob:
+            _resize = self._format_size(random.choice(self.resizes))
+            _cropsize = self._format_size(random.choice(self.cropsizes))
+            sample = self._resize(
+                self.resizer,
+                sample,
+                size=_resize,
+                mode=self.mode,
+                context=context)
+            sample = self._random_crop(
+                self.croper, sample, size=_cropsize, context=context)
+        return sample
+
+    @staticmethod
+    def _random_crop(croper, sample, size, context=None):
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
+            return sample
+
+        self = croper
+        h, w = sample['image'].shape[:2]
+        gt_bbox = sample['gt_bbox']
+        cropsize = size
+        min_crop = min(cropsize)
+        max_crop = max(cropsize)
+
+        thresholds = list(self.thresholds)
+        np.random.shuffle(thresholds)
+
+        for thresh in thresholds:
+            found = False
+            for _ in range(self.num_attempts):
+
+                crop_h = random.randint(min_crop, min(h, max_crop))
+                crop_w = random.randint(min_crop, min(w, max_crop))
+
+                crop_y = random.randint(0, h - crop_h)
+                crop_x = random.randint(0, w - crop_w)
+
+                crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
+                if self.ioumode == "iof":
+                    iou = self._gtcropiou_matrix(
+                        gt_bbox, np.array(
+                            [crop_box], dtype=np.float32))
+                elif self.ioumode == "iou":
+                    iou = self._iou_matrix(
+                        gt_bbox, np.array(
+                            [crop_box], dtype=np.float32))
+                if iou.max() < thresh:
+                    continue
+
+                if self.cover_all_box and iou.min() < thresh:
+                    continue
+
+                cropped_box, valid_ids = self._crop_box_with_center_constraint(
+                    gt_bbox, np.array(
+                        crop_box, dtype=np.float32))
+                if valid_ids.size > 0:
+                    found = True
+                    break
+
+            if found:
+                if self.is_mask_crop and 'gt_poly' in sample and len(sample[
+                        'gt_poly']) > 0:
+                    crop_polys = self.crop_segms(
+                        sample['gt_poly'],
+                        valid_ids,
+                        np.array(
+                            crop_box, dtype=np.int64),
+                        h,
+                        w)
+                    if [] in crop_polys:
+                        delete_id = list()
+                        valid_polys = list()
+                        for id, crop_poly in enumerate(crop_polys):
+                            if crop_poly == []:
+                                delete_id.append(id)
+                            else:
+                                valid_polys.append(crop_poly)
+                        valid_ids = np.delete(valid_ids, delete_id)
+                        if len(valid_polys) == 0:
+                            return sample
+                        sample['gt_poly'] = valid_polys
+                    else:
+                        sample['gt_poly'] = crop_polys
+
+                if 'gt_segm' in sample:
+                    sample['gt_segm'] = self._crop_segm(sample['gt_segm'],
+                                                        crop_box)
+                    sample['gt_segm'] = np.take(
+                        sample['gt_segm'], valid_ids, axis=0)
+
+                sample['image'] = self._crop_image(sample['image'], crop_box)
+                sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
+                sample['gt_class'] = np.take(
+                    sample['gt_class'], valid_ids, axis=0)
+                if 'gt_score' in sample:
+                    sample['gt_score'] = np.take(
+                        sample['gt_score'], valid_ids, axis=0)
+
+                if 'is_crowd' in sample:
+                    sample['is_crowd'] = np.take(
+                        sample['is_crowd'], valid_ids, axis=0)
+
+                if 'gt_areas' in sample:
+                    sample['gt_areas'] = np.take(
+                        sample['gt_areas'], valid_ids, axis=0)
+
+                if 'gt_joints' in sample:
+                    gt_joints = self._crop_joints(sample['gt_joints'], crop_box)
+                    sample['gt_joints'] = gt_joints[valid_ids]
+                return sample
+
+        return sample
+
+    @staticmethod
+    def _resize(resizer, sample, size, mode='short', context=None):
+        self = resizer
+        im = sample['image']
+        target_size = size
+
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError('{}: image is not 3-dimensional.'.format(self))
+
+        # apply image
+        im_shape = im.shape
+        if self.keep_ratio:
+
+            im_size_min = np.min(im_shape[0:2])
+            im_size_max = np.max(im_shape[0:2])
+
+            target_size_min = np.min(target_size)
+            target_size_max = np.max(target_size)
+
+            if mode == 'long':
+                im_scale = min(target_size_min / im_size_min,
+                               target_size_max / im_size_max)
+            else:
+                im_scale = max(target_size_min / im_size_min,
+                               target_size_max / im_size_max)
+
+            resize_h = int(im_scale * float(im_shape[0]) + 0.5)
+            resize_w = int(im_scale * float(im_shape[1]) + 0.5)
+
+            im_scale_x = im_scale
+            im_scale_y = im_scale
+        else:
+            resize_h, resize_w = target_size
+            im_scale_y = resize_h / im_shape[0]
+            im_scale_x = resize_w / im_shape[1]
+
+        im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
+        sample['image'] = im
+        sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
+        if 'scale_factor' in sample:
+            scale_factor = sample['scale_factor']
+            sample['scale_factor'] = np.asarray(
+                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
+                dtype=np.float32)
+        else:
+            sample['scale_factor'] = np.asarray(
+                [im_scale_y, im_scale_x], dtype=np.float32)
+
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'],
+                                                [im_scale_x, im_scale_y],
+                                                [resize_w, resize_h])
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2],
+                                                [im_scale_x, im_scale_y])
+
+        # apply semantic
+        if 'semantic' in sample and sample['semantic']:
+            semantic = sample['semantic']
+            semantic = cv2.resize(
+                semantic.astype('float32'),
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            semantic = np.asarray(semantic).astype('int32')
+            semantic = np.expand_dims(semantic, 0)
+            sample['semantic'] = semantic
+
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            masks = [
+                cv2.resize(
+                    gt_segm,
+                    None,
+                    None,
+                    fx=im_scale_x,
+                    fy=im_scale_y,
+                    interpolation=cv2.INTER_NEAREST)
+                for gt_segm in sample['gt_segm']
+            ]
+            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+
+        if 'gt_joints' in sample:
+            sample['gt_joints'] = self.apply_joints(sample['gt_joints'],
+                                                    [im_scale_x, im_scale_y],
+                                                    [resize_w, resize_h])
+
+        return sample
+
+
+@register_op
+class RandomSelect(BaseOperator):
+    """
+    Randomly choose a transformation between transforms1 and transforms2,
+    and the probability of choosing transforms1 is p.
+
+    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
+
+    """
+
+    def __init__(self, transforms1, transforms2, p=0.5):
+        super(RandomSelect, self).__init__()
+        self.transforms1 = Compose(transforms1)
+        self.transforms2 = Compose(transforms2)
+        self.p = p
+
+    def apply(self, sample, context=None):
+        if random.random() < self.p:
+            return self.transforms1(sample)
+        return self.transforms2(sample)
+
+
+@register_op
+class RandomShortSideResize(BaseOperator):
+    def __init__(self,
+                 short_side_sizes,
+                 max_size=None,
+                 interp=cv2.INTER_LINEAR,
+                 random_interp=False):
+        """
+        Resize the image randomly according to the short side. If max_size is not None,
+        the long side is scaled according to max_size. The whole process will be keep ratio.
+        Args:
+            short_side_sizes (list|tuple): Image target short side size.
+            max_size (int): The size of the longest side of image after resize.
+            interp (int): The interpolation method.
+            random_interp (bool): Whether random select interpolation method.
+        """
+        super(RandomShortSideResize, self).__init__()
+
+        assert isinstance(short_side_sizes,
+                          Sequence), "short_side_sizes must be List or Tuple"
+
+        self.short_side_sizes = short_side_sizes
+        self.max_size = max_size
+        self.interp = interp
+        self.random_interp = random_interp
+        self.interps = [
+            cv2.INTER_NEAREST,
+            cv2.INTER_LINEAR,
+            cv2.INTER_AREA,
+            cv2.INTER_CUBIC,
+            cv2.INTER_LANCZOS4,
+        ]
+
+    def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
+        h, w = image_shape
+        max_clip = False
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:
+                size = int(max_size * min_original_size / max_original_size)
+                max_clip = True
+
+        if (w <= h and w == size) or (h <= w and h == size):
+            return (w, h)
+
+        if w < h:
+            ow = size
+            oh = int(round(size * h / w)) if not max_clip else max_size
+        else:
+            oh = size
+            ow = int(round(size * w / h)) if not max_clip else max_size
+
+        return (ow, oh)
+
+    def resize(self,
+               sample,
+               target_size,
+               max_size=None,
+               interp=cv2.INTER_LINEAR):
+        im = sample['image']
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if len(im.shape) != 3:
+            raise ImageError('{}: image is not 3-dimensional.'.format(self))
+
+        target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size,
+                                                      max_size)
+        im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[
+            0] / im.shape[1]
+
+        sample['image'] = cv2.resize(im, target_size, interpolation=interp)
+        sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32)
+        if 'scale_factor' in sample:
+            scale_factor = sample['scale_factor']
+            sample['scale_factor'] = np.asarray(
+                [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
+                dtype=np.float32)
+        else:
+            sample['scale_factor'] = np.asarray(
+                [im_scale_y, im_scale_x], dtype=np.float32)
+
+        # apply bbox
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            sample['gt_bbox'] = self.apply_bbox(
+                sample['gt_bbox'], [im_scale_x, im_scale_y], target_size)
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2],
+                                                [im_scale_x, im_scale_y])
+        # apply semantic
+        if 'semantic' in sample and sample['semantic']:
+            semantic = sample['semantic']
+            semantic = cv2.resize(
+                semantic.astype('float32'),
+                target_size,
+                interpolation=self.interp)
+            semantic = np.asarray(semantic).astype('int32')
+            semantic = np.expand_dims(semantic, 0)
+            sample['semantic'] = semantic
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            masks = [
+                cv2.resize(
+                    gt_segm, target_size, interpolation=cv2.INTER_NEAREST)
+                for gt_segm in sample['gt_segm']
+            ]
+            sample['gt_segm'] = np.asarray(masks).astype(np.uint8)
+
+        if 'gt_joints' in sample:
+            sample['gt_joints'] = self.apply_joints(
+                sample['gt_joints'], [im_scale_x, im_scale_y], target_size)
+
+        # apply areas
+        if 'gt_areas' in sample:
+            sample['gt_areas'] = self.apply_area(sample['gt_areas'],
+                                                 [im_scale_x, im_scale_y])
+
+        return sample
+
+    def apply_bbox(self, bbox, scale, size):
+        im_scale_x, im_scale_y = scale
+        resize_w, resize_h = size
+        bbox[:, 0::2] *= im_scale_x
+        bbox[:, 1::2] *= im_scale_y
+        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
+        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
+        return bbox.astype('float32')
+
+    def apply_joints(self, joints, scale, size):
+        im_scale_x, im_scale_y = scale
+        resize_w, resize_h = size
+        joints[..., 0] *= im_scale_x
+        joints[..., 1] *= im_scale_y
+        # joints[joints[..., 0] >= resize_w, :] = 0
+        # joints[joints[..., 1] >= resize_h, :] = 0
+        # joints[joints[..., 0] < 0, :] = 0
+        # joints[joints[..., 1] < 0, :] = 0
+        joints[..., 0] = np.clip(joints[..., 0], 0, resize_w)
+        joints[..., 1] = np.clip(joints[..., 1], 0, resize_h)
+        return joints
+
+    def apply_area(self, area, scale):
+        im_scale_x, im_scale_y = scale
+        return area * im_scale_x * im_scale_y
+
+    def apply_segm(self, segms, im_size, scale):
+        def _resize_poly(poly, im_scale_x, im_scale_y):
+            resized_poly = np.array(poly).astype('float32')
+            resized_poly[0::2] *= im_scale_x
+            resized_poly[1::2] *= im_scale_y
+            return resized_poly.tolist()
+
+        def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, im_h, im_w)
+
+            mask = mask_util.decode(rle)
+            mask = cv2.resize(
+                mask,
+                None,
+                None,
+                fx=im_scale_x,
+                fy=im_scale_y,
+                interpolation=self.interp)
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        im_h, im_w = im_size
+        im_scale_x, im_scale_y = scale
+        resized_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                # Polygon format
+                resized_segms.append([
+                    _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm
+                ])
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                resized_segms.append(
+                    _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y))
+
+        return resized_segms
+
+    def apply(self, sample, context=None):
+        target_size = random.choice(self.short_side_sizes)
+        interp = random.choice(
+            self.interps) if self.random_interp else self.interp
+
+        return self.resize(sample, target_size, self.max_size, interp)
+
+
+@register_op
+class RandomShortSideRangeResize(RandomShortSideResize):
+    def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False):
+        """
+        Resize the image randomly according to the short side. If max_size is not None,
+        the long side is scaled according to max_size. The whole process will be keep ratio.
+        Args:
+            short_side_sizes (list|tuple): Image target short side size.
+            interp (int): The interpolation method.
+            random_interp (bool): Whether random select interpolation method.
+        """
+        super(RandomShortSideRangeResize, self).__init__(scales, None, interp,
+                                                         random_interp)
+
+        assert isinstance(scales,
+                          Sequence), "short_side_sizes must be List or Tuple"
+
+        self.scales = scales
+
+    def random_sample(self, img_scales):
+        img_scale_long = [max(s) for s in img_scales]
+        img_scale_short = [min(s) for s in img_scales]
+        long_edge = np.random.randint(
+            min(img_scale_long), max(img_scale_long) + 1)
+        short_edge = np.random.randint(
+            min(img_scale_short), max(img_scale_short) + 1)
+        img_scale = (long_edge, short_edge)
+        return img_scale
+
+    def apply(self, sample, context=None):
+        long_edge, short_edge = self.random_sample(self.short_side_sizes)
+        # print("target size:{}".format((long_edge, short_edge)))
+        interp = random.choice(
+            self.interps) if self.random_interp else self.interp
+
+        return self.resize(sample, short_edge, long_edge, interp)
+
+
+@register_op
+class RandomSizeCrop(BaseOperator):
+    """
+    Cut the image randomly according to `min_size` and `max_size`
+    Args:
+        min_size (int): Min size for edges of cropped image.
+        max_size (int): Max size for edges of cropped image. If it
+                        is set to larger than length of the input image,
+                        the output will keep the origin length.
+        keep_empty (bool): Whether to keep the cropped result with no object.
+                           If it is set to False, the no-object result will not
+                           be returned, replaced by the original input.
+    """
+
+    def __init__(self, min_size, max_size, keep_empty=True):
+        super(RandomSizeCrop, self).__init__()
+        self.min_size = min_size
+        self.max_size = max_size
+        self.keep_empty = keep_empty
+
+        from paddle.vision.transforms.functional import crop as paddle_crop
+        self.paddle_crop = paddle_crop
+
+    @staticmethod
+    def get_crop_params(img_shape, output_size):
+        """Get parameters for ``crop`` for a random crop.
+        Args:
+            img_shape (list|tuple): Image's height and width.
+            output_size (list|tuple): Expected output size of the crop.
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
+        """
+        h, w = img_shape
+        th, tw = output_size
+
+        if h + 1 < th or w + 1 < tw:
+            raise ValueError(
+                "Required crop size {} is larger then input image size {}".
+                format((th, tw), (h, w)))
+
+        if w == tw and h == th:
+            return 0, 0, h, w
+
+        i = random.randint(0, h - th + 1)
+        j = random.randint(0, w - tw + 1)
+        return i, j, th, tw
+
+    def crop(self, sample, region):
+        keep_index = None
+        # apply bbox and check whether the cropped result is valid
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
+            croped_bbox = self.apply_bbox(sample['gt_bbox'], region)
+            bbox = croped_bbox.reshape([-1, 2, 2])
+            area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1)
+            keep_index = np.where(area > 0)[0]
+
+            if not self.keep_empty and len(keep_index) == 0:
+                # When keep_empty is set to False, cropped with no-object will
+                # not be used and return the origin content.
+                return sample
+
+            sample['gt_bbox'] = croped_bbox[keep_index] if len(
+                keep_index) > 0 else np.zeros(
+                    [0, 4], dtype=np.float32)
+            sample['gt_class'] = sample['gt_class'][keep_index] if len(
+                keep_index) > 0 else np.zeros(
+                    [0, 1], dtype=np.float32)
+            if 'gt_score' in sample:
+                sample['gt_score'] = sample['gt_score'][keep_index] if len(
+                    keep_index) > 0 else np.zeros(
+                        [0, 1], dtype=np.float32)
+            if 'is_crowd' in sample:
+                sample['is_crowd'] = sample['is_crowd'][keep_index] if len(
+                    keep_index) > 0 else np.zeros(
+                        [0, 1], dtype=np.float32)
+            if 'gt_areas' in sample:
+                sample['gt_areas'] = np.take(
+                    sample['gt_areas'], keep_index, axis=0)
+
+        image_shape = sample['image'].shape[:2]
+        sample['image'] = self.paddle_crop(sample['image'], *region)
+        sample['im_shape'] = np.array(
+            sample['image'].shape[:2], dtype=np.float32)
+
+        # apply polygon
+        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
+            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region,
+                                                image_shape)
+            sample['gt_poly'] = np.array(sample['gt_poly'])
+            if keep_index is not None and len(keep_index) > 0:
+                sample['gt_poly'] = sample['gt_poly'][keep_index]
+            sample['gt_poly'] = sample['gt_poly'].tolist()
+        # apply gt_segm
+        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
+            i, j, h, w = region
+            sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w]
+            if keep_index is not None and len(keep_index) > 0:
+                sample['gt_segm'] = sample['gt_segm'][keep_index]
+
+        if 'gt_joints' in sample:
+            gt_joints = self._crop_joints(sample['gt_joints'], region)
+            sample['gt_joints'] = gt_joints
+            if keep_index is not None:
+                sample['gt_joints'] = sample['gt_joints'][keep_index]
+
+        return sample
+
+    def apply_bbox(self, bbox, region):
+        i, j, h, w = region
+        region_size = np.asarray([w, h])
+        crop_bbox = bbox - np.asarray([j, i, j, i])
+        crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size)
+        crop_bbox = crop_bbox.clip(min=0)
+        return crop_bbox.reshape([-1, 4]).astype('float32')
+
+    def _crop_joints(self, joints, region):
+        y1, x1, h, w = region
+        x2 = x1 + w
+        y2 = y1 + h
+        # x1, y1, x2, y2 = crop
+        joints[..., 0] -= x1
+        joints[..., 1] -= y1
+        joints[joints[..., 0] > w, :] = 0
+        joints[joints[..., 1] > h, :] = 0
+        joints[joints[..., 0] < 0, :] = 0
+        joints[joints[..., 1] < 0, :] = 0
+        return joints
+
+    def apply_segm(self, segms, region, image_shape):
+        def _crop_poly(segm, crop):
+            xmin, ymin, xmax, ymax = crop
+            crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin]
+            crop_p = np.array(crop_coord).reshape(4, 2)
+            crop_p = Polygon(crop_p)
+
+            crop_segm = list()
+            for poly in segm:
+                poly = np.array(poly).reshape(len(poly) // 2, 2)
+                polygon = Polygon(poly)
+                if not polygon.is_valid:
+                    exterior = polygon.exterior
+                    multi_lines = exterior.intersection(exterior)
+                    polygons = shapely.ops.polygonize(multi_lines)
+                    polygon = MultiPolygon(polygons)
+                multi_polygon = list()
+                if isinstance(polygon, MultiPolygon):
+                    multi_polygon = copy.deepcopy(polygon)
+                else:
+                    multi_polygon.append(copy.deepcopy(polygon))
+                for per_polygon in multi_polygon:
+                    inter = per_polygon.intersection(crop_p)
+                    if not inter:
+                        continue
+                    if isinstance(inter, (MultiPolygon, GeometryCollection)):
+                        for part in inter:
+                            if not isinstance(part, Polygon):
+                                continue
+                            part = np.squeeze(
+                                np.array(part.exterior.coords[:-1]).reshape(1,
+                                                                            -1))
+                            part[0::2] -= xmin
+                            part[1::2] -= ymin
+                            crop_segm.append(part.tolist())
+                    elif isinstance(inter, Polygon):
+                        crop_poly = np.squeeze(
+                            np.array(inter.exterior.coords[:-1]).reshape(1, -1))
+                        crop_poly[0::2] -= xmin
+                        crop_poly[1::2] -= ymin
+                        crop_segm.append(crop_poly.tolist())
+                    else:
+                        continue
+            return crop_segm
+
+        def _crop_rle(rle, crop, height, width):
+            if 'counts' in rle and type(rle['counts']) == list:
+                rle = mask_util.frPyObjects(rle, height, width)
+            mask = mask_util.decode(rle)
+            mask = mask[crop[1]:crop[3], crop[0]:crop[2]]
+            rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
+            return rle
+
+        i, j, h, w = region
+        crop = [j, i, j + w, i + h]
+        height, width = image_shape
+        crop_segms = []
+        for segm in segms:
+            if is_poly(segm):
+                import copy
+                import shapely.ops
+                from shapely.geometry import Polygon, MultiPolygon, GeometryCollection
+                # Polygon format
+                crop_segms.append(_crop_poly(segm, crop))
+            else:
+                # RLE format
+                import pycocotools.mask as mask_util
+                crop_segms.append(_crop_rle(segm, crop, height, width))
+        return crop_segms
+
+    def apply(self, sample, context=None):
+        h = random.randint(self.min_size,
+                           min(sample['image'].shape[0], self.max_size))
+        w = random.randint(self.min_size,
+                           min(sample['image'].shape[1], self.max_size))
+
+        region = self.get_crop_params(sample['image'].shape[:2], [h, w])
+        return self.crop(sample, region)
+
+
+@register_op
+class CenterRandColor(BaseOperator):
+    """Random color for CenterNet series models.
+    Args:
+        saturation (float): saturation settings.
+        contrast (float): contrast settings.
+        brightness (float): brightness settings.
+    """
+
+    def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4):
+        super(CenterRandColor, self).__init__()
+        self.saturation = saturation
+        self.contrast = contrast
+        self.brightness = brightness
+
+    def apply_saturation(self, img, img_gray):
+        alpha = 1. + np.random.uniform(
+            low=-self.saturation, high=self.saturation)
+        self._blend(alpha, img, img_gray[:, :, None])
+        return img
+
+    def apply_contrast(self, img, img_gray):
+        alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast)
+        img_mean = img_gray.mean()
+        self._blend(alpha, img, img_mean)
+        return img
+
+    def apply_brightness(self, img, img_gray):
+        alpha = 1 + np.random.uniform(
+            low=-self.brightness, high=self.brightness)
+        img *= alpha
+        return img
+
+    def _blend(self, alpha, img, img_mean):
+        img *= alpha
+        img_mean *= (1 - alpha)
+        img += img_mean
+
+    def apply(self, sample, context=None):
+        functions = [
+            self.apply_brightness,
+            self.apply_contrast,
+            self.apply_saturation,
+        ]
+
+        img = sample['image']
+        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        distortions = np.random.permutation(functions)
+        for func in distortions:
+            img = func(img, img_gray)
+        sample['image'] = img
+
+        if 'pre_image' in sample:
+            pre_img = sample['pre_image']
+            pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY)
+            pre_distortions = np.random.permutation(functions)
+            for func in pre_distortions:
+                pre_img = func(pre_img, pre_img_gray)
+            sample['pre_image'] = pre_img
+
+        return sample
+
+
+@register_op
+class Mosaic(BaseOperator):
+    """ Mosaic operator for image and gt_bboxes
+    The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py
+
+    1. get mosaic coords
+    2. clip bbox and get mosaic_labels
+    3. random_affine augment
+    4. Mixup augment as copypaste (optinal), not used in tiny/nano
+
+    Args:
+        prob (float): probability of using Mosaic, 1.0 as default
+        input_dim (list[int]): input shape
+        degrees (list[2]): the rotate range to apply, transform range is [min, max]
+        translate (list[2]): the translate range to apply, transform range is [min, max]
+        scale (list[2]): the scale range to apply, transform range is [min, max]
+        shear (list[2]): the shear range to apply, transform range is [min, max]
+        enable_mixup (bool): whether to enable Mixup or not
+        mixup_prob (float): probability of using Mixup, 1.0 as default
+        mixup_scale (list[int]): scale range of Mixup
+        remove_outside_box (bool): whether remove outside boxes, False as
+            default in COCO dataset, True in MOT dataset
+    """
+
+    def __init__(self,
+                 prob=1.0,
+                 input_dim=[640, 640],
+                 degrees=[-10, 10],
+                 translate=[-0.1, 0.1],
+                 scale=[0.1, 2],
+                 shear=[-2, 2],
+                 enable_mixup=True,
+                 mixup_prob=1.0,
+                 mixup_scale=[0.5, 1.5],
+                 remove_outside_box=False):
+        super(Mosaic, self).__init__()
+        self.prob = prob
+        if isinstance(input_dim, Integral):
+            input_dim = [input_dim, input_dim]
+        self.input_dim = input_dim
+        self.degrees = degrees
+        self.translate = translate
+        self.scale = scale
+        self.shear = shear
+        self.enable_mixup = enable_mixup
+        self.mixup_prob = mixup_prob
+        self.mixup_scale = mixup_scale
+        self.remove_outside_box = remove_outside_box
+
+    def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):
+        # (x1, y1, x2, y2) means coords in large image,
+        # small_coords means coords in small image in mosaic aug.
+        if mosaic_idx == 0:
+            # top left
+            x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
+            small_coords = w - (x2 - x1), h - (y2 - y1), w, h
+        elif mosaic_idx == 1:
+            # top right
+            x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
+            small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h
+        elif mosaic_idx == 2:
+            # bottom left
+            x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
+            small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h)
+        elif mosaic_idx == 3:
+            # bottom right
+            x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2,
+                                                                   yc + h)
+            small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
+
+        return (x1, y1, x2, y2), small_coords
+
+    def random_affine_augment(self,
+                              img,
+                              labels=[],
+                              input_dim=[640, 640],
+                              degrees=[-10, 10],
+                              scales=[0.1, 2],
+                              shears=[-2, 2],
+                              translates=[-0.1, 0.1]):
+        """Warp img with one random rotate/scale/shear/translate; labels[:, :4] (xyxy) is rewritten in place."""
+        degree = random.uniform(degrees[0], degrees[1])
+        scale = random.uniform(scales[0], scales[1])
+        assert scale > 0, "Argument scale should be positive."
+        R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale)
+        M = np.ones([2, 3])  # placeholder values; both rows are assigned below
+
+        # random shear: one angle (in degrees) is drawn and used for both axes
+        shear = random.uniform(shears[0], shears[1])
+        shear_x = math.tan(shear * math.pi / 180)
+        shear_y = math.tan(shear * math.pi / 180)
+        M[0] = R[0] + shear_y * R[1]
+        M[1] = R[1] + shear_x * R[0]
+
+        # random translation: one fraction is drawn, scaled by input_dim[0] for x and input_dim[1] for y
+        translate = random.uniform(translates[0], translates[1])
+        translation_x = translate * input_dim[0]
+        translation_y = translate * input_dim[1]
+        M[0, 2] = translation_x
+        M[1, 2] = translation_y
+        # warpAffine onto a canvas of size dsize; border pixels are filled with gray (114).
+        # NOTE(review): OpenCV reads dsize as (width, height) while __call__ unpacks input_dim as (h, w) - confirm for non-square sizes.
+        img = cv2.warpAffine(
+            img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114))
+
+        num_gts = len(labels)
+        if num_gts > 0:
+            # warp corner points: 4 corners per box in homogeneous form (x, y, 1)
+            corner_points = np.ones((4 * num_gts, 3))
+            corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
+                4 * num_gts, 2)  # x1y1, x2y2, x1y2, x2y1
+            # apply affine transform
+            corner_points = corner_points @M.T
+            corner_points = corner_points.reshape(num_gts, 8)
+
+            # create new boxes: axis-aligned hull of the 4 warped corners
+            corner_xs = corner_points[:, 0::2]
+            corner_ys = corner_points[:, 1::2]
+            new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1),
+                                         corner_xs.max(1), corner_ys.max(1)))
+            new_bboxes = new_bboxes.reshape(4, num_gts).T
+
+            # clip boxes: x to [0, input_dim[0]], y to [0, input_dim[1]]
+            new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0])
+            new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1])
+            labels[:, :4] = new_bboxes
+
+        return img, labels
+
+    def __call__(self, sample, context=None):  # build one mosaic (plus optional mixup) sample from 5 input samples
+        if not isinstance(sample, Sequence):
+            return sample  # a single sample is passed through untouched
+
+        assert len(
+            sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup."
+        if np.random.uniform(0., 1.) > self.prob:
+            return sample[0]  # augmentation skipped: only the first sample is kept
+
+        mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], []
+        input_h, input_w = self.input_dim
+        yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))  # (xc, yc) is the point where the 4 tiles meet
+        xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))
+        mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)
+
+        # 1. resize each of the first 4 images to fit input_h x input_w and paste it into its tile
+        for mosaic_idx, sp in enumerate(sample[:4]):
+            img = sp['image']
+            gt_bbox = sp['gt_bbox']
+            h0, w0 = img.shape[:2]
+            scale = min(1. * input_h / h0, 1. * input_w / w0)
+            img = cv2.resize(
+                img, (int(w0 * scale), int(h0 * scale)),
+                interpolation=cv2.INTER_LINEAR)
+            (h, w, c) = img.shape[:3]
+
+            # suffix l means large image, while s means small image in mosaic aug.
+            (l_x1, l_y1, l_x2, l_y2), (
+                s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords(
+                    mosaic_idx, xc, yc, w, h, input_h, input_w)
+
+            mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
+            padw, padh = l_x1 - s_x1, l_y1 - s_y1
+
+            # rescale the boxes and shift them by the paste offset (columns are used as x1, y1, x2, y2)
+            _gt_bbox = gt_bbox.copy()
+            if len(gt_bbox) > 0:
+                _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw
+                _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh
+                _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw
+                _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh
+
+            mosaic_gt_bbox.append(_gt_bbox)
+            mosaic_gt_class.append(sp['gt_class'])
+            if 'is_crowd' in sp:
+                mosaic_is_crowd.append(sp['is_crowd'])
+            if 'difficult' in sp:
+                mosaic_difficult.append(sp['difficult'])
+
+        # 2. clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd or difficult]); is_crowd wins if both exist
+        if len(mosaic_gt_bbox):
+            mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0)
+            mosaic_gt_class = np.concatenate(mosaic_gt_class, 0)
+            if mosaic_is_crowd:
+                mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0)
+                mosaic_labels = np.concatenate([
+                    mosaic_gt_bbox,
+                    mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
+                    mosaic_is_crowd.astype(mosaic_gt_bbox.dtype)
+                ], 1)
+            elif mosaic_difficult:
+                mosaic_difficult = np.concatenate(mosaic_difficult, 0)
+                mosaic_labels = np.concatenate([
+                    mosaic_gt_bbox,
+                    mosaic_gt_class.astype(mosaic_gt_bbox.dtype),
+                    mosaic_difficult.astype(mosaic_gt_bbox.dtype)
+                ], 1)
+            else:
+                mosaic_labels = np.concatenate([
+                    mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype)
+                ], 1)
+            if self.remove_outside_box:
+                # for MOT dataset: keep only boxes that overlap the 2x canvas, without clipping them
+                flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w
+                flag2 = mosaic_gt_bbox[:, 2] > 0
+                flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h
+                flag4 = mosaic_gt_bbox[:, 3] > 0
+                flag_all = flag1 * flag2 * flag3 * flag4
+                mosaic_labels = mosaic_labels[flag_all]
+            else:
+                mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0,
+                                              2 * input_w)
+                mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0,
+                                              2 * input_h)
+                mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0,
+                                              2 * input_w)
+                mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0,
+                                              2 * input_h)
+        else:  # not reached in practice: the loop above always appends 4 entries
+            mosaic_labels = np.zeros((1, 6))
+
+        # 3. random_affine augment
+        mosaic_img, mosaic_labels = self.random_affine_augment(
+            mosaic_img,
+            mosaic_labels,
+            input_dim=self.input_dim,
+            degrees=self.degrees,
+            translates=self.translate,
+            scales=self.scale,
+            shears=self.shear)
+
+        # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177
+        # optional, not used(enable_mixup=False) in tiny/nano
+        if (self.enable_mixup and not len(mosaic_labels) == 0 and
+                random.random() < self.mixup_prob):
+            sample_mixup = sample[4]
+            mixup_img = sample_mixup['image']
+            if 'is_crowd' in sample_mixup:
+                cp_labels = np.concatenate([
+                    sample_mixup['gt_bbox'],
+                    sample_mixup['gt_class'].astype(mosaic_labels.dtype),
+                    sample_mixup['is_crowd'].astype(mosaic_labels.dtype)
+                ], 1)
+            elif 'difficult' in sample_mixup:
+                cp_labels = np.concatenate([
+                    sample_mixup['gt_bbox'],
+                    sample_mixup['gt_class'].astype(mosaic_labels.dtype),
+                    sample_mixup['difficult'].astype(mosaic_labels.dtype)
+                ], 1)
+            else:
+                cp_labels = np.concatenate([
+                    sample_mixup['gt_bbox'],
+                    sample_mixup['gt_class'].astype(mosaic_labels.dtype)
+                ], 1)
+            mosaic_img, mosaic_labels = self.mixup_augment(
+                mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img)
+
+        sample0 = sample[0]  # the first sample dict is reused (and mutated) as the output
+        sample0['image'] = mosaic_img.astype(np.uint8)  # can not be float32
+        sample0['h'] = float(mosaic_img.shape[0])
+        sample0['w'] = float(mosaic_img.shape[1])
+        sample0['im_shape'][0] = sample0['h']
+        sample0['im_shape'][1] = sample0['w']
+        sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32)
+        sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32)
+        if 'is_crowd' in sample[0]:
+            sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32)
+        if 'difficult' in sample[0]:  # reads the same column 5 as is_crowd above
+            sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32)
+        return sample0
+
+    def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels,
+                      img):  # blend a resized/jittered/maybe-flipped copy of img into origin_img and append its boxes
+        jit_factor = random.uniform(*self.mixup_scale)
+        FLIP = random.uniform(0, 1) > 0.5
+        if len(img.shape) == 3:
+            cp_img = np.ones(
+                (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
+        else:  # NOTE(review): the flip and the 3-channel padded_img below index cp_img as 3-D - confirm 2-D input is supported
+            cp_img = np.ones(input_dim, dtype=np.uint8) * 114
+
+        cp_scale_ratio = min(input_dim[0] / img.shape[0],
+                             input_dim[1] / img.shape[1])  # largest scale at which img still fits in input_dim
+        resized_img = cv2.resize(
+            img, (int(img.shape[1] * cp_scale_ratio),
+                  int(img.shape[0] * cp_scale_ratio)),
+            interpolation=cv2.INTER_LINEAR)
+
+        cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[
+            1] * cp_scale_ratio)] = resized_img  # pasted at the top-left of the gray canvas
+
+        cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor),
+                                     int(cp_img.shape[0] * jit_factor)))
+        cp_scale_ratio *= jit_factor  # boxes must follow the jitter resize as well
+
+        if FLIP:
+            cp_img = cp_img[:, ::-1, :]  # horizontal flip
+
+        origin_h, origin_w = cp_img.shape[:2]  # size of the jittered canvas
+        target_h, target_w = origin_img.shape[:2]
+        padded_img = np.zeros(
+            (max(origin_h, target_h), max(origin_w, target_w), 3),
+            dtype=np.uint8)  # black canvas at least as large as both images
+        padded_img[:origin_h, :origin_w] = cp_img
+
+        x_offset, y_offset = 0, 0  # top-left corner of the random target-sized crop
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
+        padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset:
+                                        x_offset + target_w]
+
+        # adjust boxes: scale, clip to the jittered canvas, flip, then shift by the crop offset
+        cp_bboxes_origin_np = cp_labels[:, :4].copy()
+        cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] *
+                                               cp_scale_ratio, 0, origin_w)
+        cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] *
+                                               cp_scale_ratio, 0, origin_h)
+
+        if FLIP:
+            cp_bboxes_origin_np[:, 0::2] = (
+                origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1])  # mirror x and swap x1/x2 so x1 <= x2 still holds
+        cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
+        if self.remove_outside_box:
+            # for MOT dataset: shift only; boxes fully outside are dropped further down
+            cp_bboxes_transformed_np[:, 0::2] -= x_offset
+            cp_bboxes_transformed_np[:, 1::2] -= y_offset
+        else:
+            cp_bboxes_transformed_np[:, 0::2] = np.clip(
+                cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w)
+            cp_bboxes_transformed_np[:, 1::2] = np.clip(
+                cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h)
+
+        cls_labels = cp_labels[:, 4:5].copy()
+        box_labels = cp_bboxes_transformed_np
+        if cp_labels.shape[-1] == 6:  # a sixth column (is_crowd or difficult) is carried over unchanged
+            crd_labels = cp_labels[:, 5:6].copy()
+            labels = np.hstack((box_labels, cls_labels, crd_labels))
+        else:
+            labels = np.hstack((box_labels, cls_labels))
+        if self.remove_outside_box:
+            labels = labels[labels[:, 0] < target_w]
+            labels = labels[labels[:, 2] > 0]
+            labels = labels[labels[:, 1] < target_h]
+            labels = labels[labels[:, 3] > 0]
+
+        origin_labels = np.vstack((origin_labels, labels))
+        origin_img = origin_img.astype(np.float32)
+        origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(
+            np.float32)  # equal-weight blend of the two images
+
+        return origin_img.astype(np.uint8), origin_labels
+
+
+@register_op
+class PadResize(BaseOperator):
+    """ Resize image and gt_bbox to fit target_size (aspect ratio kept), then pad.
+
+    Args:
+        target_size (int|list[int]): output size as [height, width]; an int is used for both
+        fill_value (float): pixel value of padded image
+    """
+
+    def __init__(self, target_size, fill_value=114):
+        super(PadResize, self).__init__()
+        if isinstance(target_size, Integral):
+            target_size = [target_size, target_size]
+        self.target_size = target_size
+        self.fill_value = fill_value
+
+    def _resize(self, img, bboxes, labels):  # returns (resized_img, kept bboxes, kept labels)
+        ratio = min(self.target_size[0] / img.shape[0],
+                    self.target_size[1] / img.shape[1])
+        w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio)
+        resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)
+
+        if len(bboxes) > 0:
+            bboxes *= ratio  # in place: the array passed in is scaled too
+            mask = np.minimum(bboxes[:, 2] - bboxes[:, 0],
+                              bboxes[:, 3] - bboxes[:, 1]) > 1  # keep boxes whose shorter side exceeds 1
+            bboxes = bboxes[mask]
+            labels = labels[mask]
+        return resized_img, bboxes, labels
+
+    def _pad(self, img):  # put img at the top-left of a fill_value canvas of target_size
+        h, w, _ = img.shape
+        if h == self.target_size[0] and w == self.target_size[1]:
+            return img  # already the target size: returned as is
+        padded_img = np.full(
+            (self.target_size[0], self.target_size[1], 3),
+            self.fill_value,
+            dtype=np.uint8)
+        padded_img[:h, :w] = img
+        return padded_img
+
+    def apply(self, sample, context=None):  # only image, gt_bbox and gt_class are rewritten
+        image = sample['image']
+        bboxes = sample['gt_bbox']
+        labels = sample['gt_class']
+        image, bboxes, labels = self._resize(image, bboxes, labels)
+        sample['image'] = self._pad(image).astype(np.float32)
+        sample['gt_bbox'] = bboxes
+        sample['gt_class'] = labels
+        return sample
+
+
+@register_op
+class RandomShift(BaseOperator):
+    """
+    Randomly shift image and gt_bbox; the sample is left unchanged if no box survives
+
+    Args:
+        prob (float): probability to do random shift.
+        max_shift (int): max shift pixels
+        filter_thr (int): drop gt bboxes whose width or height is not larger than this
+    """
+
+    def __init__(self, prob=0.5, max_shift=32, filter_thr=1):
+        super(RandomShift, self).__init__()
+        self.prob = prob
+        self.max_shift = max_shift
+        self.filter_thr = filter_thr
+
+    def calc_shift_coor(self, im_h, im_w, shift_h, shift_w):  # [x1, y1, x2, y2] of the shifted frame, clipped to the image
+        return [
+            max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w),
+            min(im_h, im_h + shift_h)
+        ]
+
+    def apply(self, sample, context=None):
+        if random.random() > self.prob:
+            return sample
+
+        im = sample['image']
+        gt_bbox = sample['gt_bbox'].copy()  # copy: the early return below must not leave shifted boxes in the sample
+        gt_class = sample['gt_class']
+        im_h, im_w = im.shape[:2]
+        shift_h = random.randint(-self.max_shift, self.max_shift)
+        shift_w = random.randint(-self.max_shift, self.max_shift)
+
+        gt_bbox[:, 0::2] += shift_w
+        gt_bbox[:, 1::2] += shift_h
+        gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w)
+        gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h)
+        gt_bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0]
+        gt_bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1]
+        keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr)
+        if not keep.any():
+            return sample  # every box would be filtered out: skip the shift entirely
+
+        gt_bbox = gt_bbox[keep]
+        gt_class = gt_class[keep]
+
+        # shift image: destination window on the new canvas
+        coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w)
+        # shift frame to the opposite direction: source window in the old image
+        coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w)
+        canvas = np.zeros_like(im)
+        canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \
+            = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]]
+
+        sample['image'] = canvas
+        sample['gt_bbox'] = gt_bbox
+        sample['gt_class'] = gt_class
+        return sample
+
+
+@register_op
+class StrongAugImage(BaseOperator):  # Runs a composed transform list on the sample; image is cast to uint8 before and after.
+    def __init__(self, transforms):
+        super(StrongAugImage, self).__init__()
+        self.transforms = Compose(transforms)
+
+    def apply(self, sample, context=None):
+        # Cast first so the composed transforms receive a uint8 image.
+        sample['image'] = sample['image'].astype('uint8')
+        augmented = self.transforms(sample)
+        sample['image'] = augmented['image'].astype('uint8')
+        return sample
+
+
+@register_op
+class RandomColorJitter(BaseOperator):
+    """Apply paddle.vision ColorJitter to the image with probability `prob`;
+    the jittered image is stored back as float32."""
+
+    def __init__(self, prob=0.8, brightness=0.4, contrast=0.4, saturation=0.4,
+                 hue=0.1):
+        super(RandomColorJitter, self).__init__()
+        self.prob = prob
+        self.brightness = brightness
+        self.contrast = contrast
+        self.saturation = saturation
+        self.hue = hue
+
+    def apply(self, sample, context=None):
+        if np.random.uniform(0, 1) >= self.prob:
+            return sample
+        from paddle.vision.transforms import ColorJitter
+        params = (self.brightness, self.contrast, self.saturation, self.hue)
+        jitter = ColorJitter(*params)
+        sample['image'] = jitter(sample['image'].astype(np.uint8))
+        sample['image'] = sample['image'].astype(np.float32)
+        return sample
+
+
+@register_op
+class RandomGrayscale(BaseOperator):  # With probability `prob`, replace the image by its 3-channel grayscale version.
+    def __init__(self, prob=0.2):
+        super(RandomGrayscale, self).__init__()
+        self.prob = prob
+
+    def apply(self, sample, context=None):
+        if np.random.uniform(0, 1) >= self.prob:
+            return sample
+        from paddle.vision.transforms import Grayscale
+        sample['image'] = Grayscale(num_output_channels=3)(sample['image'])
+        return sample
+
+
+@register_op
+class RandomGaussianBlur(BaseOperator):  # With probability `prob`, blur the image with a 23x23 Gaussian of random sigma.
+    def __init__(self, prob=0.5, sigma=[0.1, 2.0]):
+        super(RandomGaussianBlur, self).__init__()
+        self.prob = prob
+        self.sigma = sigma
+
+    def apply(self, sample, context=None):
+        if np.random.uniform(0, 1) >= self.prob:
+            return sample
+        blur_sigma = np.random.uniform(self.sigma[0], self.sigma[1])
+        sample['image'] = cv2.GaussianBlur(sample['image'], (23, 23), blur_sigma)
+        return sample
+
+
+@register_op
+class RandomErasing(BaseOperator):  # With probability `prob`, overwrite one random rectangle with `value` or with noise.
+    def __init__(self,
+                 prob=0.5,
+                 scale=(0.02, 0.33),
+                 ratio=(0.3, 3.3),
+                 value=0,
+                 inplace=False):
+        super(RandomErasing, self).__init__()
+        assert isinstance(scale,
+                          (tuple, list)), "scale should be a tuple or list"
+        assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1]
+                ), "scale should be of kind (min, max) and in range [0, 1]"
+        assert isinstance(ratio,
+                          (tuple, list)), "ratio should be a tuple or list"
+        assert (ratio[0] >= 0 and
+                ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
+        assert isinstance(
+            value, (Number, str, tuple,
+                    list)), "value should be a number, tuple, list or str"
+        if isinstance(value, str) and value != "random":
+            raise ValueError("value must be 'random' when type is str")
+        self.prob = prob
+        self.scale = scale  # (min, max) erased area as a fraction of the image area
+        self.ratio = ratio  # (min, max) aspect ratio of the rectangle, sampled log-uniformly
+        self.value = value  # number / sequence fill value, or "random" for noise
+        self.inplace = inplace
+
+    def _erase(self, img, i, j, h, w, v, inplace=False):  # write v into rows i:i+h, cols j:j+w
+        if not inplace:
+            img = img.copy()
+        img[i:i + h, j:j + w, ...] = v
+        return img
+
+    def _get_param(self, img, scale, ratio, value):  # returns (top, left, erase_h, erase_w, fill)
+        shape = np.asarray(img).shape  # only the shape is needed; no cast or copy of the pixels
+        h, w, c = shape[-3], shape[-2], shape[-1]
+        img_area = h * w
+        log_ratio = np.log(ratio)
+        for _ in range(1):  # single attempt; falls through to the no-op result below if it does not fit
+            erase_area = np.random.uniform(*scale) * img_area
+            aspect_ratio = np.exp(np.random.uniform(*log_ratio))
+            erase_h = int(round(np.sqrt(erase_area * aspect_ratio)))
+            erase_w = int(round(np.sqrt(erase_area / aspect_ratio)))
+            if erase_h >= h or erase_w >= w:
+                continue
+
+            if value is None:  # "random" mode: normally distributed noise scaled by 255
+                v = np.random.normal(size=[erase_h, erase_w, c]) * 255
+            else:
+                v = np.array(value)[None, None, :]
+            top = np.random.randint(0, h - erase_h + 1)
+            left = np.random.randint(0, w - erase_w + 1)
+            return top, left, erase_h, erase_w, v
+        return 0, 0, h, w, img  # "erase" the whole image with itself, i.e. change nothing
+
+    def apply(self, sample, context=None):
+        if random.random() < self.prob:
+            if isinstance(self.value, Number):
+                value = [self.value]
+            elif isinstance(self.value, str):
+                value = None
+            else:
+                value = self.value
+            if value is not None and len(value) not in (1, 3):
+                raise ValueError(
+                    "Value should be a single number or a sequence with length equals to image's channel."
+                )
+            im = sample['image']
+            top, left, erase_h, erase_w, v = self._get_param(im, self.scale,
+                                                             self.ratio, value)
+            im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace)
+            sample['image'] = im
+        return sample
+
+
+@register_op
+class RandomErasingCrop(BaseOperator):  # Chains three RandomErasing passes (prob 0.7, 0.5, 0.3).
+    def __init__(self):
+        super(RandomErasingCrop, self).__init__()
+        self.transform1 = RandomErasing(
+            prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random")
+        self.transform2 = RandomErasing(
+            prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random")
+        self.transform3 = RandomErasing(
+            prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random")
+
+    def apply(self, sample, context=None):
+        # Each pass decides on its own whether to erase a noise-filled patch.
+        for erase in (self.transform1, self.transform2, self.transform3):
+            sample = erase(sample)
+        return sample
diff --git a/rtdetr_paddle/ppdet/data/utils.py b/rtdetr_paddle/ppdet/data/utils.py
new file mode 100644
index 0000000..c01b3d2
--- /dev/null
+++ b/rtdetr_paddle/ppdet/data/utils.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numbers
+import numpy as np
+
+try:
+    from collections.abc import Sequence, Mapping
+except:
+    from collections import Sequence, Mapping
+
+
+def default_collate_fn(batch):
+    """
+    Default batch collating function for :code:`paddle.io.DataLoader`.
+
+    The input is a list of samples. Samples may be (nested) lists or
+    dictionaries whose leaves are numbers, numpy arrays, or strings; the
+    structure is walked recursively and each field is batched according
+    to the type of the first sample's value. e.g. for following input data:
+    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
+     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
+
+    number and numpy array fields are stacked per key as follows:
+    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
+
+    Args:
+        batch(list of sample data): a non-empty list of sample data.
+
+    Returns:
+        Batched data: numpy arrays are stacked on a new first axis, numbers
+                      become one array, str/bytes lists are returned as is.
+
+    Raises:
+        RuntimeError: list samples in the batch differ in length.
+        TypeError: the first sample is of an unsupported type.
+    """
+    sample = batch[0]
+    if isinstance(sample, np.ndarray):
+        return np.stack(batch, axis=0)
+    if isinstance(sample, numbers.Number):
+        return np.array(batch)
+    if isinstance(sample, (str, bytes)):  # must precede the Sequence check: str is a Sequence too
+        return batch
+    if isinstance(sample, Mapping):
+        return {
+            key: default_collate_fn([d[key] for d in batch])
+            for key in sample
+        }
+    if isinstance(sample, Sequence):
+        sample_fields_num = len(sample)
+        if not all(len(item) == sample_fields_num for item in batch):
+            raise RuntimeError(
+                "fields number not same among samples in a batch")
+        return [default_collate_fn(fields) for fields in zip(*batch)]
+
+    raise TypeError("batch data can only contain: numpy.ndarray, number, str, "
+                    "bytes, dict, list, but got {}".format(type(sample)))
diff --git a/rtdetr_paddle/ppdet/engine/__init__.py b/rtdetr_paddle/ppdet/engine/__init__.py
new file mode 100644
index 0000000..dfded9e
--- /dev/null
+++ b/rtdetr_paddle/ppdet/engine/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from . import trainer
+from .trainer import *
+
+from . import callbacks
+from .callbacks import *
+
+from . import env
+from .env import *
+
+__all__ = trainer.__all__ \
+        + callbacks.__all__ \
+        + env.__all__
diff --git a/rtdetr_paddle/ppdet/engine/callbacks.py b/rtdetr_paddle/ppdet/engine/callbacks.py
new file mode 100644
index 0000000..35ebb3e
--- /dev/null
+++ b/rtdetr_paddle/ppdet/engine/callbacks.py
@@ -0,0 +1,557 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import datetime
+import six
+import copy
+import json
+
+import paddle
+import paddle.distributed as dist
+
+from ppdet.utils.checkpoint import save_model
+from ppdet.metrics import get_infer_results
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = [  # names exported by `from .callbacks import *`
+    'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer',
+    'VisualDLWriter', 'SniperProposalsGenerator'
+]  # NOTE(review): WiferFaceEval and WandbCallback are defined but not listed
+
+
+class Callback(object):  # base hook interface; every hook defaults to a no-op
+    def __init__(self, model):
+        self.model = model
+
+    def on_step_begin(self, status):
+        """Step-start hook; does nothing unless a subclass overrides it."""
+
+    def on_step_end(self, status):
+        """Step-end hook; does nothing unless a subclass overrides it."""
+
+    def on_epoch_begin(self, status):
+        """Epoch-start hook; does nothing unless a subclass overrides it."""
+
+    def on_epoch_end(self, status):
+        """Epoch-end hook; does nothing unless a subclass overrides it."""
+
+    def on_train_begin(self, status):
+        """Train-start hook; does nothing unless a subclass overrides it."""
+
+    def on_train_end(self, status):
+        """Train-end hook; does nothing unless a subclass overrides it."""
+
+
+class ComposeCallback(object):
+    """Fan each hook call out to an ordered list of callbacks."""
+    def __init__(self, callbacks):
+        # None entries are dropped; anything else must be a Callback.
+        kept = list(filter(lambda cb: cb is not None, callbacks))
+        assert all(isinstance(cb, Callback) for cb in kept), \
+            "callback should be subclass of Callback"
+        self._callbacks = kept
+
+    def _dispatch(self, hook_name, status):
+        """Invoke the method named `hook_name` on every stored callback."""
+        for cb in self._callbacks:
+            getattr(cb, hook_name)(status)
+
+    def on_step_begin(self, status):
+        self._dispatch('on_step_begin', status)
+
+    def on_step_end(self, status):
+        self._dispatch('on_step_end', status)
+
+    def on_epoch_begin(self, status):
+        self._dispatch('on_epoch_begin', status)
+
+    def on_epoch_end(self, status):
+        self._dispatch('on_epoch_end', status)
+
+    def on_train_begin(self, status):
+        self._dispatch('on_train_begin', status)
+
+    def on_train_end(self, status):
+        self._dispatch('on_train_end', status)
+
+
+class LogPrinter(Callback):
+    """Logs train progress and eval throughput (single process or rank 0)."""
+
+    def __init__(self, model):
+        super(LogPrinter, self).__init__(model)
+
+    def on_step_end(self, status):
+        """Log a train summary every `log_iter` steps, an eval tick every 100."""
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return  # only the main process logs
+        mode = status['mode']
+        if mode == 'train':
+            self._log_train_step(status, mode)
+        if mode == 'eval':
+            eval_step = status['step_id']
+            if eval_step % 100 == 0:
+                logger.info("Eval iter: {}".format(eval_step))
+
+    def _log_train_step(self, status, mode):
+        """Read the step's meters and timers; log them when the step is due."""
+        cur_epoch = status['epoch_id']
+        cur_step = status['step_id']
+        per_epoch = status['steps_per_epoch']
+        meters = status['training_status']
+        batch_timer = status['batch_time']
+        data_timer = status['data_time']
+        cfg = self.model.cfg
+        total_epochs = cfg.epoch
+        reader_key = '{}Reader'.format(mode.capitalize())
+        batch_size = cfg[reader_key]['batch_size']
+        meter_text = meters.log()  # evaluated on every step, due or not
+        if cur_step % cfg.log_iter != 0:
+            return
+        steps_left = (total_epochs - cur_epoch) * per_epoch - cur_step
+        eta_seconds = int(steps_left * batch_timer.global_avg)
+        eta_text = str(datetime.timedelta(seconds=eta_seconds))
+        throughput = float(batch_size) / batch_timer.avg
+        width_spec = ':' + str(len(str(per_epoch))) + 'd'  # pad step index
+        pieces = (
+            'Epoch: [{}]',
+            '[{' + width_spec + '}/{}]',
+            'learning_rate: {lr:.6f}',
+            '{meters}',
+            'eta: {eta}',
+            'batch_cost: {btime}',
+            'data_cost: {dtime}',
+            'ips: {ips:.4f} images/s', )
+        logger.info(' '.join(pieces).format(
+            cur_epoch, cur_step, per_epoch,
+            lr=status['learning_rate'], meters=meter_text, eta=eta_text,
+            btime=str(batch_timer), dtime=str(data_timer), ips=throughput))
+
+    def on_epoch_end(self, status):
+        """In eval mode, log the sample count and sample_num / cost_time."""
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if status['mode'] == 'eval':
+            n_samples, elapsed = status['sample_num'], status['cost_time']
+            logger.info('Total sample number: {}, average FPS: {}'.format(
+                n_samples, n_samples / elapsed))
+
+
+class Checkpointer(Callback):  # writes snapshot/final weights (train) and best weights (eval)
+    def __init__(self, model):
+        super(Checkpointer, self).__init__(model)
+        self.best_ap = -1000.  # best score so far; any result >= this replaces it
+        self.save_dir = os.path.join(self.model.cfg.save_dir,
+                                     self.model.cfg.filename)
+        if hasattr(self.model.model, 'student_model'):  # prefer the wrapped student network
+            self.weight = self.model.model.student_model
+        else:
+            self.weight = self.model.model
+
+    def on_epoch_end(self, status):
+        # Choose what to save (train: snapshot/final, eval: best), then write it.
+        mode = status['mode']
+        epoch_id = status['epoch_id']
+        weight = None  # stays None when nothing has to be saved this epoch
+        save_name = None
+        if dist.get_world_size() < 2 or dist.get_rank() == 0:  # main process only
+            if mode == 'train':
+                end_epoch = self.model.cfg.epoch
+                if (
+                        epoch_id + 1
+                ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:  # periodic or last epoch
+                    save_name = str(
+                        epoch_id) if epoch_id != end_epoch - 1 else "model_final"
+                    weight = self.weight.state_dict()
+            elif mode == 'eval':
+                if 'save_best_model' in status and status['save_best_model']:
+                    for metric in self.model._metrics:
+                        map_res = metric.get_results()
+                        eval_func = "ap"  # label used in the log line below
+                        if 'pose3d' in map_res:
+                            key = 'pose3d'
+                            eval_func = "mpjpe"
+                        elif 'bbox' in map_res:
+                            key = 'bbox'
+                        elif 'keypoint' in map_res:
+                            key = 'keypoint'
+                        else:
+                            key = 'mask'  # fallback; may be absent, checked next
+                        if key not in map_res:
+                            logger.warning("Evaluation results empty, this may be due to " \
+                                        "training iterations being too few or not " \
+                                        "loading the correct weights.")
+                            return  # aborts the hook: nothing is written
+                        if map_res[key][0] >= self.best_ap:  # a tie also overwrites the best
+                            self.best_ap = map_res[key][0]
+                            save_name = 'best_model'
+                            weight = self.weight.state_dict()
+                        logger.info("Best test {} {} is {:0.3f}.".format(
+                            key, eval_func, abs(self.best_ap)))  # logged as an absolute value
+            if weight:
+                if self.model.use_ema:
+                    exchange_save_model = status.get('exchange_save_model',
+                                                     False)
+                    if not exchange_save_model:
+                        # status['weight'] is passed as the model and the
+                        # state dict gathered above as ema_model
+                        save_model(
+                            status['weight'],
+                            self.model.optimizer,
+                            self.save_dir,
+                            save_name,
+                            epoch_id + 1,
+                            ema_model=weight)
+                    else:
+                        # save model(student model) and ema_model(teacher model)
+                        # in DenseTeacher SSOD, the teacher model will be higher,
+                        # so exchange when saving pdparams
+                        student_model = status['weight']  # model
+                        teacher_model = weight  # ema_model
+                        save_model(
+                            teacher_model,
+                            self.model.optimizer,
+                            self.save_dir,
+                            save_name,
+                            epoch_id + 1,
+                            ema_model=student_model)
+                        del teacher_model
+                        del student_model
+                else:
+                    save_model(weight, self.model.optimizer, self.save_dir,
+                               save_name, epoch_id + 1)
+
+
+class WiferFaceEval(Callback):
+    """Eval-only callback: update every metric with the model, then exit."""
+    # No __init__ override: Callback.__init__ already stores `model`.
+
+    def on_epoch_begin(self, status):
+        assert self.model.mode == 'eval', \
+            "WiferFaceEval can only be set during evaluation"
+        for evaluator in self.model._metrics:
+            evaluator.update(self.model.model)
+        sys.exit()
+
+
+class VisualDLWriter(Callback):
+    """
+    Record train losses, eval metrics and test images through VisualDL.
+    """
+
+    def __init__(self, model):
+        super(VisualDLWriter, self).__init__(model)
+
+        assert six.PY3, "VisualDL requires Python >= 3.5"
+        try:
+            from visualdl import LogWriter
+        except Exception as e:
+            logger.error('visualdl not found, plaese install visualdl. '
+                         'for example: `pip install visualdl`.')
+            raise e
+        log_dir = model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')
+        self.vdl_writer = LogWriter(log_dir)
+        # One independent counter per kind of record.
+        self.vdl_loss_step = self.vdl_mAP_step = 0
+        self.vdl_image_step = self.vdl_image_frame = 0
+
+    def on_step_end(self, status):
+        """Log every training loss, or the test-mode image pair, for a step."""
+        mode = status['mode']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if mode == 'train':
+            losses = status['training_status'].get()
+            for tag, value in losses.items():
+                self.vdl_writer.add_scalar(tag, value, self.vdl_loss_step)
+            self.vdl_loss_step += 1
+        elif mode == 'test':
+            source_img = status['original_image']
+            output_img = status['result_image']
+            frame = self.vdl_image_frame
+            self.vdl_writer.add_image("original/frame_{}".format(frame),
+                                      source_img, self.vdl_image_step)
+            self.vdl_writer.add_image("result/frame_{}".format(frame),
+                                      output_img, self.vdl_image_step)
+            self.vdl_image_step += 1
+            # each frame can display ten pictures at most.
+            if self.vdl_image_step % 10 == 0:
+                self.vdl_image_step = 0
+                self.vdl_image_frame += 1
+
+    def on_epoch_end(self, status):
+        """In eval mode, log the first value of each metric result as mAP."""
+        mode = status['mode']
+        if dist.get_world_size() >= 2 and dist.get_rank() != 0:
+            return
+        if mode == 'eval':
+            for metric in self.model._metrics:
+                for name, values in metric.get_results().items():
+                    self.vdl_writer.add_scalar(
+                        "{}-mAP".format(name), values[0], self.vdl_mAP_step)
+            self.vdl_mAP_step += 1
+
+
+class WandbCallback(Callback):
+    """Report training/eval metrics and checkpoint files to Weights & Biases.
+
+    Run settings come from the `wandb` entry of the config plus any top-level
+    `wandb_*` keys (prefix removed). Logging and uploads happen only when
+    running single-process or on distributed rank 0.
+    """
+
+    def __init__(self, model):
+        super(WandbCallback, self).__init__(model)
+
+        try:
+            import wandb
+            self.wandb = wandb
+        except Exception as e:
+            logger.error('wandb not found, please install wandb. '
+                         'Use: `pip install wandb`.')
+            raise e
+
+        # Copy so the config's own `wandb` mapping is not modified below.
+        self.wandb_params = dict(model.cfg.get('wandb', None) or {})
+        self.save_dir = os.path.join(self.model.cfg.save_dir,
+                                     self.model.cfg.filename)
+        prefix = "wandb_"
+        for k, v in model.cfg.items():
+            if k.startswith(prefix):
+                # Slice the prefix off: str.lstrip() strips a character set
+                # and would turn e.g. "wandb_name" into "me".
+                self.wandb_params[k[len(prefix):]] = v
+
+        self._run = None
+        if self._is_main_process():
+            run = self.run
+            run.config.update(self.model.cfg)
+            run.define_metric("epoch")
+            run.define_metric("eval/*", step_metric="epoch")
+
+        self.best_ap = -1000.
+        self.fps = []
+
+    @staticmethod
+    def _is_main_process():
+        """Return True when single-process or on distributed rank 0."""
+        return dist.get_world_size() < 2 or dist.get_rank() == 0
+
+    @property
+    def run(self):
+        """The wandb run: the already active one if any, else a new one."""
+        if self._run is None:
+            if self.wandb.run is not None:
+                logger.info(
+                    "There is an ongoing wandb run which will be used "
+                    "for logging. Please use `wandb.finish()` to end that "
+                    "if the behaviour is not intended")
+                self._run = self.wandb.run
+            else:
+                self._run = self.wandb.init(**self.wandb_params)
+        return self._run
+
+    def save_model(self,
+                   optimizer,
+                   save_dir,
+                   save_name,
+                   last_epoch,
+                   ema_model=None,
+                   ap=None,
+                   fps=None,
+                   tags=None):
+        """Upload the checkpoint at `save_dir/save_name` as wandb artifacts.
+
+        Args:
+            optimizer: unused; kept for signature compatibility.
+            save_dir, save_name: directory and file stem of the checkpoint.
+            last_epoch: stored in the artifact metadata.
+            ema_model: truthy when EMA is in use; the `.pdema` file is then
+                uploaded as a second artifact if it exists on disk.
+            ap, fps: optional metadata values (recorded unless None).
+            tags: aliases attached to every logged artifact.
+        """
+        if not self._is_main_process():
+            return
+        model_path = os.path.join(save_dir, save_name)
+        metadata = {"last_epoch": last_epoch}
+        if ap is not None:
+            metadata["ap"] = ap
+        if fps is not None:
+            metadata["fps"] = fps
+
+        model_artifact = self.wandb.Artifact(
+            name="model-{}".format(self.run.id), type="model",
+            metadata=metadata)
+        model_artifact.add_file(model_path + ".pdparams", name="model")
+        self.run.log_artifact(model_artifact, aliases=tags)
+
+        ema_path = model_path + ".pdema"
+        if ema_model and os.path.isfile(ema_path):
+            ema_artifact = self.wandb.Artifact(
+                name="ema_model-{}".format(self.run.id), type="model",
+                metadata=metadata)
+            ema_artifact.add_file(ema_path, name="model_ema")
+            self.run.log_artifact(ema_artifact, aliases=tags)
+        elif ema_model:
+            logger.warning("EMA file {} not found; skipped".format(ema_path))
+
+    def on_step_end(self, status):
+        """Log the current training losses and timing figures for one step."""
+        if status['mode'] != 'train' or not self._is_main_process():
+            return
+        losses = status['training_status'].get()
+        metrics = {"train/" + k: float(v) for k, v in losses.items()}
+
+        # Throughput and costs are derived from the timers' `avg` values.
+        batch_time = status['batch_time']
+        data_time = status['data_time']
+        batch_size = self.model.cfg['TrainReader']['batch_size']
+        batch_cost = float(batch_time.avg)
+        ips = float(batch_size) / batch_cost
+        metrics["train/ips"] = ips
+        metrics["train/data_cost"] = float(data_time.avg)
+        metrics["train/batch_cost"] = batch_cost
+
+        self.fps.append(ips)
+        self.run.log(metrics)
+
+    def on_epoch_end(self, status):
+        """Upload snapshot artifacts (train) or log metrics/best model (eval)."""
+        mode = status['mode']
+        epoch_id = status['epoch_id']
+        if not self._is_main_process():
+            return
+        if mode == 'train':
+            self._upload_snapshot(epoch_id)
+        if mode == 'eval':
+            self._log_eval(status, epoch_id)
+
+    def _upload_snapshot(self, epoch_id):
+        """Upload the periodic or final checkpoint for `epoch_id` when due."""
+        # Mean of the per-step ips since the last epoch end; None when no
+        # step was recorded (avoids dividing by zero).
+        fps = sum(self.fps) / len(self.fps) if self.fps else None
+        self.fps = []
+
+        end_epoch = self.model.cfg.epoch
+        is_last = epoch_id == end_epoch - 1
+        if (epoch_id + 1) % self.model.cfg.snapshot_epoch == 0 or is_last:
+            save_name = "model_final" if is_last else str(epoch_id)
+            self.save_model(
+                self.model.optimizer, self.save_dir, save_name, epoch_id + 1,
+                self.model.use_ema, fps=fps,
+                tags=["latest", "epoch_{}".format(epoch_id)])
+
+    def _log_eval(self, status, epoch_id):
+        """Log eval metrics and, if requested, upload a new best model."""
+        fps = status['sample_num'] / status['cost_time']
+
+        merged_dict = {"epoch": epoch_id, "eval/fps": fps}
+        for metric in self.model._metrics:
+            for key, map_value in metric.get_results().items():
+                merged_dict["eval/{}-mAP".format(key)] = map_value[0]
+        self.run.log(merged_dict)
+
+        if not status.get('save_best_model'):
+            return
+        for metric in self.model._metrics:
+            map_res = metric.get_results()
+            # First result family present wins; 'mask' is the fallback.
+            key = next(
+                (k for k in ('pose3d', 'bbox', 'keypoint') if k in map_res),
+                'mask')
+            if key not in map_res:
+                logger.warning("Evaluation results empty, this may be due to "
+                               "training iterations being too few or not "
+                               "loading the correct weights.")
+                return
+            if map_res[key][0] >= self.best_ap:
+                self.best_ap = map_res[key][0]
+                self.save_model(
+                    self.model.optimizer, self.save_dir, 'best_model',
+                    last_epoch=epoch_id + 1, ema_model=self.model.use_ema,
+                    ap=abs(self.best_ap), fps=fps,
+                    tags=["best", "epoch_{}".format(epoch_id)])
+
+    def on_train_end(self, status):
+        """Finish the wandb run on the main process (other ranks have none)."""
+        if self._is_main_process():
+            self.run.finish()
+
+
+class SniperProposalsGenerator(Callback):
+    """At train end, run inference on the cropped dataset and dump proposals."""
+
+    def __init__(self, model):
+        super(SniperProposalsGenerator, self).__init__(model)
+        self.dataset = self._create_new_dataset(self.model.dataset)
+        self.loader = self.model.loader
+        self.cfg = self.model.cfg
+        self.infer_model = self.model.model
+
+    def _create_new_dataset(self, ori_dataset):
+        """Deep-copy the dataset and swap in its cropped inference roidbs."""
+        dataset = copy.deepcopy(ori_dataset)
+        dataset.init_anno_cropper()
+        ori_roidbs = dataset.get_ori_roidbs()
+        roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)
+        dataset.set_roidbs(roidbs)
+        return dataset
+
+    def _eval_with_loader(self, loader):
+        """Run the model over `loader` without gradients; return one output
+        dict per batch, with values exposing `.numpy()` converted by it."""
+        results = []
+        with paddle.no_grad():
+            self.infer_model.eval()
+            for data in loader:
+                outs = self.infer_model(data)
+                for key in ('im_shape', 'scale_factor', 'im_id'):
+                    outs[key] = data[key]
+                for key, value in outs.items():
+                    if hasattr(value, 'numpy'):
+                        outs[key] = value.numpy()
+                results.append(outs)
+        return results
+
+    def on_train_end(self, status):
+        """Gather per-image bbox results and dump them to cfg.proposals_path."""
+        self.loader.dataset = self.dataset
+        results = self._eval_with_loader(self.loader)
+        results = self.dataset.anno_cropper.aggregate_chips_detections(results)
+        proposals = []
+        clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}
+        for outs in results:
+            batch_res = get_infer_results(outs, clsid2catid)
+            if 'bbox' not in batch_res:
+                continue
+            bbox_num = outs['bbox_num']
+            start = 0
+            for i, _ in enumerate(outs['im_id']):
+                # Advance the window so each image contributes its own boxes
+                # rather than the first `bbox_num[i]` boxes of the batch.
+                end = start + bbox_num[i]
+                proposals += batch_res['bbox'][start:end]
+                start = end
+        logger.info("save proposals in {}".format(self.cfg.proposals_path))
+        with open(self.cfg.proposals_path, 'w') as f:
+            json.dump(proposals, f)
diff --git a/rtdetr_paddle/ppdet/engine/env.py b/rtdetr_paddle/ppdet/engine/env.py
new file mode 100644
index 0000000..0a89657
--- /dev/null
+++ b/rtdetr_paddle/ppdet/engine/env.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import random
+import numpy as np
+
+import paddle
+from paddle.distributed import fleet
+
+__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env']
+
+
+def init_fleet_env(find_unused_parameters=False):  # collective-mode fleet init
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.find_unused_parameters = find_unused_parameters
+    fleet.init(strategy=dist_strategy, is_collective=True)
+
+
+def init_parallel_env():
+    """Seed `random` and NumPy with 99 + PADDLE_TRAINER_ID when both trainer
+    env vars are present, then initialise Paddle's parallel environment."""
+    required = ('PADDLE_TRAINER_ID', 'PADDLE_TRAINERS_NUM')
+    if all(name in os.environ for name in required):
+        per_rank_seed = 99 + int(os.environ['PADDLE_TRAINER_ID'])
+        random.seed(per_rank_seed)
+        np.random.seed(per_rank_seed)
+
+    paddle.distributed.init_parallel_env()
+
+
+def set_random_seed(seed):  # one seed for Paddle, `random` and NumPy
+    seeders = (paddle.seed, random.seed, np.random.seed)
+    for apply_seed in seeders:
+        apply_seed(seed)
diff --git a/rtdetr_paddle/ppdet/engine/export_utils.py b/rtdetr_paddle/ppdet/engine/export_utils.py
new file mode 100644
index 0000000..882dd5a
--- /dev/null
+++ b/rtdetr_paddle/ppdet/engine/export_utils.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import yaml
+from collections import OrderedDict
+
+import paddle
+from ppdet.data.source.category import get_categories
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+# Architecture-name fragment -> min_subgraph_size written by _dump_infer_config
+TRT_MIN_SUBGRAPH = {
+    'YOLO': 3,
+    'PPYOLOE': 3,
+    'SSD': 60,
+    'RCNN': 40,
+    'RetinaNet': 40,
+    'S2ANet': 80,
+    'EfficientDet': 40,
+    'Face': 3,
+    'TTFNet': 60,
+    'FCOS': 16,
+    'SOLOv2': 60,
+    'HigherHRNet': 3,
+    'HRNet': 3,
+    'DeepSORT': 3,
+    'ByteTrack': 10,
+    'CenterTrack': 5,
+    'JDE': 10,
+    'FairMOT': 5,
+    'GFL': 16,
+    'PicoDet': 3,
+    'CenterNet': 5,
+    'TOOD': 5,
+    'YOLOX': 8,
+    'YOLOF': 40,
+    'METRO_Body': 3,
+    'DETR': 3,
+}
+
+KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet']  # exported with label_arch 'keypoint_arch'
+MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']  # get a 'tracker' section
+
+TO_STATIC_SPEC = {  # config 'filename' -> [{input name: InputSpec}]; read by apply_to_static
+    'yolov3_darknet53_270e_coco': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, 50], dtype='float32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, 50, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'target0': paddle.static.InputSpec(
+            name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target1': paddle.static.InputSpec(
+            name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+        'target2': paddle.static.InputSpec(
+            name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'),
+    }],
+    'tinypose_128x96': [{
+        'center': paddle.static.InputSpec(
+            name='center', shape=[-1, 2], dtype='float32'),
+        'scale': paddle.static.InputSpec(
+            name='scale', shape=[-1, 2], dtype='float32'),
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, 128, 96], dtype='float32'),
+        'score': paddle.static.InputSpec(
+            name='score', shape=[-1], dtype='float32'),
+        'rotate': paddle.static.InputSpec(
+            name='rotate', shape=[-1], dtype='float32'),
+        'target': paddle.static.InputSpec(
+            name='target', shape=[-1, 17, 32, 24], dtype='float32'),
+        'target_weight': paddle.static.InputSpec(
+            name='target_weight', shape=[-1, 17, 1], dtype='float32'),
+    }],
+    'fcos_r50_fpn_1x_coco': [{  # reg_target/labels/centerness repeat for levels 0-4
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'reg_target0': paddle.static.InputSpec(
+            name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'),
+        'labels0': paddle.static.InputSpec(
+            name='labels0', shape=[-1, 160, 160, 1], dtype='int32'),
+        'centerness0': paddle.static.InputSpec(
+            name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'),
+        'reg_target1': paddle.static.InputSpec(
+            name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'),
+        'labels1': paddle.static.InputSpec(
+            name='labels1', shape=[-1, 80, 80, 1], dtype='int32'),
+        'centerness1': paddle.static.InputSpec(
+            name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'),
+        'reg_target2': paddle.static.InputSpec(
+            name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'),
+        'labels2': paddle.static.InputSpec(
+            name='labels2', shape=[-1, 40, 40, 1], dtype='int32'),
+        'centerness2': paddle.static.InputSpec(
+            name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'),
+        'reg_target3': paddle.static.InputSpec(
+            name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'),
+        'labels3': paddle.static.InputSpec(
+            name='labels3', shape=[-1, 20, 20, 1], dtype='int32'),
+        'centerness3': paddle.static.InputSpec(
+            name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'),
+        'reg_target4': paddle.static.InputSpec(
+            name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'),
+        'labels4': paddle.static.InputSpec(
+            name='labels4', shape=[-1, 10, 10, 1], dtype='int32'),
+        'centerness4': paddle.static.InputSpec(
+            name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'),
+    }],
+    'picodet_s_320_coco_lcnet': [{
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, -1, 1], dtype='float32'),
+        'gt_class': paddle.static.InputSpec(
+            name='gt_class', shape=[-1, -1, 1], dtype='int32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'pad_gt_mask': paddle.static.InputSpec(
+            name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),
+    }],
+    'ppyoloe_crn_s_300e_coco': [{  # same entries as 'picodet_s_320_coco_lcnet'
+        'im_id': paddle.static.InputSpec(
+            name='im_id', shape=[-1, 1], dtype='float32'),
+        'is_crowd': paddle.static.InputSpec(
+            name='is_crowd', shape=[-1, -1, 1], dtype='float32'),
+        'gt_class': paddle.static.InputSpec(
+            name='gt_class', shape=[-1, -1, 1], dtype='int32'),
+        'gt_bbox': paddle.static.InputSpec(
+            name='gt_bbox', shape=[-1, -1, 4], dtype='float32'),
+        'curr_iter': paddle.static.InputSpec(
+            name='curr_iter', shape=[-1], dtype='float32'),
+        'image': paddle.static.InputSpec(
+            name='image', shape=[-1, 3, -1, -1], dtype='float32'),
+        'im_shape': paddle.static.InputSpec(
+            name='im_shape', shape=[-1, 2], dtype='float32'),
+        'scale_factor': paddle.static.InputSpec(
+            name='scale_factor', shape=[-1, 2], dtype='float32'),
+        'pad_gt_mask': paddle.static.InputSpec(
+            name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'),
+    }],
+}
+
+
+def apply_to_static(config, model):
+    """Convert `model` via paddle.jit.to_static; the spec is keyed by config['filename']."""
+    input_spec = TO_STATIC_SPEC.get(config.get('filename', None), None)
+    static_model = paddle.jit.to_static(model, input_spec=input_spec)
+    logger.info("Successfully to apply @to_static with specs: {}".format(input_spec))
+    return static_model
+
+
+def _prune_input_spec(input_spec, program, targets):
+    """Return [{name: spec}] keeping the input_spec[0] entries found after pruning."""
+    # Pruning needs a static program, so run in static mode on the same device.
+    device = paddle.get_device()
+    paddle.enable_static()
+    paddle.set_device(device)
+    pruned_input_spec = [{}]
+    try:
+        global_block = program.clone()._prune(targets=targets).global_block()
+        for name, spec in input_spec[0].items():
+            try:
+                global_block.var(name)
+            except Exception:
+                continue  # best effort: skip names the pruned block cannot resolve
+            pruned_input_spec[0][name] = spec
+    finally:  # restore dynamic mode even if cloning/pruning raised
+        paddle.disable_static(place=device)
+    return pruned_input_spec
+
+
+def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
+    """Build the deploy-side preprocess op list and the label list.
+
+    Sample transforms after the first become {'type': name, **params} dicts
+    (NormalizeImage is dropped when 'fuse_normalize' is set); a PadBatch batch
+    transform adds a PadStride op. Labels are str() of catid2name's values.
+    """
+    _, catid2name = get_categories(metric, dataset_cfg.get_anno(), arch)
+    label_list = [str(cat_name) for cat_name in catid2name.values()]
+
+    skip_normalize = reader_cfg.get('fuse_normalize', False)
+    preprocess_list = []
+    # The first sample transform is skipped; each later entry becomes an op dict.
+    for transform in reader_cfg['sample_transforms'][1:]:
+        for op_name, op_args in transform.items():
+            op = {'type': op_name}
+            if op_name == 'Resize':
+                # NOTE: these writes land in the reader config entry itself.
+                if int(image_shape[1]) != -1:
+                    op_args['target_size'] = image_shape[1:]
+                op_args['interp'] = op_args.get('interp', 1)  # cv2.INTER_LINEAR
+            if skip_normalize and op_name == 'NormalizeImage':
+                continue
+            op.update(op_args)
+            preprocess_list.append(op)
+
+    # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride)
+    for batch_transform in reader_cfg.get('batch_transforms', None) or []:
+        if 'PadBatch' in batch_transform:
+            preprocess_list.append({
+                'type': 'PadStride',
+                'stride': batch_transform['PadBatch']['pad_to_stride']
+            })
+
+    return preprocess_list, label_list
+
+
+def _parse_tracker(tracker_cfg):
+    """Return the tracker settings as a new plain dict (shallow copy)."""
+    # Built from items() so the result is a plain dict whatever mapping type
+    # tracker_cfg is; the values themselves are shared, not copied.
+    return {option: setting for option, setting in tracker_cfg.items()}
+
+
+def _dump_infer_config(config, path, image_shape, model):
+    """Write the inference (deploy) config derived from `config` to YAML at `path`.
+
+    `image_shape[2] == -1` enables `use_dynamic_shape` and `image_shape[1:]` is
+    forwarded to `_parse_reader`; `model` is accepted but not used here. Ends
+    the process via os._exit(0) when no TRT_MIN_SUBGRAPH key matches the arch.
+    """
+    from ppdet.core.config.yaml_helpers import setup_orderdict
+    setup_orderdict()
+    infer_cfg = OrderedDict({
+        'mode': 'paddle',
+        'draw_threshold': 0.5,
+        'metric': config['metric'],
+        'use_dynamic_shape': bool(image_shape[2] == -1)
+    })
+    export_onnx = config.get('export_onnx', False)
+    export_eb = config.get('export_eb', False)
+    infer_arch = config['architecture']
+    if 'RCNN' in infer_arch and export_onnx:
+        logger.warning(
+            "Exporting RCNN model to ONNX only support batch_size = 1")
+        infer_cfg['export_onnx'] = True
+        infer_cfg['export_eb'] = export_eb
+
+    if infer_arch in MOT_ARCH:
+        if infer_arch == 'DeepSORT':
+            tracker_cfg = config['DeepSORTTracker']
+        elif infer_arch == 'CenterTrack':
+            tracker_cfg = config['CenterTracker']
+        else:
+            tracker_cfg = config['JDETracker']
+        infer_cfg['tracker'] = _parse_tracker(tracker_cfg)
+
+    # The first TRT_MIN_SUBGRAPH key contained in the architecture name wins.
+    arch_state = False
+    for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items():
+        if arch in infer_arch:
+            infer_cfg['arch'] = arch
+            infer_cfg['min_subgraph_size'] = min_subgraph_size
+            arch_state = True
+            break
+
+    if infer_arch == 'PPYOLOEWithAuxHead':
+        infer_arch = 'PPYOLOE'
+    # These names contain 'YOLO', which the scan matches first; use their own entry.
+    if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']:
+        infer_cfg['arch'] = infer_arch
+        infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch]
+        arch_state = True
+
+    if not arch_state:
+        logger.error(
+            'Architecture: {} is not supported for exporting model now.\n'.
+            format(infer_arch) +
+            'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py')
+        os._exit(0)  # NOTE(review): exit status 0 on an error path -- confirm
+    arch_cfg = config[config['architecture']]
+    if 'mask_head' in arch_cfg and arch_cfg['mask_head']:
+        infer_cfg['mask'] = True
+    label_arch = 'detection_arch'
+    if infer_arch in KEYPOINT_ARCH:
+        label_arch = 'keypoint_arch'
+
+    # MOT archs use the MOT reader unless the metric is COCO/VOC (run as Detector).
+    if infer_arch in MOT_ARCH and config['metric'] not in ['COCO', 'VOC']:
+        label_arch = 'mot_arch'
+        reader_cfg = config['TestMOTReader']
+        dataset_cfg = config['TestMOTDataset']
+    else:
+        reader_cfg = config['TestReader']
+        dataset_cfg = config['TestDataset']
+    infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader(
+        reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:])
+
+    if infer_arch == 'PicoDet':
+        if hasattr(config, 'export') and config['export'].get(
+                'post_process',
+                False) and not config['export'].get('benchmark', False):
+            infer_cfg['arch'] = 'GFL'
+        head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead'
+        infer_cfg['NMS'] = config[head_name]['nms']
+        # In order to speed up the prediction, the threshold of nms
+        # is adjusted here, which can be changed in infer_cfg.yml
+        config[head_name]['nms']["score_threshold"] = 0.3
+        config[head_name]['nms']["nms_threshold"] = 0.5
+        infer_cfg['fpn_stride'] = config[head_name]['fpn_stride']
+
+    with open(path, 'w') as f:  # close the handle once the dump is written
+        yaml.dump(infer_cfg, f)
+    logger.info("Export inference config file to {}".format(path))
diff --git a/rtdetr_paddle/ppdet/engine/trainer.py b/rtdetr_paddle/ppdet/engine/trainer.py
new file mode 100644
index 0000000..6c3f229
--- /dev/null
+++ b/rtdetr_paddle/ppdet/engine/trainer.py
@@ -0,0 +1,966 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import copy
+import time
+from tqdm import tqdm
+
+import numpy as np
+import typing
+from PIL import Image, ImageOps, ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+import paddle
+import paddle.nn as nn
+import paddle.distributed as dist
+from paddle.distributed import fleet
+from paddle.static import InputSpec
+from ppdet.optimizer import ModelEMA
+
+from ppdet.core.workspace import create
+from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
+from ppdet.utils.visualizer import visualize_results, save_result
+from ppdet.metrics import Metric, COCOMetric, VOCMetric, get_infer_results
+from ppdet.data.source.category import get_categories
+import ppdet.utils.stats as stats
+from ppdet.utils.fuse_utils import fuse_conv_bn
+from ppdet.utils import profiler
+from ppdet.modeling.post_process import multiclass_nms
+
+from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, VisualDLWriter, WandbCallback
+from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static
+
+from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('ppdet.engine')
+
+__all__ = ['Trainer']
+
+class Trainer(object):
+    def __init__(self, cfg, mode='train'):
+        """Build the dataset, model and mode-specific helpers described by `cfg`.
+
+        `mode` is 'train', 'eval' or 'test' (any case); test mode makes no loader here.
+        """
+        self.cfg = cfg.copy()
+        assert mode.lower() in ['train', 'eval', 'test'], \
+                "mode should be 'train', 'eval' or 'test'"
+        self.mode = mode.lower()
+        self.optimizer = None
+        self.is_loaded_weights = False
+        self.use_amp = self.cfg.get('amp', False)
+        self.amp_level = self.cfg.get('amp_level', 'O1')
+        self.custom_white_list = self.cfg.get('custom_white_list', None)
+        self.custom_black_list = self.cfg.get('custom_black_list', None)
+
+        # build the '<Mode>Dataset' and, in train mode, its data loader
+        capital_mode = self.mode.capitalize()
+        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
+            '{}Dataset'.format(capital_mode))()
+
+        if self.mode == 'train':
+            self.loader = create('{}Reader'.format(capital_mode))(
+                self.dataset, cfg.worker_num)
+
+        # build model (or reuse the one already stored under cfg['model'])
+        if 'model' not in self.cfg:
+            self.model = create(cfg.architecture)
+        else:
+            self.model = self.cfg.model
+            self.is_loaded_weights = True  # load_weights() then returns early
+
+        # EvalDataset build with BatchSampler to evaluate in single device
+        # TODO: multi-device evaluate
+        if self.mode == 'eval':
+            self._eval_batch_sampler = paddle.io.BatchSampler(
+                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
+            reader_name = '{}Reader'.format(self.mode.capitalize())
+            # If metric is VOC, need to be set collate_batch=False.
+            if cfg.metric == 'VOC':
+                self.cfg[reader_name]['collate_batch'] = False
+            self.loader = create(reader_name)(self.dataset, cfg.worker_num,
+                                                self._eval_batch_sampler)
+        # TestDataset build after user set images, skip loader creation here
+
+        # optionally log the parameter count, in millions
+        print_params = self.cfg.get('print_params', False)
+        if print_params:
+            params = sum([
+                p.numel() for n, p in self.model.named_parameters()
+                if all([x not in n for x in ['_mean', '_variance', 'aux_']])
+            ])  # exclude BatchNorm running status
+            logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[
+                0]))
+
+        # build optimizer in train mode
+        if self.mode == 'train':
+            steps_per_epoch = len(self.loader)
+            if steps_per_epoch < 1:
+                logger.warning(
+                    "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
+                )
+            self.lr = create('LearningRate')(steps_per_epoch)
+            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
+
+        if self.use_amp and self.amp_level == 'O2':
+            self.model, self.optimizer = paddle.amp.decorate(
+                models=self.model,
+                optimizers=self.optimizer,
+                level=self.amp_level)
+        self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
+        if self.use_ema:  # build the ModelEMA helper from the ema_* cfg options
+            ema_decay = self.cfg.get('ema_decay', 0.9998)
+            ema_decay_type = self.cfg.get('ema_decay_type', 'threshold')
+            cycle_epoch = self.cfg.get('cycle_epoch', -1)
+            ema_black_list = self.cfg.get('ema_black_list', None)
+            ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False)
+            self.ema = ModelEMA(
+                self.model,
+                decay=ema_decay,
+                ema_decay_type=ema_decay_type,
+                cycle_epoch=cycle_epoch,
+                ema_black_list=ema_black_list,
+                ema_filter_no_grad=ema_filter_no_grad)
+
+        self._nranks = dist.get_world_size()
+        self._local_rank = dist.get_rank()
+        self.status = {}
+        self.start_epoch = 0
+        self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch
+
+        # initialize default callbacks
+        self._init_callbacks()
+        # initialize default metrics
+        self._init_metrics()
+        self._reset_metrics()
+
+    def _init_callbacks(self):
+        """Create the default callbacks for self.mode and compose them if any exist."""
+        mode, cfg = self.mode, self.cfg
+        if mode == 'train':
+            self._callbacks = [LogPrinter(self), Checkpointer(self)]
+            if cfg.get('use_vdl', False):
+                self._callbacks.append(VisualDLWriter(self))
+            if cfg.get('use_wandb', False) or 'wandb' in cfg:
+                self._callbacks.append(WandbCallback(self))
+        elif mode == 'eval':
+            self._callbacks = [LogPrinter(self)]
+        elif mode == 'test' and cfg.get('use_vdl', False):
+            self._callbacks = [VisualDLWriter(self)]
+        else:
+            self._callbacks = []
+        # An empty callback list (test without use_vdl, or another mode) composes to None.
+        self._compose_callback = ComposeCallback(self._callbacks) if self._callbacks else None
+
+    def _init_metrics(self, validate=False):
+        """Set self._metrics for the configured metric type.
+
+        Test mode, and train mode without `validate`, get an empty list. COCO
+        and VOC each get a single metric object; other types log a warning.
+        """
+        if self.mode == 'test' or (self.mode == 'train' and not validate):
+            self._metrics = []
+            return
+        cfg = self.cfg
+        metric_type = cfg.metric
+        classwise = cfg.get('classwise', False)
+        output_eval = cfg.get('output_eval', None)
+        save_prediction_only = cfg.get('save_prediction_only', False)
+        if metric_type == 'COCO':
+            # TODO: bias should be unified
+            bias = 1 if cfg.get('bias', False) else 0
+            # pass clsid2catid info to metric instance to avoid multiple loading
+            # annotation file
+            clsid2catid = None
+            if self.mode == 'eval':
+                catid2clsid = self.dataset.catid2clsid
+                clsid2catid = {clsid: catid for catid, clsid in catid2clsid.items()}
+
+            # when do validation in train, annotation file should be get from
+            # EvalReader instead of self.dataset(which is TrainReader)
+            if self.mode == 'train' and validate:
+                eval_dataset = cfg['EvalDataset']
+                eval_dataset.check_or_download_dataset()
+                anno_file = eval_dataset.get_anno()
+            else:
+                anno_file = self.dataset.get_anno()
+
+            self._metrics = [
+                COCOMetric(
+                    anno_file=anno_file,
+                    clsid2catid=clsid2catid,
+                    classwise=classwise,
+                    output_eval=output_eval,
+                    bias=bias,
+                    IouType=cfg.get('IouType', 'bbox'),
+                    save_prediction_only=save_prediction_only)
+            ]
+        elif metric_type == 'VOC':
+            self._metrics = [
+                VOCMetric(
+                    label_list=self.dataset.get_label_list(),
+                    class_num=cfg.num_classes,
+                    map_type=cfg.map_type,
+                    classwise=classwise,
+                    output_eval=output_eval,
+                    save_prediction_only=save_prediction_only)
+            ]
+        else:
+            logger.warning("Metric not support for metric type {}".format(
+                metric_type))
+            self._metrics = []
+
+    def _reset_metrics(self):  # call reset() on each registered metric, in order
+        for registered in self._metrics:
+            registered.reset()
+
+    def register_callbacks(self, callbacks):
+        """Add the non-None `callbacks` (Callback instances) and recompose them."""
+        callbacks = [c for c in callbacks if c is not None]
+        assert all(isinstance(c, Callback) for c in callbacks), \
+                "callbacks should be instances of subclass of Callback"
+        self._callbacks.extend(callbacks)
+        self._compose_callback = ComposeCallback(self._callbacks)
+
+    def register_metrics(self, metrics):
+        """Append every non-None entry of `metrics`; each must be a Metric instance."""
+        accepted = [metric for metric in metrics if metric is not None]
+        assert all(isinstance(metric, Metric) for metric in accepted), \
+                "metrics shoule be instances of subclass of Metric"
+        self._metrics.extend(accepted)
+
+    def load_weights(self, weights, ARSL_eval=False):
+        """Load pretrained `weights` into the model; no-op if is_loaded_weights is set."""
+        if not self.is_loaded_weights:
+            self.start_epoch = 0
+            load_pretrain_weight(self.model, weights, ARSL_eval)
+            logger.debug("Load weights {} to start training".format(weights))
+
+    def resume_weights(self, weights):  # start_epoch := value returned by load_weight
+        ema_state = self.ema if self.use_ema else None
+        self.start_epoch = load_weight(self.model, weights, self.optimizer, ema_state)
+        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
+
+    def train(self, validate=False):
+        assert self.mode == 'train', "Model not in 'train' mode"
+        Init_mark = False
+        if validate:
+            self.cfg['EvalDataset'] = self.cfg.EvalDataset = create(
+                "EvalDataset")()
+
+        model = self.model
+        if self.cfg.get('to_static', False):
+            model = apply_to_static(self.cfg, model)
+        sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and
+                   (self.cfg.use_gpu or self.cfg.use_mlu) and self._nranks > 1)
+        if sync_bn:
+            model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+        # enabel auto mixed precision mode
+        if self.use_amp:
+            scaler = paddle.amp.GradScaler(
+                enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu,
+                init_loss_scaling=self.cfg.get('init_loss_scaling', 1024))
+        # get distributed model
+        if self.cfg.get('fleet', False):
+            model = fleet.distributed_model(model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            model = paddle.DataParallel(
+                model, find_unused_parameters=find_unused_parameters)
+
+        self.status.update({
+            'epoch_id': self.start_epoch,
+            'step_id': 0,
+            'steps_per_epoch': len(self.loader)
+        })
+
+        self.status['batch_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['data_time'] = stats.SmoothedValue(
+            self.cfg.log_iter, fmt='{avg:.4f}')
+        self.status['training_status'] = stats.TrainingStats(self.cfg.log_iter)
+
+        profiler_options = self.cfg.get('profiler_options', None)
+
+        self._compose_callback.on_train_begin(self.status)
+
+        use_fused_allreduce_gradients = self.cfg[
+            'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False
+
+        for epoch_id in range(self.start_epoch, self.cfg.epoch):
+            self.status['mode'] = 'train'
+            self.status['epoch_id'] = epoch_id
+            self._compose_callback.on_epoch_begin(self.status)
+            self.loader.dataset.set_epoch(epoch_id)
+            model.train()
+            iter_tic = time.time()
+            for step_id, data in enumerate(self.loader):
+                self.status['data_time'].update(time.time() - iter_tic)
+                self.status['step_id'] = step_id
+                profiler.add_profiler_step(profiler_options)
+                self._compose_callback.on_step_begin(self.status)
+                data['epoch_id'] = epoch_id
+                if self.cfg.get('to_static',
+                                False) and 'image_file' in data.keys():
+                    data.pop('image_file')
+
+                if self.use_amp:
+                    if isinstance(
+                            model, paddle.
+                            DataParallel) and use_fused_allreduce_gradients:
+                        with model.no_sync():
+                            with paddle.amp.auto_cast(
+                                    enable=self.cfg.use_gpu or
+                                    self.cfg.use_npu or self.cfg.use_mlu,
+                                    custom_white_list=self.custom_white_list,
+                                    custom_black_list=self.custom_black_list,
+                                    level=self.amp_level):
+                                # model forward
+                                outputs = model(data)
+                                loss = outputs['loss']
+                            # model backward
+                            scaled_loss = scaler.scale(loss)
+                            scaled_loss.backward()
+                        fused_allreduce_gradients(
+                            list(model.parameters()), None)
+                    else:
+                        with paddle.amp.auto_cast(
+                                enable=self.cfg.use_gpu or self.cfg.use_npu or
+                                self.cfg.use_mlu,
+                                custom_white_list=self.custom_white_list,
+                                custom_black_list=self.custom_black_list,
+                                level=self.amp_level):
+                            # model forward
+                            outputs = model(data)
+                            loss = outputs['loss']
+                        # model backward
+                        scaled_loss = scaler.scale(loss)
+                        scaled_loss.backward()
+                    # in dygraph mode, optimizer.minimize is equal to optimizer.step
+                    scaler.minimize(self.optimizer, scaled_loss)
+                else:
+                    if isinstance(
+                            model, paddle.
+                            DataParallel) and use_fused_allreduce_gradients:
+                        with model.no_sync():
+                            # model forward
+                            outputs = model(data)
+                            loss = outputs['loss']
+                            # model backward
+                            loss.backward()
+                        fused_allreduce_gradients(
+                            list(model.parameters()), None)
+                    else:
+                        # model forward
+                        outputs = model(data)
+                        loss = outputs['loss']
+                        # model backward
+                        loss.backward()
+                    self.optimizer.step()
+                curr_lr = self.optimizer.get_lr()
+                self.lr.step()
+                self.optimizer.clear_grad()
+                self.status['learning_rate'] = curr_lr
+
+                if self._nranks < 2 or self._local_rank == 0:
+                    self.status['training_status'].update(outputs)
+
+                self.status['batch_time'].update(time.time() - iter_tic)
+                self._compose_callback.on_step_end(self.status)
+                if self.use_ema:
+                    self.ema.update()
+                iter_tic = time.time()
+
+            is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \
+                       and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1)
+            if is_snapshot and self.use_ema:
+                # apply ema weight on model
+                weight = copy.deepcopy(self.model.state_dict())
+                self.model.set_dict(self.ema.apply())
+                self.status['weight'] = weight
+
+            self._compose_callback.on_epoch_end(self.status)
+
+            if validate and is_snapshot:
+                if not hasattr(self, '_eval_loader'):
+                    # build evaluation dataset and loader
+                    self._eval_dataset = self.cfg.EvalDataset
+                    self._eval_batch_sampler = \
+                        paddle.io.BatchSampler(
+                            self._eval_dataset,
+                            batch_size=self.cfg.EvalReader['batch_size'])
+                    # If metric is VOC, need to be set collate_batch=False.
+                    if self.cfg.metric == 'VOC':
+                        self.cfg['EvalReader']['collate_batch'] = False
+                    else:
+                        self._eval_loader = create('EvalReader')(
+                            self._eval_dataset,
+                            self.cfg.worker_num,
+                            batch_sampler=self._eval_batch_sampler)
+                # if validation in training is enabled, metrics should be re-init
+                # Init_mark makes sure this code will only execute once
+                if validate and Init_mark == False:
+                    Init_mark = True
+                    self._init_metrics(validate=validate)
+                    self._reset_metrics()
+
+                with paddle.no_grad():
+                    self.status['save_best_model'] = True
+                    self._eval_with_loader(self._eval_loader)
+
+            if is_snapshot and self.use_ema:
+                # reset original weight
+                self.model.set_dict(weight)
+                self.status.pop('weight')
+
+        self._compose_callback.on_train_end(self.status)
+
+    def _eval_with_loader(self, loader):
+        """Run one evaluation pass over `loader`, then accumulate and log metrics."""
+        sample_num = 0
+        tic = time.time()
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+        self.model.eval()
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+            # forward pass, wrapped in auto mixed precision when use_amp is set
+            if self.use_amp:
+                with paddle.amp.auto_cast(
+                        enable=self.cfg.use_gpu or self.cfg.use_npu or
+                        self.cfg.use_mlu,
+                        custom_white_list=self.custom_white_list,
+                        custom_black_list=self.custom_black_list,
+                        level=self.amp_level):
+                    outs = self.model(data)
+            else:
+                outs = self.model(data)
+
+            # feed this batch to every registered metric
+            for metric in self._metrics:
+                metric.update(data, outs)
+
+            # multi-scale inputs share one im_id, so the first entry gives the count
+            if isinstance(data, typing.Sequence):
+                sample_num += data[0]['im_id'].numpy().shape[0]
+            else:
+                sample_num += data['im_id'].numpy().shape[0]
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = sample_num
+        self.status['cost_time'] = time.time() - tic
+
+        # accumulate each metric over the whole pass and log it
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # reset metric states because evaluation may be performed multiple times
+        self._reset_metrics()
+
+    def evaluate(self):
+        """Evaluate on self.loader, wrapping the model for distributed runs first."""
+        if self.cfg.get('fleet', False):
+            self.model = fleet.distributed_model(self.model)
+            self.optimizer = fleet.distributed_optimizer(self.optimizer)
+        elif self._nranks > 1:  # several ranks without fleet: use DataParallel
+            find_unused_parameters = self.cfg[
+                'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
+            self.model = paddle.DataParallel(
+                self.model, find_unused_parameters=find_unused_parameters)
+        with paddle.no_grad():  # evaluation only, gradient tracking is disabled
+            self._eval_with_loader(self.loader)
+
+    def _eval_with_loader_slice(self,
+                                loader,
+                                slice_size=[640, 640],  # not read in this method
+                                overlap_ratio=[0.25, 0.25],  # not read in this method
+                                combine_method='nms',  # 'nms' or 'concat'
+                                match_threshold=0.6,  # forwarded to multiclass_nms
+                                match_metric='iou'):  # forwarded to multiclass_nms
+        """Evaluate sliced inputs: merge the slice predictions, then update metrics."""
+        sample_num = 0
+        tic = time.time()
+        self._compose_callback.on_epoch_begin(self.status)
+        self.status['mode'] = 'eval'
+        self.model.eval()
+        merged_bboxs = []
+        for step_id, data in enumerate(loader):
+            self.status['step_id'] = step_id
+            self._compose_callback.on_step_begin(self.status)
+            # forward pass, wrapped in auto mixed precision when use_amp is set
+            if self.use_amp:
+                with paddle.amp.auto_cast(
+                        enable=self.cfg.use_gpu or self.cfg.use_npu or
+                        self.cfg.use_mlu,
+                        custom_white_list=self.custom_white_list,
+                        custom_black_list=self.custom_black_list,
+                        level=self.amp_level):
+                    outs = self.model(data)
+            else:
+                outs = self.model(data)
+
+            shift_amount = data['st_pix']  # presumably the slice offset - TODO confirm
+            outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount
+            outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount
+            merged_bboxs.append(outs['bbox'])
+
+            if data['is_last'] > 0:
+                # merge the predictions collected since the previous merge
+                merged_results = {'bbox': []}
+                if combine_method == 'nms':
+                    final_boxes = multiclass_nms(
+                        np.concatenate(merged_bboxs), self.cfg.num_classes,
+                        match_threshold, match_metric)
+                    merged_results['bbox'] = np.concatenate(final_boxes)
+                elif combine_method == 'concat':
+                    merged_results['bbox'] = np.concatenate(merged_bboxs)
+                else:
+                    raise ValueError(
+                        "Now only support 'nms' or 'concat' to fuse detection results."
+                    )
+                merged_results['im_id'] = np.array([[0]])
+                merged_results['bbox_num'] = np.array(
+                    [len(merged_results['bbox'])])
+
+                merged_bboxs = []
+                data['im_id'] = data['ori_im_id']
+                # update metrics with the merged result instead of the raw output
+                for metric in self._metrics:
+                    metric.update(data, merged_results)
+                # multi-scale inputs share one im_id, so the first entry gives the count
+                if isinstance(data, typing.Sequence):
+                    sample_num += data[0]['im_id'].numpy().shape[0]
+                else:
+                    sample_num += data['im_id'].numpy().shape[0]
+
+            self._compose_callback.on_step_end(self.status)
+
+        self.status['sample_num'] = sample_num
+        self.status['cost_time'] = time.time() - tic
+
+        # accumulate each metric over the whole pass and log it
+        for metric in self._metrics:
+            metric.accumulate()
+            metric.log()
+        self._compose_callback.on_epoch_end(self.status)
+        # reset metric states because evaluation may be performed multiple times
+        self._reset_metrics()
+
+    def evaluate_slice(self,  # sliced-image evaluation over self.loader
+                       slice_size=[640, 640],
+                       overlap_ratio=[0.25, 0.25],
+                       combine_method='nms',
+                       match_threshold=0.6,
+                       match_metric='iou'):
+        with paddle.no_grad():  # evaluation only, gradient tracking is disabled
+            self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio,
+                                         combine_method, match_threshold,
+                                         match_metric)
+
+    def slice_predict(self,
+                      images,
+                      slice_size=[640, 640],
+                      overlap_ratio=[0.25, 0.25],
+                      combine_method='nms',  # 'nms' or 'concat'
+                      match_threshold=0.6,  # forwarded to multiclass_nms
+                      match_metric='iou',  # forwarded to multiclass_nms
+                      draw_threshold=0.5,  # forwarded to visualize_results
+                      output_dir='output',
+                      save_results=False,
+                      visualize=True):
+        """Predict on sliced `images`, merge the slices, return the merged results."""
+        os.makedirs(output_dir, exist_ok=True)  # no check-then-create race
+
+        self.dataset.set_slice_images(images, slice_size, overlap_ratio)
+        loader = create('TestReader')(self.dataset, 0)
+        imid2path = self.dataset.get_imid2path()
+
+        def setup_metrics_for_loader():
+            # remember the current state so it can be restored below
+            metrics = copy.deepcopy(self._metrics)
+            mode = self.mode
+            save_prediction_only = self.cfg[
+                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
+            output_eval = self.cfg[
+                'output_eval'] if 'output_eval' in self.cfg else None
+
+            # temporarily override mode / cfg, then build metrics from them
+            self.mode = '_test'
+            self.cfg['save_prediction_only'] = True
+            self.cfg['output_eval'] = output_dir
+            self.cfg['imid2path'] = imid2path
+            self._init_metrics()
+
+            # restore the previous mode and cfg entries
+            self.mode = mode
+            self.cfg.pop('save_prediction_only')
+            if save_prediction_only is not None:
+                self.cfg['save_prediction_only'] = save_prediction_only
+
+            self.cfg.pop('output_eval')
+            if output_eval is not None:
+                self.cfg['output_eval'] = output_eval
+
+            self.cfg.pop('imid2path')
+
+            _metrics = copy.deepcopy(self._metrics)
+            self._metrics = metrics
+
+            return _metrics
+
+        if save_results:
+            metrics = setup_metrics_for_loader()
+        else:
+            metrics = []
+
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+
+        # Run inference
+        self.status['mode'] = 'test'
+        self.model.eval()
+
+        results = []  # all images
+        merged_bboxs = []  # single image
+        for step_id, data in enumerate(tqdm(loader)):
+            self.status['step_id'] = step_id
+            # forward
+            with paddle.no_grad():
+                outs = self.model(data)
+
+            outs['bbox'] = outs['bbox'].numpy()  # only in test mode
+            shift_amount = data['st_pix']
+            outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy()
+            outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy()
+            merged_bboxs.append(outs['bbox'])
+
+            if data['is_last'] > 0:
+                # merge the predictions collected since the previous merge
+                merged_results = {'bbox': []}
+                if combine_method == 'nms':
+                    final_boxes = multiclass_nms(
+                        np.concatenate(merged_bboxs), self.cfg.num_classes,
+                        match_threshold, match_metric)
+                    merged_results['bbox'] = np.concatenate(final_boxes)
+                elif combine_method == 'concat':
+                    merged_results['bbox'] = np.concatenate(merged_bboxs)
+                else:
+                    raise ValueError(
+                        "Now only support 'nms' or 'concat' to fuse detection results."
+                    )
+                merged_results['im_id'] = np.array([[0]])
+                merged_results['bbox_num'] = np.array(
+                    [len(merged_results['bbox'])])
+
+                merged_bboxs = []
+                data['im_id'] = data['ori_im_id']
+
+                for _m in metrics:
+                    _m.update(data, merged_results)
+
+                for key in ['im_shape', 'scale_factor', 'im_id']:
+                    if isinstance(data, typing.Sequence):
+                        merged_results[key] = data[0][key]
+                    else:
+                        merged_results[key] = data[key]
+                for key, value in merged_results.items():
+                    if hasattr(value, 'numpy'):
+                        merged_results[key] = value.numpy()
+                results.append(merged_results)
+
+        for _m in metrics:
+            _m.accumulate()
+            _m.reset()
+
+        if visualize:
+            for outs in results:
+                batch_res = get_infer_results(outs, clsid2catid)
+                bbox_num = outs['bbox_num']
+
+                start = 0
+                for i, im_id in enumerate(outs['im_id']):
+                    image_path = imid2path[int(im_id)]
+                    image = Image.open(image_path).convert('RGB')
+                    image = ImageOps.exif_transpose(image)
+                    self.status['original_image'] = np.array(image.copy())
+
+                    end = start + bbox_num[i]
+                    bbox_res = batch_res['bbox'][start:end] \
+                            if 'bbox' in batch_res else None
+                    mask_res = batch_res['mask'][start:end] \
+                            if 'mask' in batch_res else None
+                    segm_res = batch_res['segm'][start:end] \
+                            if 'segm' in batch_res else None
+                    keypoint_res = batch_res['keypoint'][start:end] \
+                            if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
+                    image = visualize_results(
+                        image, bbox_res, mask_res, segm_res, keypoint_res,
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
+                    self.status['result_image'] = np.array(image.copy())
+                    if self._compose_callback:
+                        self._compose_callback.on_step_end(self.status)
+                    # save image with detection
+                    save_name = self._get_save_image_name(output_dir,
+                                                          image_path)
+                    logger.info("Detection bbox results save in {}".format(
+                        save_name))
+                    image.save(save_name, quality=95)
+                    start = end
+        return results
+
+    def predict(self,
+                images,
+                draw_threshold=0.5,  # forwarded to visualize_results
+                output_dir='output',
+                save_results=False,
+                visualize=True):
+        """Run inference on `images`, optionally save/draw, return the outputs."""
+        os.makedirs(output_dir, exist_ok=True)  # no check-then-create race
+
+        self.dataset.set_images(images)
+        loader = create('TestReader')(self.dataset, 0)
+
+        imid2path = self.dataset.get_imid2path()
+
+        def setup_metrics_for_loader():
+            # remember the current state so it can be restored below
+            metrics = copy.deepcopy(self._metrics)
+            mode = self.mode
+            save_prediction_only = self.cfg[
+                'save_prediction_only'] if 'save_prediction_only' in self.cfg else None
+            output_eval = self.cfg[
+                'output_eval'] if 'output_eval' in self.cfg else None
+
+            # temporarily override mode / cfg, then build metrics from them
+            self.mode = '_test'
+            self.cfg['save_prediction_only'] = True
+            self.cfg['output_eval'] = output_dir
+            self.cfg['imid2path'] = imid2path
+            self._init_metrics()
+
+            # restore the previous mode and cfg entries
+            self.mode = mode
+            self.cfg.pop('save_prediction_only')
+            if save_prediction_only is not None:
+                self.cfg['save_prediction_only'] = save_prediction_only
+
+            self.cfg.pop('output_eval')
+            if output_eval is not None:
+                self.cfg['output_eval'] = output_eval
+
+            self.cfg.pop('imid2path')
+
+            _metrics = copy.deepcopy(self._metrics)
+            self._metrics = metrics
+
+            return _metrics
+
+        if save_results:
+            metrics = setup_metrics_for_loader()
+        else:
+            metrics = []
+
+        anno_file = self.dataset.get_anno()
+        clsid2catid, catid2name = get_categories(
+            self.cfg.metric, anno_file=anno_file)
+
+        # Run inference
+        self.status['mode'] = 'test'
+        self.model.eval()
+
+        results = []
+        for step_id, data in enumerate(tqdm(loader)):
+            self.status['step_id'] = step_id
+            # forward; a `modelTeacher` attribute, when present, is used instead
+            with paddle.no_grad():
+                if hasattr(self.model, 'modelTeacher'):
+                    outs = self.model.modelTeacher(data)
+                else:
+                    outs = self.model(data)
+
+            for _m in metrics:
+                _m.update(data, outs)
+
+            for key in ['im_shape', 'scale_factor', 'im_id']:
+                if isinstance(data, typing.Sequence):
+                    outs[key] = data[0][key]
+                else:
+                    outs[key] = data[key]
+            for key, value in outs.items():
+                if hasattr(value, 'numpy'):
+                    outs[key] = value.numpy()
+            results.append(outs)
+
+        for _m in metrics:
+            _m.accumulate()
+            _m.reset()
+
+        if visualize:
+            for outs in results:
+                batch_res = get_infer_results(outs, clsid2catid)
+                bbox_num = outs['bbox_num']
+
+                start = 0
+                for i, im_id in enumerate(outs['im_id']):
+                    image_path = imid2path[int(im_id)]
+                    image = Image.open(image_path).convert('RGB')
+                    image = ImageOps.exif_transpose(image)
+                    self.status['original_image'] = np.array(image.copy())
+
+                    end = start + bbox_num[i]
+                    bbox_res = batch_res['bbox'][start:end] \
+                            if 'bbox' in batch_res else None
+                    mask_res = batch_res['mask'][start:end] \
+                            if 'mask' in batch_res else None
+                    segm_res = batch_res['segm'][start:end] \
+                            if 'segm' in batch_res else None
+                    keypoint_res = batch_res['keypoint'][start:end] \
+                            if 'keypoint' in batch_res else None
+                    pose3d_res = batch_res['pose3d'][start:end] \
+                            if 'pose3d' in batch_res else None
+                    image = visualize_results(
+                        image, bbox_res, mask_res, segm_res, keypoint_res,
+                        pose3d_res, int(im_id), catid2name, draw_threshold)
+                    self.status['result_image'] = np.array(image.copy())
+                    if self._compose_callback:
+                        self._compose_callback.on_step_end(self.status)
+                    # save image with detection
+                    save_name = self._get_save_image_name(output_dir,
+                                                          image_path)
+                    logger.info("Detection bbox results save in {}".format(
+                        save_name))
+                    image.save(save_name, quality=95)
+
+                    start = end
+        return results
+
+    def _get_save_image_name(self, output_dir, image_path):
+        """
+        Build the output path: `output_dir` + source file name (same extension).
+        """
+        file_name = os.path.basename(image_path)
+        stem, suffix = os.path.splitext(file_name)
+        return os.path.join(output_dir, str(stem)) + suffix
+
+    def _get_infer_cfg_and_input_spec(self,
+                                      save_dir,
+                                      prune_input=True,
+                                      kl_quant=False):  # kl_quant is not read here
+        """Prepare the model for export, dump infer_cfg.yml, build the input spec."""
+        image_shape = None
+        im_shape = [None, 2]
+        scale_factor = [None, 2]
+        test_reader_name = 'TestReader'
+        if 'inputs_def' in self.cfg[test_reader_name]:
+            inputs_def = self.cfg[test_reader_name]['inputs_def']
+            image_shape = inputs_def.get('image_shape', None)
+        # set image_shape=[None, 3, -1, -1] as default
+        if image_shape is None:
+            image_shape = [None, 3, -1, -1]
+        if len(image_shape) == 3:  # three entries: prepend a None leading dim
+            image_shape = [None] + image_shape
+        else:  # otherwise reuse the leading entry for im_shape / scale_factor
+            im_shape = [image_shape[0], 2]
+            scale_factor = [image_shape[0], 2]
+    
+        if hasattr(self.model, 'deploy'):
+            self.model.deploy = True
+    
+        for layer in self.model.sublayers():
+            if hasattr(layer, 'convert_to_deploy'):
+                layer.convert_to_deploy()
+    
+        if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[
+                'export'] and self.cfg['export']['fuse_conv_bn']:
+            self.model = fuse_conv_bn(self.model)
+    
+        export_post_process = self.cfg['export'].get(
+            'post_process', False) if hasattr(self.cfg, 'export') else True
+        export_nms = self.cfg['export'].get('nms', False) if hasattr(
+            self.cfg, 'export') else True
+        export_benchmark = self.cfg['export'].get(
+            'benchmark', False) if hasattr(self.cfg, 'export') else False
+        if hasattr(self.model, 'export_post_process'):
+            self.model.export_post_process = export_post_process if not export_benchmark else False
+        if hasattr(self.model, 'export_nms'):
+            self.model.export_nms = export_nms if not export_benchmark else False
+        if export_post_process and not export_benchmark:  # reset leading dim to None
+            image_shape = [None] + image_shape[1:]
+    
+        # Save infer cfg as <save_dir>/infer_cfg.yml
+        _dump_infer_config(self.cfg,
+                           os.path.join(save_dir, 'infer_cfg.yml'), image_shape,
+                           self.model)
+    
+        input_spec = [{
+            "image": InputSpec(
+                shape=image_shape, name='image'),
+            "im_shape": InputSpec(
+                shape=im_shape, name='im_shape'),
+            "scale_factor": InputSpec(
+                shape=scale_factor, name='scale_factor')
+        }]
+    
+        if prune_input:
+            static_model = paddle.jit.to_static(
+                self.model, input_spec=input_spec, full_graph=True)
+            # NOTE: dy2st do not pruned program, but jit.save will prune program
+            # input spec, prune input spec here and save with pruned input spec
+            pruned_input_spec = _prune_input_spec(
+                input_spec, static_model.forward.main_program,
+                static_model.forward.outputs)
+        else:  # no static model is built; the unpruned spec is returned
+            static_model = None
+            pruned_input_spec = input_spec
+    
+        return static_model, pruned_input_spec
+
+    def export(self, output_dir='output_inference'):
+        """Export the model for inference into `output_dir`/<config file stem>."""
+        if hasattr(self.model, 'aux_neck'):
+            delattr(self.model, 'aux_neck')
+        if hasattr(self.model, 'aux_head'):
+            delattr(self.model, 'aux_head')
+        self.model.eval()
+
+        model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0]
+        save_dir = os.path.join(output_dir, model_name)
+        os.makedirs(save_dir, exist_ok=True)  # no check-then-create race
+
+        static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec(
+            save_dir)
+
+        # dy2st and save model
+        paddle.jit.save(
+            static_model,
+            os.path.join(save_dir, 'model'),
+            input_spec=pruned_input_spec)
+
+        logger.info("Export model and saved in {}".format(save_dir))
diff --git a/rtdetr_paddle/ppdet/metrics/__init__.py b/rtdetr_paddle/ppdet/metrics/__init__.py
new file mode 100644
index 0000000..afca8d0
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import metrics
+
+from .metrics import *
+from .pose3d_metrics import *
+
+from . import mot_metrics
+from .mot_metrics import *
+# `__all__` is assembled once below, after every metrics module is imported.
+
+from . import mcmot_metrics
+from .mcmot_metrics import *
+__all__ = metrics.__all__ + mot_metrics.__all__ + mcmot_metrics.__all__
\ No newline at end of file
diff --git a/rtdetr_paddle/ppdet/metrics/coco_utils.py b/rtdetr_paddle/ppdet/metrics/coco_utils.py
new file mode 100644
index 0000000..b7a4d7e
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/coco_utils.py
@@ -0,0 +1,188 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import numpy as np
+import itertools
+
+from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res
+from ppdet.metrics.map_utils import draw_pr_curve
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+def get_infer_results(outs, catid, bias=0):
+    """
+    Convert raw model outputs into per-task result lists for inference.
+    Returns a dict keyed by the tasks found in `outs` ('bbox', 'mask', 'segm',
+    'keypoint', 'pose3d'); e.g. each bbox entry holds image_id, category_id,
+    bbox and score. Raises ValueError when `outs` is None or empty.
+    NOTE: `outs['bbox_num']` is overwritten for keypoint / pose3d outputs.
+    """
+    if outs is None or len(outs) == 0:
+        raise ValueError(
+            'The number of valid detection result is zero. Please use reasonable model and check input data.'
+        )
+
+    im_id = outs['im_id']
+
+    infer_res = {}
+    if 'bbox' in outs:
+        if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6:
+            infer_res['bbox'] = get_det_poly_res(
+                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
+        else:
+            infer_res['bbox'] = get_det_res(
+                outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)
+
+    if 'mask' in outs:
+        # mask post process
+        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],
+                                        outs['bbox_num'], im_id, catid)
+
+    if 'segm' in outs:
+        infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)
+
+    if 'keypoint' in outs:
+        infer_res['keypoint'] = get_keypoint_res(outs, im_id)
+        outs['bbox_num'] = [len(infer_res['keypoint'])]
+
+    if 'pose3d' in outs:
+        infer_res['pose3d'] = get_pose3d_res(outs, im_id)
+        outs['bbox_num'] = [len(infer_res['pose3d'])]
+
+    return infer_res
+
+
+def cocoapi_eval(jsonfile,
+                 style,
+                 coco_gt=None,
+                 anno_file=None,
+                 max_dets=(100, 300, 1000),
+                 classwise=False,
+                 sigmas=None,
+                 use_area=True):
+    """Evaluate a results json with COCOeval and return `coco_eval.stats`.
+    Args:
+        jsonfile (str): Evaluation json file, eg: bbox.json, mask.json.
+        style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`.
+        coco_gt (COCO): Loaded COCO API object; when None it is built from
+                 anno_file, eg: coco_gt = COCO(anno_file)
+        anno_file (str): COCO annotations file.
+        max_dets (tuple): COCO evaluation maxDets.
+        classwise (bool): Whether per-category AP and draw P-R Curve or not.
+        sigmas (nparray): keypoint labelling sigmas.
+        use_area (bool): If gt annotations (eg. CrowdPose, AIC)
+                         do not have 'area', please set use_area=False.
+    """
+    assert coco_gt is not None or anno_file is not None
+    if style == 'keypoints_crowd':
+        # please install xtcocotools==1.6
+        from xtcocotools.coco import COCO
+        from xtcocotools.cocoeval import COCOeval
+    else:
+        from pycocotools.coco import COCO
+        from pycocotools.cocoeval import COCOeval
+
+    if coco_gt is None:
+        coco_gt = COCO(anno_file)
+    logger.info("Start evaluate...")
+    coco_dt = coco_gt.loadRes(jsonfile)
+    if style == 'proposal':
+        coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
+        coco_eval.params.useCats = 0
+        coco_eval.params.maxDets = list(max_dets)
+    elif style == 'keypoints_crowd':
+        coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area)
+    else:
+        coco_eval = COCOeval(coco_gt, coco_dt, style)
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    if classwise:
+        # Compute per-category AP and PR curve
+        try:
+            from terminaltables import AsciiTable
+        except Exception as e:
+            logger.error(
+                'terminaltables not found, please install terminaltables. '
+                'for example: `pip install terminaltables`.')
+            raise e
+        precisions = coco_eval.eval['precision']
+        cat_ids = coco_gt.getCatIds()
+        # precision: (iou, recall, cls, area range, max dets)
+        assert len(cat_ids) == precisions.shape[2]
+        results_per_category = []
+        for idx, catId in enumerate(cat_ids):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            nm = coco_gt.loadCats(catId)[0]
+            precision = precisions[:, :, idx, 0, -1]
+            precision = precision[precision > -1]
+            if precision.size:
+                ap = np.mean(precision)
+            else:
+                ap = float('nan')
+            results_per_category.append(
+                (str(nm["name"]), '{:0.3f}'.format(float(ap))))
+            pr_array = precisions[0, :, idx, 0, -1]  # last maxDets, as for AP
+            recall_array = np.arange(0.0, 1.01, 0.01)
+            draw_pr_curve(
+                pr_array,
+                recall_array,
+                out_dir=style + '_pr_curve',
+                file_name='{}_precision_recall_curve.jpg'.format(nm["name"]))
+
+        num_columns = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        headers = ['category', 'AP'] * (num_columns // 2)
+        results_2d = itertools.zip_longest(
+            * [results_flatten[i::num_columns] for i in range(num_columns)])
+        table_data = [headers]
+        table_data += [result for result in results_2d]
+        table = AsciiTable(table_data)
+        logger.info('Per-category of {} AP: \n{}'.format(style, table.table))
+        logger.info("per-category PR curve has output to {} folder.".format(
+            style + '_pr_curve'))
+    # flush coco evaluation result
+    sys.stdout.flush()
+    return coco_eval.stats
+
+
+def json_eval_results(metric, json_directory, dataset):
+    """
+    Evaluate already-saved proposal.json, bbox.json and mask.json with cocoapi.
+    """
+    assert metric == 'COCO'
+    anno_file = dataset.get_anno()
+    # result file name -> COCOeval style, in evaluation order
+    style_by_file = (('proposal.json', 'proposal'), ('bbox.json', 'bbox'),
+                     ('mask.json', 'segm'))
+    if json_directory:
+        assert os.path.exists(
+            json_directory), "The json directory:{} does not exist".format(
+                json_directory)
+    for file_name, eval_style in style_by_file:
+        json_path = os.path.join(str(json_directory),
+                                 file_name) if json_directory else file_name
+        if not os.path.exists(json_path):
+            logger.info("{} not exists!".format(json_path))
+            continue
+        cocoapi_eval(json_path, eval_style, anno_file=anno_file)
diff --git a/rtdetr_paddle/ppdet/metrics/json_results.py b/rtdetr_paddle/ppdet/metrics/json_results.py
new file mode 100755
index 0000000..d2575af
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/json_results.py
@@ -0,0 +1,175 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import six
+import numpy as np
+
+
+def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
+    """Turn batched detection rows into a list of COCO-style result dicts.
+
+    Each row of `bboxes` is [label, score, xmin, ymin, xmax, ymax] and
+    `bbox_nums[i]` consecutive rows belong to image `i`. Rows with a
+    negative label are dropped; `bias` is added to both width and height.
+    """
+    records = []
+    offset = 0  # position in `bboxes` of the current image's first row
+    for img_idx in range(len(bbox_nums)):
+        im_id = int(image_id[img_idx][0])
+        count = bbox_nums[img_idx]
+        for row_idx in range(offset, offset + count):
+            label, score, xmin, ymin, xmax, ymax = bboxes[row_idx].tolist()
+            if int(label) < 0:
+                continue
+            records.append({
+                'image_id': im_id,
+                'category_id': label_to_cat_id_map[int(label)],
+                'bbox': [xmin, ymin, xmax - xmin + bias, ymax - ymin + bias],
+                'score': score
+            })
+        offset += count
+    return records
+
+
+def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
+    """Turn batched rotated-box rows into COCO-style result dicts.
+
+    Rows are [label, score, x1, y1, x2, y2, x3, y3, x4, y4]; bbox_nums[i]
+    consecutive rows belong to image i and negative labels are dropped.
+    """
+    records = []
+    start = 0  # position in `bboxes` of the current image's first row
+    for img_idx in range(len(bbox_nums)):
+        im_id = int(image_id[img_idx][0])
+        for pos in range(start, start + bbox_nums[img_idx]):
+            label, score, x1, y1, x2, y2, x3, y3, x4, y4 = bboxes[pos].tolist()
+            if int(label) < 0:
+                continue
+            records.append({
+                'image_id': im_id,
+                'category_id': label_to_cat_id_map[int(label)],
+                'bbox': [x1, y1, x2, y2, x3, y3, x4, y4],
+                'score': score
+            })
+        start += bbox_nums[img_idx]
+    return records
+
+
+def strip_mask(mask):
+    """Crop masks to the non -1 counts of mask 0's first column and row."""
+    first_row, first_col = mask[0, 0, :], mask[0, :, 0]
+    valid_h = len(first_col) - np.count_nonzero(first_col == -1)
+    valid_w = len(first_row) - np.count_nonzero(first_row == -1)
+    return mask[:, :valid_h, :valid_w]
+
+
+def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
+    """Encode batched instance masks as COCO-style RLE result dicts.
+
+    mask_nums[i] consecutive masks / bbox rows ([label, score, ...]) belong
+    to image i; label -1 entries and images with zero entries are skipped.
+    """
+    import pycocotools.mask as mask_util
+    seg_res = []
+    k = 0  # running index into masks / bboxes across the whole batch
+    for i in range(len(mask_nums)):
+        cur_image_id = int(image_id[i][0])
+        det_nums = mask_nums[i]
+        if det_nums == 0:  # strip_mask indexes mask 0, so skip empty batches
+            continue
+        mask_i = strip_mask(masks[k:k + det_nums]).astype(np.uint8)
+        for j in range(det_nums):
+            label, score = int(bboxes[k][0]), float(bboxes[k][1])
+            k = k + 1
+            if label == -1:
+                continue
+            rle = mask_util.encode(
+                np.array(mask_i[j][:, :, None], order="F", dtype="uint8"))[0]
+            if six.PY3 and 'counts' in rle:
+                rle['counts'] = rle['counts'].decode("utf8")
+            seg_res.append({
+                'image_id': cur_image_id,
+                'category_id': label_to_cat_id_map[label],
+                'segmentation': rle,
+                'score': score
+            })
+    return seg_res
+
+
+def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map):
+    """Convert one SOLOv2 output dict into COCO-style RLE result dicts.
+
+    Reads results['segm'], results['cate_label'] and results['cate_score'];
+    every entry is attributed to image_id[0][0]. Returns None when
+    results['segm'] is None or holds no masks.
+    """
+    segms = results['segm']
+    if segms is None or segms.shape[0] == 0:
+        return None
+    import pycocotools.mask as mask_util
+    segms = segms.astype(np.uint8)
+    clsid_labels = results['cate_label']
+    clsid_scores = results['cate_score']
+    im_id = int(image_id[0][0])
+    segm_res = []
+    # NOTE(review): the last entry is skipped -- presumably padding; confirm.
+    for i in range(segms.shape[0] - 1):
+        segm = mask_util.encode(np.array(segms[i][:, :, None], order='F'))[0]
+        segm['counts'] = segm['counts'].decode('utf8')
+        segm_res.append({
+            'image_id': im_id,
+            'category_id': num_id_to_cat_id_map[int(clsid_labels[i])],
+            'segmentation': segm,
+            'score': float(clsid_scores[i])
+        })
+    return segm_res
+
+
+def get_keypoint_res(results, im_id):
+    """Build COCO-style keypoint annotations from results['keypoint'];
+    bbox and area come from the min/max of every third value (x, then y).
+    """
+    anns = []
+    preds = results['keypoint']
+    for idx in range(im_id.shape[0]):
+        image_id = im_id[idx].item()
+        kpts, scores = preds[idx]
+        for kpt, score in zip(kpts, scores):
+            flat = kpt.flatten()
+            x0, x1 = np.min(flat[0::3]).item(), np.max(flat[0::3]).item()
+            y0, y1 = np.min(flat[1::3]).item(), np.max(flat[1::3]).item()
+            anns.append({
+                'image_id': image_id,
+                'category_id': 1,  # XXX hard code
+                'keypoints': flat.tolist(),
+                'score': float(score),
+                'area': (x1 - x0) * (y1 - y0),
+                'bbox': [x0, y0, x1 - x0, y1 - y0]
+            })
+    return anns
+
+
+def get_pose3d_res(results, im_id):
+    """Build one annotation per image from results['pose3d'].
+
+    Entry `idx` of results['pose3d'] is stored as a list under 'pose3d' for
+    image im_id[idx]; the category is fixed to 1 and the score to 1.0.
+    """
+    preds = results['pose3d']
+    # One annotation per leading-axis entry of im_id, in order.
+    return [{
+        'image_id': im_id[idx].item(),
+        'category_id': 1,  # XXX hard code
+        'pose3d': preds[idx].tolist(),
+        'score': float(1.)
+    } for idx in range(im_id.shape[0])]
diff --git a/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py b/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py
new file mode 100644
index 0000000..cbd52d0
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py
@@ -0,0 +1,410 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+import paddle
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from ..modeling.keypoint_utils import oks_nms
+from scipy.io import loadmat, savemat
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['KeyPointTopDownCOCOEval', 'KeyPointTopDownMPIIEval']
+
+
+class KeyPointTopDownCOCOEval(object):
+    """refer to
+        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+        Copyright (c) Microsoft, under the MIT License.
+
+    Buffers top-down keypoint predictions batch by batch, writes them to
+    <output_eval>/keypoints_results.json and scores them with COCOeval.
+    """
+
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 iou_type='keypoints',
+                 in_vis_thre=0.2,
+                 oks_thre=0.9,
+                 save_prediction_only=False):
+        """
+        Args:
+            anno_file: annotation file handed to pycocotools COCO().
+            num_samples, num_joints: sizes of the preallocated buffers.
+            output_eval: folder that receives keypoints_results.json.
+            iou_type: stored only; COCOeval is called with 'keypoints'.
+            in_vis_thre: joint scores above it take part in rescoring.
+            oks_thre: threshold handed to oks_nms.
+            save_prediction_only: write the json and skip the evaluation.
+        """
+        super(KeyPointTopDownCOCOEval, self).__init__()
+        self.coco = COCO(anno_file)
+        self.num_samples = num_samples
+        self.num_joints = num_joints
+        self.iou_type = iou_type
+        self.in_vis_thre = in_vis_thre
+        self.oks_thre = oks_thre
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Allocate zeroed result buffers sized for num_samples entries."""
+        self.results = {
+            'all_preds': np.zeros(
+                (self.num_samples, self.num_joints, 3), dtype=np.float32),
+            'all_boxes': np.zeros((self.num_samples, 6)),
+            'image_path': []
+        }
+        self.eval_results = {}
+        self.idx = 0
+
+    @staticmethod
+    def _as_numpy(value):
+        """Return value.numpy() for a paddle.Tensor, else value itself."""
+        return value.numpy() if isinstance(value, paddle.Tensor) else value
+
+    def update(self, inputs, outputs):
+        """Copy one batch of keypoints and box metadata into the buffers."""
+        kpts, _ = outputs['keypoint'][0]
+        num_images = inputs['image'].shape[0]
+        rows = slice(self.idx, self.idx + num_images)
+        scale = self._as_numpy(inputs['scale'])
+
+        self.results['all_preds'][rows, :, 0:3] = kpts[:, :, 0:3]
+        # all_boxes columns: center (0:2), scale (2:4), area (4), score (5)
+        boxes = self.results['all_boxes']
+        boxes[rows, 0:2] = self._as_numpy(inputs['center'])[:, 0:2]
+        boxes[rows, 2:4] = scale[:, 0:2]
+        boxes[rows, 4] = np.prod(scale * 200, 1)
+        boxes[rows, 5] = np.squeeze(self._as_numpy(inputs['score']))
+        self.results['image_path'].extend(self._as_numpy(inputs['im_id']))
+        self.idx += num_images
+
+    def _write_coco_keypoint_results(self, keypoints):
+        """Dump the per-image keypoints as COCO results json to res_file."""
+        data_pack = {
+            'cat_id': 1,
+            'cls': 'person',
+            'ann_type': 'keypoints',
+            'keypoints': keypoints
+        }
+        results = self._coco_keypoint_results_one_category_kernel(data_pack)
+        # exist_ok: no failure if the folder appears between check and create
+        os.makedirs(self.output_eval, exist_ok=True)
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+            logger.info(f'The keypoint result is saved to {self.res_file}.')
+        try:
+            # read back inside `with` so the file handle is always closed
+            with open(self.res_file) as f:
+                json.load(f)
+        except Exception:
+            # unparsable file: overwrite its last line with a closing bracket
+            with open(self.res_file, 'r') as f:
+                content = f.readlines()
+            content[-1] = ']'
+            with open(self.res_file, 'w') as f:
+                f.writelines(content)
+
+    def _coco_keypoint_results_one_category_kernel(self, data_pack):
+        """Flatten per-image keypoint lists into one list of COCO results."""
+        cat_id = data_pack['cat_id']
+        cat_results = []
+        for img_kpts in data_pack['keypoints']:
+            if len(img_kpts) == 0:
+                continue
+            _key_points = np.array([kpt['keypoints'] for kpt in img_kpts])
+            _key_points = _key_points.reshape(_key_points.shape[0], -1)
+            cat_results.extend({
+                'image_id': kpt['image'],
+                'category_id': cat_id,
+                'keypoints': flat.tolist(),
+                'score': kpt['score'],
+                'center': list(kpt['center']),
+                'scale': list(kpt['scale'])
+            } for kpt, flat in zip(img_kpts, _key_points))
+        return cat_results
+
+    def get_final_results(self, preds, all_boxes, img_path):
+        """Group predictions by image, rescore them, apply OKS NMS and write
+        the kept keypoints to the result file."""
+        # image x person x (keypoints)
+        kpts = defaultdict(list)
+        for idx, kpt in enumerate(preds):
+            kpts[int(img_path[idx])].append({
+                'keypoints': kpt,
+                'center': all_boxes[idx][0:2],
+                'scale': all_boxes[idx][2:4],
+                'area': all_boxes[idx][4],
+                'score': all_boxes[idx][5],
+                'image': int(img_path[idx])
+            })
+
+        # rescoring and oks nms
+        num_joints = preds.shape[1]
+        oks_nmsed_kpts = []
+        for img_kpts in kpts.values():
+            for n_p in img_kpts:
+                # mean confidence over the joints above in_vis_thre
+                kpt_score = 0
+                valid_num = 0
+                for n_jt in range(num_joints):
+                    t_s = n_p['keypoints'][n_jt][2]
+                    if t_s > self.in_vis_thre:
+                        kpt_score = kpt_score + t_s
+                        valid_num = valid_num + 1
+                if valid_num != 0:
+                    kpt_score = kpt_score / valid_num
+                # rescoring
+                n_p['score'] = kpt_score * n_p['score']
+
+            keep = oks_nms(list(img_kpts), self.oks_thre)
+            # an empty keep list means "keep every prediction of the image"
+            if len(keep) == 0:
+                oks_nmsed_kpts.append(img_kpts)
+            else:
+                oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep])
+
+        self._write_coco_keypoint_results(oks_nmsed_kpts)
+
+    def accumulate(self):
+        """Write the results file and, unless save_prediction_only is set,
+        run the COCO keypoint evaluation on it."""
+        self.get_final_results(self.results['all_preds'],
+                               self.results['all_boxes'],
+                               self.results['image_path'])
+        if self.save_prediction_only:
+            logger.info(f'The keypoint result is saved to {self.res_file} '
+                        'and do not evaluate the mAP.')
+            return
+        coco_dt = self.coco.loadRes(self.res_file)
+        coco_eval = COCOeval(self.coco, coco_dt, 'keypoints')
+        coco_eval.params.useSegm = None
+        coco_eval.evaluate()
+        coco_eval.accumulate()
+        coco_eval.summarize()
+        self.eval_results['keypoint'] = list(coco_eval.stats)
+
+    def log(self):
+        """Print the evaluation statistics as a one-row table."""
+        if self.save_prediction_only:
+            return
+        stats_names = [
+            'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5',
+            'AR .75', 'AR (M)', 'AR (L)'
+        ]
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(value) for value in self.eval_results['keypoint']
+        ]) + ' |')
+
+    def get_results(self):
+        """Return the evaluation results dict filled by accumulate()."""
+        return self.eval_results
+
+
+class KeyPointTopDownMPIIEval(object):
+    """Collect top-down keypoint predictions and score them with PCKh
+    against the mpii_gt_val.mat file located next to anno_file."""
+
+    def __init__(self,
+                 anno_file,
+                 num_samples,
+                 num_joints,
+                 output_eval,
+                 oks_thre=0.9,
+                 save_prediction_only=False):
+        super(KeyPointTopDownMPIIEval, self).__init__()
+        self.ann_file = anno_file
+        self.res_file = os.path.join(output_eval, "keypoints_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        """Drop every buffered batch and any previous evaluation result."""
+        self.results = []
+        self.eval_results = {}
+        self.idx = 0
+
+    def update(self, inputs, outputs):
+        """Buffer the keypoints and box metadata of one batch."""
+        kpts, _ = outputs['keypoint'][0]
+
+        num_images = inputs['image'].shape[0]
+        scale = inputs['scale'].numpy()
+        # boxes columns: center (0:2), scale (2:4), area (4), score (5)
+        boxes = np.zeros((num_images, 6))
+        boxes[:, 0:2] = inputs['center'].numpy()[:, 0:2]
+        boxes[:, 2:4] = scale[:, 0:2]
+        boxes[:, 4] = np.prod(scale * 200, 1)
+        boxes[:, 5] = np.squeeze(inputs['score'].numpy())
+        self.results.append({
+            'preds': kpts[:, :, 0:3],
+            'boxes': boxes,
+            'image_path': inputs['image_file']
+        })
+
+    def accumulate(self):
+        """Save the buffered predictions, then evaluate them unless
+        save_prediction_only is set."""
+        self._mpii_keypoint_results_save()
+        if self.save_prediction_only:
+            logger.info(f'The keypoint result is saved to {self.res_file} '
+                        'and do not evaluate the mAP.')
+            return
+
+        self.eval_results = self.evaluate(self.results)
+
+    def _flatten_results(self):
+        """Return one json-serializable dict per buffered sample."""
+        results = []
+        for res in self.results:
+            # Iterate over the samples of the batch. len(res) would be the
+            # number of dict keys (always 3), not the batch size.
+            results.extend({
+                'preds': res['preds'][k].tolist(),
+                'boxes': res['boxes'][k].tolist(),
+                'image_path': res['image_path'][k],
+            } for k in range(len(res['preds'])))
+        return results
+
+    def _mpii_keypoint_results_save(self):
+        """Write every buffered sample to res_file as json."""
+        results = self._flatten_results()
+        with open(self.res_file, 'w') as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+            logger.info(f'The keypoint result is saved to {self.res_file}.')
+
+    def log(self):
+        """Print each evaluation result as `name : value`."""
+        if self.save_prediction_only:
+            return
+        for item, value in self.eval_results.items():
+            print("{} : {}".format(item, value))
+
+    def get_results(self):
+        """Return the evaluation results dict filled by accumulate()."""
+        return self.eval_results
+
+    def evaluate(self, outputs, savepath=None):
+        """Evaluate PCKh for MPII dataset. refer to
+        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
+        Copyright (c) Microsoft, under the MIT License.
+
+        Args:
+            outputs(list(preds, boxes)):
+
+                * preds (np.ndarray[N,K,3]): The first two dimensions are
+                  coordinates, score is the third dimension of the array.
+                * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]
+                  , scale[1],area, score]
+
+        Returns:
+            dict: PCKh for each joint
+        """
+
+        # stack the samples of every buffered batch into a single array
+        preds = np.stack(
+            [sample for output in outputs for sample in output['preds']])
+
+        # convert 0-based index to 1-based index,
+        # and get the first two dimensions.
+        preds = preds[..., :2] + 1.0
+
+        if savepath is not None:
+            pred_file = os.path.join(savepath, 'pred.mat')
+            savemat(pred_file, mdict={'preds': preds})
+
+        SC_BIAS = 0.6
+        threshold = 0.5
+
+        gt_file = os.path.join(
+            os.path.dirname(self.ann_file), 'mpii_gt_val.mat')
+        gt_dict = loadmat(gt_file)
+        dataset_joints = gt_dict['dataset_joints']
+        jnt_missing = gt_dict['jnt_missing']
+        pos_gt_src = gt_dict['pos_gt_src']
+        headboxes_src = gt_dict['headboxes_src']
+
+        pos_pred_src = np.transpose(preds, [1, 2, 0])
+
+        def joint(name):
+            """Index of joint `name` along axis 1 of dataset_joints."""
+            return np.where(dataset_joints == name)[1][0]
+
+        jnt_visible = 1 - jnt_missing
+        uv_error = pos_pred_src - pos_gt_src
+        uv_err = np.linalg.norm(uv_error, axis=1)
+        headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :]
+        headsizes = np.linalg.norm(headsizes, axis=0)
+        headsizes *= SC_BIAS
+        scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32)
+        scaled_uv_err = uv_err / scale
+        scaled_uv_err = scaled_uv_err * jnt_visible
+        jnt_count = np.sum(jnt_visible, axis=1)
+        less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible
+        PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count
+
+        # save
+        rng = np.arange(0, 0.5 + 0.01, 0.01)
+        pckAll = np.zeros((len(rng), 16), dtype=np.float32)
+
+        for r, threshold in enumerate(rng):
+            less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible
+            pckAll[r, :] = 100. * np.sum(less_than_threshold,
+                                         axis=1) / jnt_count
+
+        # entries 6 and 7 are masked out of the weighted sums below
+        PCKh = np.ma.array(PCKh, mask=False)
+        PCKh.mask[6:8] = True
+
+        jnt_count = np.ma.array(jnt_count, mask=False)
+        jnt_count.mask[6:8] = True
+        jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64)
+
+        name_value = [  #noqa
+            ('Head', PCKh[joint('head')]),
+            ('Shoulder', 0.5 * (PCKh[joint('lsho')] + PCKh[joint('rsho')])),
+            ('Elbow', 0.5 * (PCKh[joint('lelb')] + PCKh[joint('relb')])),
+            ('Wrist', 0.5 * (PCKh[joint('lwri')] + PCKh[joint('rwri')])),
+            ('Hip', 0.5 * (PCKh[joint('lhip')] + PCKh[joint('rhip')])),
+            ('Knee', 0.5 * (PCKh[joint('lkne')] + PCKh[joint('rkne')])),
+            ('Ankle', 0.5 * (PCKh[joint('lank')] + PCKh[joint('rank')])),
+            ('PCKh', np.sum(PCKh * jnt_ratio)),
+            # NOTE(review): rng[11] is 0.11, not 0.1 -- confirm the row index
+            ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio))
+        ]
+        return OrderedDict(name_value)
+
+    def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
+        """sort kpts and remove the repeated ones."""
+        kpts = sorted(kpts, key=lambda x: x[key])
+        num = len(kpts)
+        for i in range(num - 1, 0, -1):
+            if kpts[i][key] == kpts[i - 1][key]:
+                del kpts[i]
+
+        return kpts
diff --git a/rtdetr_paddle/ppdet/metrics/map_utils.py b/rtdetr_paddle/ppdet/metrics/map_utils.py
new file mode 100644
index 0000000..77ccf5e
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/map_utils.py
@@ -0,0 +1,397 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os
+import sys
+import numpy as np
+import itertools
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'draw_pr_curve',
+    'bbox_area',
+    'jaccard_overlap',
+    'prune_zero_padding',
+    'DetectionMAP',
+    'ap_per_class',
+    'compute_ap',
+]
+
+
+def draw_pr_curve(precision, recall, iou=0.5, out_dir='pr_curve',
+                  file_name='precision_recall_curve.jpg'):
+    """Plot `precision` against `recall` and save it as out_dir/file_name.
+
+    `out_dir` is created when missing; `iou` is only shown in the title.
+    """
+    os.makedirs(out_dir, exist_ok=True)  # no check-then-create race
+    output_path = os.path.join(out_dir, file_name)
+    try:
+        import matplotlib.pyplot as plt
+    except Exception:
+        logger.error('Matplotlib not found, please install matplotlib. '
+                     'for example: `pip install matplotlib`.')
+        raise
+    plt.cla()
+    plt.figure('P-R Curve')
+    plt.title('Precision/Recall Curve(IoU={})'.format(iou))
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.grid(True)
+    plt.plot(recall, precision)
+    plt.savefig(output_path)
+
+
+def bbox_area(bbox, is_bbox_normalized):
+    """
+    Return the area of an [xmin, ymin, xmax, ymax] box. Each side gets an
+    extra 1 when the box is not normalized and 0 when it is.
+    """
+    xmin, ymin, xmax, ymax = bbox[0], bbox[1], bbox[2], bbox[3]
+    pad = 1. - float(is_bbox_normalized)
+    return (xmax - xmin + pad) * (ymax - ymin + pad)
+
+
+def jaccard_overlap(pred, gt, is_bbox_normalized=False):
+    """
+    Return the intersection-over-union of two [xmin, ymin, xmax, ymax]
+    boxes, or 0. when they do not overlap (touching edges count as none).
+    """
+    separated = (pred[0] >= gt[2] or pred[2] <= gt[0] or
+                 pred[1] >= gt[3] or pred[3] <= gt[1])
+    if separated:
+        return 0.
+    inter_box = [
+        max(pred[0], gt[0]), max(pred[1], gt[1]),
+        min(pred[2], gt[2]), min(pred[3], gt[3])
+    ]
+    inter_size = bbox_area(inter_box, is_bbox_normalized)
+    union_size = (bbox_area(pred, is_bbox_normalized) +
+                  bbox_area(gt, is_bbox_normalized) - inter_size)
+    return float(inter_size) / union_size
+
+
+def prune_zero_padding(gt_box, gt_label, difficult=None):
+    """Truncate gt_box, gt_label and (when given) difficult at the first
+    all-zero row of gt_box; everything is kept if no such row exists.
+    """
+    valid_cnt = next((idx for idx, box in enumerate(gt_box)
+                      if (box == 0).all()), len(gt_box))
+    kept_difficult = None if difficult is None else difficult[:valid_cnt]
+    return gt_box[:valid_cnt], gt_label[:valid_cnt], kept_difficult
+
+
+class DetectionMAP(object):
+    """
+    Calculate detection mean average precision.
+    Currently support two types: 11point and integral
+
+    Args:
+        class_num (int): The class number.
+        overlap_thresh (float): The threshold of overlap
+            ratio between prediction bounding box and 
+            ground truth bounding box for deciding 
+            true/false positive. Default 0.5.
+        map_type (str): Calculation method of mean average
+            precision, currently support '11point' and
+            'integral'. Default '11point'.
+        is_bbox_normalized (bool): Whether bounding boxes
+            is normalized to range[0, 1]. Default False.
+        evaluate_difficult (bool): Whether to evaluate
+            difficult bounding boxes. Default False.
+        catid2name (dict): Mapping between category id and category name.
+        classwise (bool): Whether per-category AP and draw
+            P-R Curve or not.
+    """
+
+    def __init__(self,
+                 class_num,
+                 overlap_thresh=0.5,
+                 map_type='11point',
+                 is_bbox_normalized=False,
+                 evaluate_difficult=False,
+                 catid2name=None,
+                 classwise=False):
+        """Store the evaluation settings and reset the accumulators."""
+        self.class_num = class_num
+        self.overlap_thresh = overlap_thresh
+        assert map_type in ['11point', 'integral'], \
+                "map_type currently only support '11point' "\
+                "and 'integral'"
+        self.map_type = map_type
+        self.is_bbox_normalized = is_bbox_normalized
+        self.evaluate_difficult = evaluate_difficult
+        self.classwise = classwise
+        # catid2name defaults to None: fall back to an empty name list.
+        self.classes = list(catid2name.values()) if catid2name else []
+        self.reset()
+
+    def update(self, bbox, score, label, gt_box, gt_label, difficult=None):
+        """
+        Fold one image's predictions and ground truths into the running
+        per-class statistics (ground-truth counts and [score, hit] records).
+        """
+        if difficult is None:
+            difficult = np.zeros_like(gt_label)
+
+        # Count the ground truths of each class that take part in evaluation.
+        for gtl, diff in zip(gt_label, difficult):
+            if self.evaluate_difficult or int(diff) == 0:
+                self.class_gt_counts[int(np.array(gtl))] += 1
+
+        # Match every prediction to its best-overlapping same-class ground
+        # truth; only the first prediction claiming a ground truth is a hit.
+        claimed = [False] * len(gt_label)
+        for box, conf, cls in zip(bbox, score, label):
+            pred = box.tolist() if isinstance(box, np.ndarray) else box
+            best_idx, best_overlap = -1, -1.0
+            for gt_idx, gt_cls in enumerate(gt_label):
+                if int(gt_cls) != int(cls):
+                    continue
+                if len(gt_box[gt_idx]) == 8:
+                    overlap = calc_rbox_iou(pred, gt_box[gt_idx])
+                else:
+                    overlap = jaccard_overlap(pred, gt_box[gt_idx],
+                                              self.is_bbox_normalized)
+                if overlap > best_overlap:
+                    best_idx, best_overlap = gt_idx, overlap
+
+            if not best_overlap > self.overlap_thresh:
+                self.class_score_poss[int(cls)].append([conf, 0.0])
+                continue
+            # Matches to difficult ground truths are skipped unless evaluated.
+            if not (self.evaluate_difficult or
+                    int(np.array(difficult[best_idx])) == 0):
+                continue
+            hit = 0.0 if claimed[best_idx] else 1.0
+            self.class_score_poss[int(cls)].append([conf, hit])
+            claimed[best_idx] = True
+
+    def reset(self):
+        """
+        Clear the accumulated per-class records, ground-truth counts and mAP.
+        """
+        self.mAP = 0.0
+        self.class_gt_counts = [0 for _ in range(self.class_num)]
+        self.class_score_poss = [list() for _ in range(self.class_num)]
+
+    def accumulate(self):
+        """
+        Turn the accumulated [score, hit] records into per-class AP values
+        and their mean. Classes without ground truths are left out; classes
+        with ground truths but no predictions count as AP 0 in the mean.
+        """
+        mAP = 0.
+        valid_cnt = 0
+        eval_results = []
+        for cls_idx, (score_pos, count) in enumerate(
+                zip(self.class_score_poss, self.class_gt_counts)):
+            if count == 0:
+                continue
+            if len(score_pos) == 0:
+                valid_cnt += 1
+                continue
+
+            accum_tp_list, accum_fp_list = self._get_tp_fp_accum(score_pos)
+            precision = []
+            recall = []
+            for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):
+                precision.append(float(ac_tp) / (ac_tp + ac_fp))
+                recall.append(float(ac_tp) / count)
+
+            one_class_ap = 0.0
+            if self.map_type == '11point':
+                max_precisions = [0.] * 11
+                start_idx = len(precision) - 1
+                for j in range(10, -1, -1):
+                    for i in range(start_idx, -1, -1):
+                        if recall[i] < float(j) / 10.:
+                            start_idx = i
+                            if j > 0:
+                                max_precisions[j - 1] = max_precisions[j]
+                                break
+                        else:
+                            if max_precisions[j] < precision[i]:
+                                max_precisions[j] = precision[i]
+                one_class_ap = sum(max_precisions) / 11.
+            elif self.map_type == 'integral':
+                prev_recall = 0.
+                for i in range(len(precision)):
+                    recall_gap = abs(recall[i] - prev_recall)
+                    if recall_gap > 1e-6:
+                        one_class_ap += precision[i] * recall_gap
+                        prev_recall = recall[i]
+            else:
+                logger.error("Unsupported mAP type {}".format(self.map_type))
+                sys.exit(1)
+            mAP += one_class_ap
+            valid_cnt += 1
+            eval_results.append({
+                # Index by class, not by how many classes were evaluated.
+                'class': self.classes[cls_idx],
+                'ap': one_class_ap,
+                'precision': precision,
+                'recall': recall,
+            })
+        self.eval_results = eval_results
+        self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP
+
+    def get_map(self):
+        """
+        Return the mAP from the last accumulate() call. When `classwise` is
+        set, also log a per-category AP table and save one P-R curve image
+        per category under the voc_pr_curve folder.
+        """
+        if self.mAP is None:
+            logger.error("mAP is not calculated.")
+        if not self.classwise:
+            return self.mAP
+        try:
+            from terminaltables import AsciiTable
+        except Exception:
+            logger.error(
+                'terminaltables not found, please install terminaltables. '
+                'for example: `pip install terminaltables`.')
+            raise
+        results_per_category = []
+        for eval_result in self.eval_results:
+            results_per_category.append(
+                (str(eval_result['class']),
+                 '{:0.3f}'.format(float(eval_result['ap']))))
+            draw_pr_curve(
+                eval_result['precision'],
+                eval_result['recall'],
+                out_dir='voc_pr_curve',
+                file_name='{}_precision_recall_curve.jpg'.format(
+                    eval_result['class']))
+        # Lay the (category, AP) pairs out in rows of up to three pairs.
+        num_columns = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        headers = ['category', 'AP'] * (num_columns // 2)
+        results_2d = itertools.zip_longest(* [
+            results_flatten[i::num_columns] for i in range(num_columns)
+        ])
+        table = AsciiTable([headers] + list(results_2d))
+        logger.info('Per-category of VOC AP: \n{}'.format(table.table))
+        logger.info(
+            "per-category PR curve has output to voc_pr_curve folder.")
+        return self.mAP
+
+    def _get_tp_fp_accum(self, score_pos_list):
+        """
+        Build running true-positive / false-positive counts over the
+        [score, pos] records, visited from highest to lowest score.
+        """
+        ranked = sorted(score_pos_list, key=lambda rec: rec[0], reverse=True)
+        tp_running, fp_running = [], []
+        tp_total = fp_total = 0
+        for _, is_pos in ranked:
+            hit = int(is_pos)
+            # int(pos) goes to the TP count, its complement to the FP count
+            tp_total += hit
+            fp_total += 1 - hit
+            tp_running.append(tp_total)
+            fp_running.append(fp_total)
+        return tp_running, fp_running
+
+
+def ap_per_class(tp, conf, pred_cls, target_cls):
+    """
+    Computes the average precision of every class together with the last
+    point of its recall and precision curves.
+    Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.
+
+    Args:
+        tp (list): True positives.
+        conf (list): Objectness value from 0-1.
+        pred_cls (list): Predicted object classes.
+        target_cls (list): Target object classes.
+    Returns:
+        tuple: arrays (ap, classes as int32, recall, precision), per class.
+    """
+    tp, conf = np.array(tp), np.array(conf)
+    pred_cls, target_cls = np.array(pred_cls), np.array(target_cls)
+
+    # Order the predictions by descending confidence
+    order = np.argsort(-conf)
+    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]
+
+    # Classes that appear in the predictions or in the targets
+    unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))
+
+    ap, p, r = [], [], []
+    for c in unique_classes:
+        pred_mask = pred_cls == c
+        n_gt = sum(target_cls == c)  # Number of ground truth objects
+        n_p = sum(pred_mask)  # Number of predicted objects
+        if n_p == 0 and n_gt == 0:
+            continue
+        if n_p == 0 or n_gt == 0:
+            # a class without predictions or without targets scores zero
+            ap.append(0)
+            r.append(0)
+            p.append(0)
+            continue
+
+        # Accumulate TPs and FPs along the ranked predictions
+        tpc = np.cumsum(tp[pred_mask])
+        fpc = np.cumsum(1 - tp[pred_mask])
+
+        # Recall and precision curves; the last point is the class value
+        recall_curve = tpc / (n_gt + 1e-16)
+        precision_curve = tpc / (tpc + fpc)
+        r.append(recall_curve[-1])
+        p.append(precision_curve[-1])
+
+        ap.append(compute_ap(recall_curve, precision_curve))
+
+    return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(
+        p)
+
+
+def compute_ap(recall, precision):
+    """
+    Computes the average precision as the area under the non-increasing
+    envelope of the given precision-recall curve.
+    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
+
+    Args:
+        recall (list): The recall curve.
+        precision (list): The precision curve.
+
+    Returns:
+        The average precision as computed in py-faster-rcnn.
+    """
+    # pad both curves with sentinel values at the front and at the end
+    rec = np.concatenate(([0.], recall, [1.]))
+    pre = np.concatenate(([0.], precision, [0.]))
+
+    # precision envelope: every point becomes the maximum of itself and of
+    # all points to its right, i.e. a running maximum taken right-to-left
+    pre = np.maximum.accumulate(pre[::-1])[::-1]
+
+    # to calculate area under PR curve, look for points
+    # where X axis (recall) changes value
+    steps = np.where(rec[1:] != rec[:-1])[0]
+
+    # and sum (\Delta recall) * prec
+    ap = np.sum((rec[steps + 1] - rec[steps]) * pre[steps + 1])
+    return ap
diff --git a/rtdetr_paddle/ppdet/metrics/mcmot_metrics.py b/rtdetr_paddle/ppdet/metrics/mcmot_metrics.py
new file mode 100644
index 0000000..c9b5ef7
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/mcmot_metrics.py
@@ -0,0 +1,473 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import copy
+import sys
+import math
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+
+from .metrics import Metric
+try:
+    import motmetrics as mm
+    from motmetrics.math_util import quiet_divide
+    metrics = mm.metrics.motchallenge_metrics
+    mh = mm.metrics.create()
+except:
+    print(
+        'Warning: Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
+    )
+    pass
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['MCMOTEvaluator', 'MCMOTMetric']
+
+METRICS_LIST = [  # metric names handed to MCMOTEvaluator.get_summary
+    'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend',
+    'num_migrate', 'num_false_positives', 'num_misses', 'num_detections',
+    'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked',
+    'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota',
+    'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1'
+]
+
+NAME_MAP = {  # metric name -> label, passed as `namemap` to render_summary
+    'num_frames': 'num_frames',
+    'num_matches': 'num_matches',
+    'num_switches': 'IDs',
+    'num_transfer': 'IDt',
+    'num_ascend': 'IDa',
+    'num_migrate': 'IDm',
+    'num_false_positives': 'FP',
+    'num_misses': 'FN',
+    'num_detections': 'num_detections',
+    'num_objects': 'num_objects',
+    'num_predictions': 'num_predictions',
+    'num_unique_objects': 'GT',
+    'mostly_tracked': 'MT',
+    'partially_tracked': 'partially_tracked',
+    'mostly_lost': 'ML',
+    'num_fragmentations': 'FM',
+    'motp': 'MOTP',
+    'mota': 'MOTA',
+    'precision': 'Prcn',
+    'recall': 'Rcll',
+    'idfp': 'idfp',
+    'idfn': 'idfn',
+    'idtp': 'idtp',
+    'idp': 'IDP',
+    'idr': 'IDR',
+    'idf1': 'IDF1'
+}
+
+
+def parse_accs_metrics(seq_acc, index_name, verbose=False):
+    """
+    Summarize MOTAccumulators; OVERALL motp is the detection-weighted mean.
+    """
+    mh = mm.metrics.create()
+    summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST)
+    # keep the OVERALL row out of the sum, it must not be counted twice
+    rows = summary[summary.index != 'OVERALL']
+    summary.loc['OVERALL', 'motp'] = (rows['motp'] * rows[
+        'num_detections']).sum() / summary.loc['OVERALL', 'num_detections']
+    if verbose:
+        print(mm.io.render_summary(
+            summary, formatters=mh.formatters, namemap=NAME_MAP))
+    return summary
+
+
+def seqs_overall_metrics(summary_df, verbose=False):
+    """
+    Calculate overall metrics for multiple sequences
+
+    Count-like columns are summed over the rows, ratio-like columns are
+    recomputed by the MCMOTMetricOverall helpers, and the result is
+    appended to a copy of ``summary_df`` as the row 'overall_calc'.
+    """
+    summed_cols = (
+        'num_frames', 'num_matches', 'num_switches', 'num_transfer',
+        'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses',
+        'num_detections', 'num_objects', 'num_predictions',
+        'num_unique_objects', 'mostly_tracked', 'partially_tracked',
+        'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp')
+    derived_cols = ('motp', 'mota', 'precision', 'recall', 'idp', 'idr',
+                    'idf1')
+    calc_df = summary_df.copy()
+
+    overall_dic = {name: calc_df[name].sum() for name in summed_cols}
+    for name in derived_cols:
+        compute = getattr(MCMOTMetricOverall, name + '_overall')
+        overall_dic[name] = compute(calc_df, overall_dic)
+
+    overall_df = pd.DataFrame(overall_dic, index=['overall_calc'])
+    calc_df = pd.concat([calc_df, overall_df])
+
+    if verbose:
+        mh = mm.metrics.create()
+        print(mm.io.render_summary(
+            calc_df, formatters=mh.formatters, namemap=NAME_MAP))
+
+    return calc_df
+
+
+class MCMOTMetricOverall(object):
+    """Static helpers that recompute ratio metrics; each takes the per-row
+    ``summary_df`` and the dict ``overall_dic`` of column sums."""
+
+    @staticmethod
+    def motp_overall(summary_df, overall_dic):
+        # per-row motp weighted by that row's num_detections
+        weighted = (summary_df['motp'] * summary_df['num_detections']).sum()
+        return quiet_divide(weighted, overall_dic['num_detections'])
+
+    @staticmethod
+    def mota_overall(summary_df, overall_dic):
+        # 1 - (misses + switches + false positives) / objects
+        errors = (overall_dic['num_misses'] + overall_dic['num_switches'] +
+                  overall_dic['num_false_positives'])
+        return 1. - quiet_divide(errors, overall_dic['num_objects'])
+
+    @staticmethod
+    def precision_overall(summary_df, overall_dic):
+        # detections / (false positives + detections)
+        found = overall_dic['num_detections']
+        return quiet_divide(found, overall_dic['num_false_positives'] + found)
+
+    @staticmethod
+    def recall_overall(summary_df, overall_dic):
+        return quiet_divide(overall_dic['num_detections'],
+                            overall_dic['num_objects'])
+
+    @staticmethod
+    def idp_overall(summary_df, overall_dic):
+        idtp = overall_dic['idtp']
+        return quiet_divide(idtp, idtp + overall_dic['idfp'])
+
+    @staticmethod
+    def idr_overall(summary_df, overall_dic):
+        idtp = overall_dic['idtp']
+        return quiet_divide(idtp, idtp + overall_dic['idfn'])
+
+    @staticmethod
+    def idf1_overall(summary_df, overall_dic):
+        # 2 * idtp / (objects + predictions)
+        total = overall_dic['num_objects'] + overall_dic['num_predictions']
+        return quiet_divide(2. * overall_dic['idtp'], total)
+
+
+def read_mcmot_results_union(filename, is_gt, is_ignore):
+    """
+    Read an MCMOT result file for evaluation with all categories merged.
+    Columns 0-7 of a row are frame id, track id, four tlwh values, score
+    and class id. Track ids are offset per category so that they stay
+    unique across categories.
+    Args:
+        filename (str): Comma-separated result or ground-truth file.
+        is_gt (bool): Ground truth; rows of class 0 are dropped, class ids
+            are shifted down by one and every score is set to 1.
+        is_ignore (bool): If True nothing is read.
+    Returns:
+        dict: frame id -> list of (tlwh, target_id, cls, score); empty when
+            the file is missing, unusable or ``is_ignore`` is set.
+    """
+    results_dict = dict()
+    if is_ignore or not os.path.isfile(filename):
+        return results_dict
+    # ndmin=2 keeps a one-line file two-dimensional
+    all_result = np.loadtxt(filename, delimiter=',', ndmin=2)
+    # column 7 (class id) is indexed below, so 8 columns are required
+    if all_result.shape[0] == 0 or all_result.shape[1] < 8:
+        return results_dict
+    if is_gt:
+        # only for test use
+        all_result = all_result[all_result[:, 7] != 0]
+        all_result[:, 7] = all_result[:, 7] - 1
+        if all_result.shape[0] == 0:
+            return results_dict
+
+    last_max_id = 0
+    result_cls_list = []
+    for cls in np.unique(all_result[:, 7]):
+        result_cls_split = all_result[all_result[:, 7] == cls]
+        # make sure track id different between every category
+        result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id
+        last_max_id = result_cls_split[:, 1].max() + 1
+        result_cls_list.append(result_cls_split)
+
+    for linelist in np.concatenate(result_cls_list):
+        fid = int(linelist[0])
+        if fid < 1:
+            continue
+        score = 1 if is_gt else float(linelist[6])
+        tlwh = tuple(map(float, linelist[2:6]))
+        results_dict.setdefault(fid, list()).append(
+            (tlwh, int(linelist[1]), int(linelist[7]), score))
+    return results_dict
+
+
+def read_mcmot_results(filename, is_gt, is_ignore):
+    """
+    Read an MCMOT result file grouped by category and then by frame.
+    Args:
+        filename (str): Comma-separated rows; fields 0-7 are used.
+        is_gt (bool): Shift class ids down by one and set every score to 1.
+        is_ignore (bool): Not used by this reader.
+    Returns:
+        dict: class id -> {frame id -> [(tlwh, target_id, score), ...]}.
+    """
+    results_dict = dict()
+    if not os.path.isfile(filename):
+        return results_dict
+    with open(filename, 'r') as f:
+        for line in f:
+            linelist = line.strip().split(',')
+            # the class id sits in field 7, so 8 fields are required
+            if len(linelist) < 8 or int(linelist[0]) < 1:
+                continue
+            fid, cid = int(linelist[0]), int(linelist[7])
+            # only for test use
+            cid, score = (cid - 1, 1) if is_gt else (cid, float(linelist[6]))
+            tlwh = tuple(map(float, linelist[2:6]))
+            frames = results_dict.setdefault(cid, dict())
+            frames.setdefault(fid, []).append((tlwh, int(linelist[1]), score))
+    return results_dict
+
+
+def read_results(filename,
+                 data_type,
+                 is_gt=False,
+                 is_ignore=False,
+                 multi_class=False,
+                 union=False):
+    """
+    Read tracking results with the reader that matches the evaluation mode.
+
+    Only ``data_type`` 'mcmot' / 'lab' with ``multi_class=True`` is
+    supported; anything else raises ValueError.
+    """
+    if data_type not in ['mcmot', 'lab']:
+        raise ValueError('Unknown data type: {}'.format(data_type))
+    if not multi_class:
+        raise ValueError('multi_class: {}, MCMOT should have cls_id.'.format(
+            multi_class))
+
+    # union: all categories are evaluated together, so track IDs must not
+    # repeat between categories; otherwise each category is read separately.
+    reader = read_mcmot_results_union if union else read_mcmot_results
+    return reader(filename, is_gt, is_ignore)
+
+
+def unzip_objs(objs):
+    """Split (tlwh, id, score) records into a box array and two sequences."""
+    columns = zip(*objs) if len(objs) > 0 else ([], [], [])
+    tlwhs, ids, scores = columns
+    # reshape(-1, 4) yields shape (0, 4) when there are no objects
+    boxes = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
+    return boxes, ids, scores
+
+
+def unzip_objs_cls(objs):
+    """Split (tlwh, id, cls, score) records into four numpy arrays."""
+    if len(objs) == 0:
+        columns = ([], [], [], [])
+    else:
+        columns = zip(*objs)
+    tlwhs, ids, cls, scores = columns
+    # reshape(-1, 4) yields shape (0, 4) when there are no objects
+    boxes = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
+    return boxes, np.array(ids), np.array(cls), np.array(scores)
+
+
+class MCMOTEvaluator(object):
+    """
+    Evaluate the MCMOT results of one sequence against its ground truth,
+    collecting one motmetrics accumulator per category.
+
+    Ground truth is read from ``<data_root>/../sequences/<seq_name>.txt``.
+    """
+
+    def __init__(self, data_root, seq_name, data_type, num_classes):
+        self.data_root = data_root
+        self.seq_name = seq_name
+        self.data_type = data_type
+        self.num_classes = num_classes
+
+        self.load_annotations()
+        try:
+            import motmetrics as mm
+            mm.lap.default_solver = 'lap'
+        except Exception as e:
+            raise RuntimeError(
+                'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
+            )
+        self.reset_accumulator()
+
+        self.class_accs = []
+
+    def load_annotations(self):
+        """Build the ground-truth path; warn when the file is missing."""
+        assert self.data_type == 'mcmot'
+        self.gt_filename = os.path.join(self.data_root, '../', 'sequences',
+                                        '{}.txt'.format(self.seq_name))
+        if not os.path.exists(self.gt_filename):
+            # fill in the placeholder, otherwise the log shows a bare '{}'
+            logger.warning(
+                "gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF."
+                .format(self.gt_filename))
+
+    def reset_accumulator(self):
+        """Replace ``self.acc`` with a new, empty MOTAccumulator."""
+        self.acc = mm.MOTAccumulator(auto_id=True)
+
+    def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False):
+        """
+        Feed one frame into the accumulator; with ``union`` the objects also
+        carry a class id and pairs of different classes get distance NaN.
+        Returns ``self.acc.mot_events`` when requested and available.
+        """
+        if union:
+            trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]
+            gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]
+        else:
+            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
+            gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
+
+        # get distance matrix
+        iou_distance = mm.distances.iou_matrix(
+            gt_tlwhs, trk_tlwhs, max_iou=0.5)
+
+        # Set the distance between objects of different categories to nan.
+        # When the number of GT or Trk is 0, iou_distance dimension is (0,0)
+        if union and len(gt_cls) != 0 and len(trk_cls) != 0:
+            same_cls = gt_cls.reshape(-1, 1) == trk_cls.reshape(1, -1)
+            iou_distance = np.where(same_cls, iou_distance, np.nan)
+
+        self.acc.update(gt_ids, trk_ids, iou_distance)
+
+        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
+                                                            'mot_events'):
+            events = self.acc.mot_events  # only supported by https://github.com/longcw/py-motmetrics
+        else:
+            events = None
+        return events
+
+    def eval_file(self, result_filename):
+        """Evaluate a result file per category; return ``self.class_accs``."""
+        # evaluation of each category
+        gt_frame_dict = read_results(
+            self.gt_filename, self.data_type, is_gt=True, multi_class=True,
+            union=False)
+        result_frame_dict = read_results(
+            result_filename, self.data_type, is_gt=False, multi_class=True,
+            union=False)
+
+        for cid in range(self.num_classes):
+            self.reset_accumulator()
+            cls_result_frame_dict = result_frame_dict.setdefault(cid, dict())
+            cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict())
+
+            # only labeled frames will be evaluated
+            for frame_id in sorted(cls_gt_frame_dict):
+                trk_objs = cls_result_frame_dict.get(frame_id, [])
+                gt_objs = cls_gt_frame_dict.get(frame_id, [])
+                self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False)
+
+            self.class_accs.append(self.acc)
+
+        return self.class_accs
+
+    @staticmethod
+    def get_summary(accs,
+                    names,
+                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
+                             'precision', 'recall')):
+        """Compute ``metrics`` for every accumulator plus an OVERALL row."""
+        names = copy.deepcopy(names)
+        if metrics is None:
+            metrics = mm.metrics.motchallenge_metrics
+        metrics = copy.deepcopy(metrics)
+
+        mh = mm.metrics.create()
+        summary = mh.compute_many(
+            accs, metrics=metrics, names=names, generate_overall=True)
+
+        return summary
+
+    @staticmethod
+    def save_summary(summary, filename):
+        """Write the summary to the Excel file ``filename``."""
+        import pandas as pd
+        # `with` saves and closes; ExcelWriter.save() is gone in pandas 2.x
+        with pd.ExcelWriter(filename) as writer:
+            summary.to_excel(writer)
+
+
+class MCMOTMetric(Metric):
+    """Gather per-class MCMOT summaries per sequence and report overalls."""
+
+    def __init__(self, num_classes, save_summary=False):
+        self.num_classes = num_classes
+        self.save_summary = save_summary
+        self.MCMOTEvaluator = MCMOTEvaluator
+        self.result_root = None
+        self.reset()
+        # summary row position -> single-row DataFrames, one per sequence
+        self.seqs_overall = defaultdict(list)
+
+    def reset(self):
+        self.accs = []
+        self.seqs = []
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        """Evaluate one sequence and store every row of its summary."""
+        evaluator = self.MCMOTEvaluator(data_root, seq, data_type,
+                                        self.num_classes)
+        seq_acc = evaluator.eval_file(result_filename)
+        self.accs.append(seq_acc)
+        self.seqs.append(seq)
+        self.result_root = result_root
+        names = ['{}_{}'.format(seq, i) for i in range(self.num_classes)]
+        summary = parse_accs_metrics(seq_acc, names)
+        summary.rename(
+            index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True)
+        for pos in range(len(summary)):
+            self.seqs_overall[pos].append(summary.iloc[pos:pos + 1])
+
+    def accumulate(self):
+        """Build one 'overall_calc_<row>' summary row for every class."""
+        self.cls_summary_list = []
+        for cid in range(self.num_classes):
+            per_seq = pd.concat(self.seqs_overall[cid])
+            cls_overall = seqs_overall_metrics(per_seq).iloc[-1:].copy()
+            cls_overall.rename(
+                index={'overall_calc': 'overall_calc_{}'.format(cid)},
+                inplace=True)
+            self.cls_summary_list.append(cls_overall)
+
+    def log(self):
+        """Print the overall tables across sequences and across classes."""
+        seqs_overall_metrics(
+            pd.concat(self.seqs_overall[self.num_classes]), verbose=True)
+        seqs_overall_metrics(pd.concat(self.cls_summary_list), verbose=True)
+
+    def get_results(self):
+        return 1
diff --git a/rtdetr_paddle/ppdet/metrics/metrics.py b/rtdetr_paddle/ppdet/metrics/metrics.py
new file mode 100644
index 0000000..4916ca4
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/metrics.py
@@ -0,0 +1,505 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import json
+import paddle
+import numpy as np
+import typing
+from collections import defaultdict
+from pathlib import Path
+
+from .map_utils import prune_zero_padding, DetectionMAP
+from .coco_utils import get_infer_results, cocoapi_eval
+from .widerface_utils import face_eval_run
+from ppdet.data.source.category import get_categories
+
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results',
+    'RBoxMetric', 'SNIPERCOCOMetric'
+]
+
+COCO_SIGMAS = np.array([  # passed as `sigmas` for the 'keypoints' style
+    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87,
+    .89, .89
+]) / 10.0
+CROWD_SIGMAS = np.array(  # passed as `sigmas` for 'keypoints_crowd'
+    [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79,
+     .79]) / 10.0
+
+
+class Metric(paddle.metric.Metric):
+    def name(self):
+        return self.__class__.__name__  # name of the concrete class
+
+    def reset(self):
+        pass
+
+    def accumulate(self):
+        pass
+
+    # paddle.metric.Metric defines :meth:`update`, :meth:`accumulate` and
+    # :meth:`reset`; in ppdet, the following 2 methods are needed as well:
+
+    # hook for logging metric results; does nothing in this base class
+    def log(self):
+        pass
+
+    # hook for getting metric results; returns None in this base class
+    def get_results(self):
+        pass
+
+
+class COCOMetric(Metric):
+    """
+    Collect detection / segmentation / keypoint predictions, dump them to
+    json files and evaluate them with ``cocoapi_eval``.
+
+    Args:
+        anno_file (str): Annotation file; it has to exist unless
+            ``save_prediction_only`` is set.
+        **kwargs: Optional settings, read with the defaults given here:
+            clsid2catid (None): class id to category id mapping, taken
+                from ``get_categories('COCO', anno_file)`` when missing.
+            classwise (False): forwarded to ``cocoapi_eval``.
+            output_eval (None): directory for the json files; it is
+                created when given, otherwise the files are written to
+                the working directory.
+            bias (0): forwarded to ``get_infer_results``.
+            save_prediction_only (False): only write the json files and
+                skip the evaluation.
+            IouType ('bbox'): 'keypoints_crowd' selects the crowd
+                keypoint evaluation, any other value the default one.
+    """
+
+    def __init__(self, anno_file, **kwargs):
+        self.anno_file = anno_file
+        self.clsid2catid = kwargs.get('clsid2catid', None)
+        if self.clsid2catid is None:
+            self.clsid2catid, _ = get_categories('COCO', anno_file)
+        self.classwise = kwargs.get('classwise', False)
+        self.output_eval = kwargs.get('output_eval', None)
+        # TODO: bias should be unified
+        self.bias = kwargs.get('bias', 0)
+        self.save_prediction_only = kwargs.get('save_prediction_only', False)
+        self.iou_type = kwargs.get('IouType', 'bbox')
+
+        if not self.save_prediction_only:
+            assert os.path.isfile(anno_file), \
+                    "anno_file {} not a file".format(anno_file)
+
+        if self.output_eval is not None:
+            # parents=True so that a nested output directory can be created
+            Path(self.output_eval).mkdir(parents=True, exist_ok=True)
+
+        self.reset()
+
+    def reset(self):
+        """Drop all collected predictions and evaluation results."""
+        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
+        self.eval_results = {}
+
+    def update(self, inputs, outputs):
+        """
+        Convert one batch of model outputs into result records.
+
+        Args:
+            inputs (dict|Sequence): Batch data holding 'im_id'; for a
+                sequence (multi-scale inputs) the first element is used.
+            outputs (dict): Model outputs; paddle Tensors are converted to
+                numpy arrays first.
+        """
+        outs = {}
+        # outputs Tensor -> numpy.ndarray
+        for k, v in outputs.items():
+            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
+
+        # multi-scale inputs: all inputs have same im_id
+        if isinstance(inputs, typing.Sequence):
+            im_id = inputs[0]['im_id']
+        else:
+            im_id = inputs['im_id']
+        outs['im_id'] = im_id.numpy() if isinstance(im_id,
+                                                    paddle.Tensor) else im_id
+
+        infer_results = get_infer_results(
+            outs, self.clsid2catid, bias=self.bias)
+        # extend only the result types that get_infer_results produced
+        for key in ('bbox', 'mask', 'segm', 'keypoint'):
+            if key in infer_results:
+                self.results[key] += infer_results[key]
+
+    def _save_results(self, key):
+        """
+        Write ``self.results[key]`` to '<key>.json' (inside ``output_eval``
+        when it is set) and return the path of the written file.
+        """
+        output = "{}.json".format(key)
+        if self.output_eval:
+            output = os.path.join(self.output_eval, output)
+        with open(output, 'w') as f:
+            json.dump(self.results[key], f)
+            logger.info('The {0} result is saved to {0}.json.'.format(key))
+        if self.save_prediction_only:
+            logger.info('The {} result is saved to {} and do not '
+                        'evaluate the mAP.'.format(key, output))
+        return output
+
+    def accumulate(self):
+        """
+        Save every non-empty result list and, unless
+        ``save_prediction_only`` is set, evaluate it with ``cocoapi_eval``.
+
+        The statistics are stored in ``self.eval_results`` under 'bbox',
+        'mask' (for both the 'mask' and the 'segm' results) and 'keypoint'.
+        """
+        # (result key, cocoapi_eval style, key in self.eval_results)
+        plain_tasks = (
+            ('bbox', 'bbox', 'bbox'),
+            ('mask', 'segm', 'mask'),
+            ('segm', 'segm', 'mask'), )
+        for key, style, stats_key in plain_tasks:
+            if len(self.results[key]) == 0:
+                continue
+            output = self._save_results(key)
+            if self.save_prediction_only:
+                continue
+            self.eval_results[stats_key] = cocoapi_eval(
+                output,
+                style,
+                anno_file=self.anno_file,
+                classwise=self.classwise)
+            sys.stdout.flush()
+
+        if len(self.results['keypoint']) > 0:
+            output = self._save_results('keypoint')
+            if not self.save_prediction_only:
+                style = 'keypoints'
+                use_area = True
+                sigmas = COCO_SIGMAS
+                if self.iou_type == 'keypoints_crowd':
+                    style = 'keypoints_crowd'
+                    use_area = False
+                    sigmas = CROWD_SIGMAS
+                keypoint_stats = cocoapi_eval(
+                    output,
+                    style,
+                    anno_file=self.anno_file,
+                    classwise=self.classwise,
+                    sigmas=sigmas,
+                    use_area=use_area)
+                self.eval_results['keypoint'] = keypoint_stats
+                sys.stdout.flush()
+
+    def log(self):
+        pass
+
+    def get_results(self):
+        """Return the dict of statistics filled by :meth:`accumulate`."""
+        return self.eval_results
+
+
+class VOCMetric(Metric):
+    def __init__(self,
+                 label_list,
+                 class_num=20,
+                 overlap_thresh=0.5,
+                 map_type='11point',
+                 is_bbox_normalized=False,
+                 evaluate_difficult=False,
+                 classwise=False,
+                 output_eval=None,
+                 save_prediction_only=False):
+        """
+        Load the categories from ``label_list`` and set up the DetectionMAP
+        that accumulates the VOC mAP.
+        """
+        assert os.path.isfile(label_list), \
+                "label_list {} not a file".format(label_list)
+        self.clsid2catid, self.catid2name = get_categories('VOC', label_list)
+        self.overlap_thresh = overlap_thresh
+        self.map_type = map_type
+        self.evaluate_difficult = evaluate_difficult
+        self.output_eval = output_eval
+        self.save_prediction_only = save_prediction_only
+        map_settings = dict(
+            class_num=class_num, overlap_thresh=overlap_thresh,
+            map_type=map_type, is_bbox_normalized=is_bbox_normalized,
+            evaluate_difficult=evaluate_difficult,
+            catid2name=self.catid2name, classwise=classwise)
+        self.detection_map = DetectionMAP(**map_settings)
+        self.reset()
+
+    def reset(self):
+        self.results = {'bbox': [], 'score': [], 'label': []}  # update() appends
+        self.detection_map.reset()
+
+    def update(self, inputs, outputs):
+        """
+        Record the predictions of one batch and, unless only predictions
+        are saved, feed them with the ground truth into the DetectionMAP.
+
+        Args:
+            inputs (dict): Batch data with 'gt_bbox', 'gt_class', 'difficult'
+                (read unless evaluate_difficult) and optional 'scale_factor'.
+            outputs (dict): Model outputs with 'bbox' and 'bbox_num'.
+        """
+        def _as_numpy(value):
+            # paddle Tensors are converted, anything else is passed through
+            return value.numpy() if isinstance(value, paddle.Tensor) else value
+
+        bbox_np = _as_numpy(outputs['bbox'])
+        # columns: label, score, then the box values
+        labels, scores, bboxes = bbox_np[:, 0], bbox_np[:, 1], bbox_np[:, 2:]
+        bbox_lengths = _as_numpy(outputs['bbox_num'])
+
+        self.results['bbox'].append(bboxes.tolist())
+        self.results['score'].append(scores.tolist())
+        self.results['label'].append(labels.tolist())
+
+        # NOTE(review): a (1, 1) box array looks like an "empty" marker
+        if bboxes.shape == (1, 1) or self.save_prediction_only:
+            return
+
+        gt_boxes = inputs['gt_bbox']
+        gt_labels = inputs['gt_class']
+        difficults = None if self.evaluate_difficult else inputs['difficult']
+
+        if 'scale_factor' in inputs:
+            scale_factor = _as_numpy(inputs['scale_factor'])
+        else:
+            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
+
+        bbox_idx = 0
+        for i in range(len(gt_boxes)):
+            gt_box = _as_numpy(gt_boxes[i])
+            # each scale_factor row is unpacked as (h, w)
+            h, w = scale_factor[i]
+            gt_box = gt_box / np.array([w, h, w, h])
+            gt_label = _as_numpy(gt_labels[i])
+            difficult = None if difficults is None else _as_numpy(difficults[i])
+            bbox_num = bbox_lengths[i]
+            img_slice = slice(bbox_idx, bbox_idx + bbox_num)
+            gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label,
+                                                             difficult)
+            self.detection_map.update(bboxes[img_slice], scores[img_slice],
+                                      labels[img_slice], gt_box, gt_label,
+                                      difficult)
+            bbox_idx += bbox_num
+
+    def accumulate(self):
+        output = "bbox.json"
+        if self.output_eval:
+            output = os.path.join(self.output_eval, output)
+            with open(output, 'w') as f:
+                json.dump(self.results, f)
+                logger.info('The bbox result is saved to bbox.json.')
+        if self.save_prediction_only:
+            return
+
+        logger.info("Accumulating evaluatation results...")
+        self.detection_map.accumulate()
+
+    def log(self):
+        map_stat = 100. * self.detection_map.get_map()
+        logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh,
+                                                       self.map_type, map_stat))
+
+    def get_results(self):
+        return {'bbox': [self.detection_map.get_map()]}
+
+
+class WiderFaceMetric(Metric):
+    """Thin Metric wrapper that forwards evaluation to ``face_eval_run``."""
+
+    def __init__(self, image_dir, anno_file, multi_scale=True):
+        self.image_dir = image_dir
+        self.anno_file = anno_file
+        self.multi_scale = multi_scale
+        self.clsid2catid, self.catid2name = get_categories('widerface')
+
+    def update(self, model):
+        """Call ``face_eval_run`` on ``model`` with pred_dir='output/pred'."""
+        eval_kwargs = dict(
+            pred_dir='output/pred',
+            eval_mode='widerface',
+            multi_scale=self.multi_scale)
+        face_eval_run(model, self.image_dir, self.anno_file, **eval_kwargs)
+
+
+class RBoxMetric(Metric):
+    """mAP metric whose ground truth comes from the 'gt_poly' input field."""
+
+    # (attribute, default) pairs copied from the constructor kwargs
+    _KWARG_DEFAULTS = (('classwise', False), ('output_eval', None),
+                       ('save_prediction_only', False), ('overlap_thresh', 0.5),
+                       ('map_type', '11point'), ('evaluate_difficult', False),
+                       ('imid2path', None))
+
+    def __init__(self, anno_file, **kwargs):
+        self.anno_file = anno_file
+        self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file)
+        self.catid2clsid = {v: k for k, v in self.clsid2catid.items()}
+        for attr, default in self._KWARG_DEFAULTS:
+            setattr(self, attr, kwargs.get(attr, default))
+        self.detection_map = DetectionMAP(
+            class_num=len(self.catid2name),
+            overlap_thresh=self.overlap_thresh,
+            map_type=self.map_type,
+            is_bbox_normalized=False,
+            evaluate_difficult=self.evaluate_difficult,
+            catid2name=self.catid2name,
+            classwise=self.classwise)
+        self.reset()
+
+    def reset(self):
+        """Clear stored predictions and the DetectionMAP state."""
+        self.results = []
+        self.detection_map.reset()
+
+    def update(self, inputs, outputs):
+        """Turn one batch into result dicts and feed it to DetectionMAP."""
+        # outputs Tensor -> numpy.ndarray
+        outs = {k: v.numpy() if isinstance(v, paddle.Tensor) else v
+                for k, v in outputs.items()}
+
+        im_id = inputs['im_id']
+        if isinstance(im_id, paddle.Tensor):
+            im_id = im_id.numpy()
+        outs['im_id'] = im_id
+
+        infer_results = get_infer_results(outs, self.clsid2catid)
+        infer_results = infer_results['bbox'] if 'bbox' in infer_results else []
+        self.results += infer_results
+        if self.save_prediction_only:
+            return
+
+        gt_boxes = inputs['gt_poly']
+        gt_labels = inputs['gt_class']
+
+        if 'scale_factor' in inputs:
+            scale_factor = inputs['scale_factor']
+            if isinstance(scale_factor, paddle.Tensor):
+                scale_factor = scale_factor.numpy()
+        else:
+            scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32')
+
+        for i in range(len(gt_boxes)):
+            gt_box = gt_boxes[i]
+            if isinstance(gt_box, paddle.Tensor):
+                gt_box = gt_box.numpy()
+            h, w = scale_factor[i]
+            gt_box = gt_box / np.array([w, h] * 4)
+            gt_label = gt_labels[i]
+            if isinstance(gt_label, paddle.Tensor):
+                gt_label = gt_label.numpy()
+            gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label)
+            # detections of this batch that belong to image i
+            dets = [
+                res for res in infer_results
+                if int(res['image_id']) == int(im_id[i])
+            ]
+            bbox = [res['bbox'] for res in dets]
+            score = [res['score'] for res in dets]
+            label = [self.catid2clsid[int(res['category_id'])] for res in dets]
+            self.detection_map.update(bbox, score, label, gt_box, gt_label)
+
+    def save_results(self, results, output_dir, imid2path):
+        """Write one txt per image if ``imid2path`` is given, else bbox.json."""
+        if not imid2path:
+            output = os.path.join(output_dir, "bbox.json")
+            with open(output, 'w') as f:
+                json.dump(results, f)
+            logger.info('The bbox result is saved to {}.'.format(output))
+            return
+
+        data_dicts = defaultdict(list)
+        for result in results:
+            data_dicts[result['image_id']].append(result)
+
+        for image_id, image_path in imid2path.items():
+            basename = os.path.splitext(os.path.split(image_path)[-1])[0]
+            output = os.path.join(output_dir, "{}.txt".format(basename))
+            dets = data_dicts.get(image_id, [])
+            with open(output, 'w') as f:
+                for det in dets:
+                    # line layout: "<category name> <score> <bbox values...>"
+                    head = '{} {} '.format(self.catid2name[det['category_id']],
+                                           det['score'])
+                    coords = ' '.join([str(e) for e in det['bbox']])
+                    f.write(head + coords + '\n')
+
+        logger.info('The bbox result is saved to {}.'.format(output_dir))
+
+    def accumulate(self):
+        """Save predictions if requested, then accumulate mAP unless skipped."""
+        if self.output_eval:
+            self.save_results(self.results, self.output_eval, self.imid2path)
+        if self.save_prediction_only:
+            return
+        logger.info("Accumulating evaluatation results...")
+        self.detection_map.accumulate()
+
+    def log(self):
+        map_stat = 100. * self.detection_map.get_map()
+        logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh,
+                                                       self.map_type, map_stat))
+
+    def get_results(self):
+        return {'bbox': [self.detection_map.get_map()]}
+
+
+class SNIPERCOCOMetric(COCOMetric):
+    """COCOMetric variant that aggregates chip detections before scoring."""
+
+    def __init__(self, anno_file, **kwargs):
+        super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs)
+        self.dataset = kwargs["dataset"]
+        self.chip_results = []
+
+    def reset(self):
+        # only bbox and mask evaluation support currently
+        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
+        self.eval_results = {}
+        self.chip_results = []
+
+    def update(self, inputs, outputs):
+        """Buffer one batch (Tensors converted to numpy) for accumulate()."""
+        outs = {k: v.numpy() if isinstance(v, paddle.Tensor) else v
+                for k, v in outputs.items()}
+        im_id = inputs['im_id']
+        if isinstance(im_id, paddle.Tensor):
+            im_id = im_id.numpy()
+        outs['im_id'] = im_id
+        self.chip_results.append(outs)
+
+    def accumulate(self):
+        """Merge buffered chip detections, then call COCOMetric.accumulate()."""
+        cropper = self.dataset.anno_cropper
+        for outs in cropper.aggregate_chips_detections(self.chip_results):
+            infer_results = get_infer_results(
+                outs, self.clsid2catid, bias=self.bias)
+            if 'bbox' in infer_results:
+                self.results['bbox'] += infer_results['bbox']
+
+        super(SNIPERCOCOMetric, self).accumulate()
diff --git a/rtdetr_paddle/ppdet/metrics/mot_metrics.py b/rtdetr_paddle/ppdet/metrics/mot_metrics.py
new file mode 100644
index 0000000..b5ed8a2
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/mot_metrics.py
@@ -0,0 +1,1246 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import copy
+import sys
+import math
+from collections import defaultdict
+import numpy as np
+
+from ppdet.modeling.bbox_utils import bbox_iou_np_expand
+from .map_utils import ap_per_class
+from .metrics import Metric
+from .munkres import Munkres
+
+try:
+    import motmetrics as mm
+    mm.lap.default_solver = 'lap'
+except Exception:
+    # Optional dependency: only warn here, MOTEvaluator raises when it is used.
+    print(
+        'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
+    )
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric']
+
+
+def read_mot_results(filename, is_gt=False, is_ignore=False):
+    """Read a MOT text file into {frame_id: [(tlwh, target_id, score)]}."""
+    valid_label = [1]
+    ignore_labels = [2, 7, 8, 12]  # only in motchallenge datasets like 'MOT16'
+    if is_gt:
+        logger.info(
+            "In MOT16/17 dataset the valid_label of ground truth is '{}', "
+            "in other dataset it should be '0' for single classs MOT.".format(
+                valid_label[0]))
+    mot_ignore = is_ignore and any(
+        tag in filename for tag in ('MOT16-', 'MOT17-', 'MOT15-', 'MOT20-'))
+    results_dict = dict()
+    if os.path.isfile(filename):
+        with open(filename, 'r') as f:
+            for line in f:
+                linelist = line.split(',')
+                if len(linelist) < 7:
+                    continue
+                fid = int(linelist[0])
+                if fid < 1:
+                    continue
+                results_dict.setdefault(fid, list())
+                if is_gt:
+                    if len(linelist) < 8:  # no label column: skip, don't crash
+                        continue
+                    label = int(float(linelist[7]))
+                    mark = int(float(linelist[6]))
+                    if mark == 0 or label not in valid_label:
+                        continue
+                    score = 1
+                elif is_ignore:
+                    if not mot_ignore or len(linelist) < 9:  # needs cols 7-8
+                        continue
+                    label = int(float(linelist[7]))
+                    vis_ratio = float(linelist[8])
+                    if label not in ignore_labels and vis_ratio >= 0:
+                        continue
+                    score = 1
+                else:
+                    score = float(linelist[6])
+                results_dict[fid].append(
+                    (tuple(map(float, linelist[2:6])), int(linelist[1]), score))
+    return results_dict
+
+
+"""
+MOT dataset label list, see in https://motchallenge.net
+labels={'ped', ...			    % 1
+        'person_on_vhcl', ...	% 2
+        'car', ...				% 3
+        'bicycle', ...			% 4
+        'mbike', ...			% 5
+        'non_mot_vhcl', ...		% 6
+        'static_person', ...	% 7
+        'distractor', ...		% 8
+        'occluder', ...			% 9
+        'occluder_on_grnd', ...	% 10
+        'occluder_full', ...	% 11
+        'reflection', ...		% 12
+        'crowd' ...			    % 13
+};
+"""
+
+
+def unzip_objs(objs):
+    """Split (tlwh, id, score) triples; boxes come back as an (N, 4) array."""
+    boxes, ids, scores = [], [], []
+    if len(objs) > 0:
+        boxes, ids, scores = zip(*objs)
+    boxes = np.asarray(boxes, dtype=float).reshape(-1, 4)
+    return boxes, ids, scores
+
+
+class MOTEvaluator(object):
+    """Evaluates one tracking result file against ``<seq_name>/gt/gt.txt``."""
+
+    def __init__(self, data_root, seq_name, data_type):
+        self.data_root = data_root
+        self.seq_name = seq_name
+        self.data_type = data_type
+        self.load_annotations()
+        try:
+            import motmetrics as mm
+            mm.lap.default_solver = 'lap'
+        except Exception:
+            raise RuntimeError(
+                'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
+            )
+        self.reset_accumulator()
+
+    def load_annotations(self):
+        """Read the ground-truth and ignore boxes of the sequence."""
+        assert self.data_type == 'mot'
+        gt_filename = os.path.join(self.data_root, self.seq_name, 'gt',
+                                   'gt.txt')
+        if not os.path.exists(gt_filename):
+            logger.warning(
+                "gt_filename '{}' of MOTEvaluator is not exist, so the MOTA will be -INF."
+                .format(gt_filename))
+        self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True)
+        self.gt_ignore_frame_dict = read_mot_results(
+            gt_filename, is_ignore=True)
+
+    def reset_accumulator(self):
+        self.acc = mm.MOTAccumulator(auto_id=True)
+
+    def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False):
+        """Update the accumulator with one frame; return its events or None."""
+        # results
+        trk_tlwhs = np.copy(trk_tlwhs)
+        trk_ids = np.copy(trk_ids)
+
+        # gts
+        gt_objs = self.gt_frame_dict.get(frame_id, [])
+        gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
+
+        # ignore boxes
+        ignore_objs = self.gt_ignore_frame_dict.get(frame_id, [])
+        ignore_tlwhs = unzip_objs(ignore_objs)[0]
+
+        # remove ignored results
+        keep = np.ones(len(trk_tlwhs), dtype=bool)
+        iou_distance = mm.distances.iou_matrix(
+            ignore_tlwhs, trk_tlwhs, max_iou=0.5)
+        if len(iou_distance) > 0:
+            match_is, match_js = mm.lap.linear_sum_assignment(iou_distance)
+            match_is = np.asarray(match_is, dtype=int)
+            match_js = np.asarray(match_js, dtype=int)
+            match_ious = iou_distance[match_is, match_js]
+            # drop only the tracks whose assignment distance is not NaN
+            match_js = match_js[np.logical_not(np.isnan(match_ious))]
+            keep[match_js] = False
+            trk_tlwhs = trk_tlwhs[keep]
+            trk_ids = trk_ids[keep]
+
+        # get distance matrix
+        iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5)
+        # acc
+        self.acc.update(gt_ids, trk_ids, iou_distance)
+
+        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
+                                                            'last_mot_events'):
+            events = self.acc.last_mot_events  # only supported by https://github.com/longcw/py-motmetrics
+        else:
+            events = None
+        return events
+
+    def eval_file(self, filename):
+        """Evaluate every frame of a result file and return the accumulator."""
+        self.reset_accumulator()
+        result_frame_dict = read_mot_results(filename, is_gt=False)
+        for frame_id in sorted(result_frame_dict):
+            trk_objs = result_frame_dict[frame_id]
+            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
+            self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False)
+        return self.acc
+
+    @staticmethod
+    def get_summary(accs,
+                    names,
+                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
+                             'precision', 'recall')):
+        """Compute ``metrics`` for every accumulator plus an overall entry."""
+        names = copy.deepcopy(names)
+        if metrics is None:
+            metrics = mm.metrics.motchallenge_metrics
+        metrics = copy.deepcopy(metrics)
+        mh = mm.metrics.create()
+        return mh.compute_many(
+            accs, metrics=metrics, names=names, generate_overall=True)
+
+    @staticmethod
+    def save_summary(summary, filename):
+        """Write ``summary`` to the Excel file ``filename``."""
+        import pandas as pd
+        # ExcelWriter.save() was removed in pandas 2.0; use the context manager
+        with pd.ExcelWriter(filename) as writer:
+            summary.to_excel(writer)
+
+
+class MOTMetric(Metric):
+    """Collects one accumulator per sequence and renders a MOT summary."""
+
+    def __init__(self, save_summary=False):
+        self.save_summary = save_summary
+        self.MOTEvaluator = MOTEvaluator
+        self.result_root = None
+        self.reset()
+
+    def reset(self):
+        self.accs, self.seqs = [], []
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        seq_evaluator = self.MOTEvaluator(data_root, seq, data_type)
+        self.accs.append(seq_evaluator.eval_file(result_filename))
+        self.seqs.append(seq)
+        self.result_root = result_root
+
+    def accumulate(self):
+        """Render the summary string; optionally save it as summary.xlsx."""
+        summary = self.MOTEvaluator.get_summary(
+            self.accs, self.seqs, mm.metrics.motchallenge_metrics)
+        self.strsummary = mm.io.render_summary(
+            summary, formatters=mm.metrics.create().formatters,
+            namemap=mm.io.motchallenge_metric_names)
+        if self.save_summary:
+            xlsx_path = os.path.join(self.result_root, 'summary.xlsx')
+            self.MOTEvaluator.save_summary(summary, xlsx_path)
+
+    def log(self):
+        print(self.strsummary)
+
+    def get_results(self):
+        return self.strsummary
+
+
+class JDEDetMetric(Metric):
+    # Note this detection AP metric is different from COCOMetric or VOCMetric,
+    # and the bboxes coordinates are not scaled to the original image
+    def __init__(self, overlap_thresh=0.5):
+        self.overlap_thresh = overlap_thresh
+        self.reset()
+
+    def reset(self):
+        self.AP_accum = np.zeros(1)
+        self.AP_accum_count = np.zeros(1)
+
+    def update(self, inputs, outputs):
+        """Add the AP of one batch of detections to the running totals."""
+        preds = outputs['bbox']
+        bboxes = preds[:, 2:].numpy()
+        scores = preds[:, 1].numpy()
+        labels = preds[:, 0].numpy()
+        bbox_lengths = outputs['bbox_num'].numpy()
+        if bboxes.shape[0] == 1 and bboxes.sum() == 0.0:
+            return
+
+        gt_boxes = inputs['gt_bbox'].numpy()[0]
+        gt_labels = inputs['gt_class'].numpy()[0]
+        if gt_labels.shape[0] == 0:
+            return
+
+        obj_pred = 0  # every prediction is compared against class 0
+        correct, detected = [], []
+        for pred_bbox in bboxes:
+            # Compute iou with target boxes
+            iou = bbox_iou_np_expand(
+                pred_bbox.reshape(1, 4), gt_boxes, x1y1x2y2=True)[0]
+            # Extract index of largest overlap
+            best_i = np.argmax(iou)
+            # A hit needs enough overlap, a matching class and an unclaimed gt
+            is_hit = (iou[best_i] > self.overlap_thresh and
+                      obj_pred == gt_labels[best_i] and
+                      best_i not in detected)
+            correct.append(1 if is_hit else 0)
+            if is_hit:
+                detected.append(best_i)
+
+        # Compute Average Precision (AP) per class
+        target_cls = list(gt_labels.T[0])
+        AP, AP_class, R, P = ap_per_class(
+            tp=correct,
+            conf=scores,
+            pred_cls=np.zeros_like(scores),
+            target_cls=target_cls)
+        self.AP_accum_count += np.bincount(AP_class, minlength=1)
+        self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP)
+
+    def accumulate(self):
+        logger.info("Accumulating evaluatation results...")
+        self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16)
+
+    def log(self):
+        logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh,
+                                                   100. * self.map_stat))
+
+    def get_results(self):
+        return self.map_stat
+
+
+"""
+The following code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py
+"""
+
+
+class tData:
+    """
+        Plain record holding the fields of one KITTI-format tracking row.
+    """
+    def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\
+                 obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\
+                 X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1):
+        """
+            Store every argument as an attribute of the same name.
+        """
+        # Assignment order is part of the behaviour: __str__ prints the
+        # attributes in the order they were set.
+        given = (
+            ('frame', frame),
+            ('track_id', track_id),
+            ('obj_type', obj_type),
+            ('truncation', truncation),
+            ('occlusion', occlusion),
+            ('obs_angle', obs_angle),
+            ('x1', x1), ('y1', y1), ('x2', x2), ('y2', y2),
+            ('w', w), ('h', h), ('l', l),
+            ('X', X), ('Y', Y), ('Z', Z),
+            ('yaw', yaw),
+            ('score', score), )
+        for name, value in given:
+            setattr(self, name, value)
+        # bookkeeping fields that are not constructor arguments
+        self.ignored = False
+        self.valid = False
+        self.tracker = -1
+
+    def __str__(self):
+        """Return one "name: value" line per attribute."""
+        return '\n'.join(
+            "%s: %s" % (name, value) for name, value in vars(self).items())
+
+
+class KITTIEvaluation(object):
+    """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall)
+             MOTA	- Multi-object tracking accuracy in [0,100]
+             MOTP	- Multi-object tracking precision in [0,100] (3D) / [td,100] (2D)
+             MOTAL	- Multi-object tracking accuracy in [0,100] with log10(id-switches)
+
+             id-switches - number of id switches
+             fragments   - number of fragmentations
+
+             MT, PT, ML	- number of mostly tracked, partially tracked and mostly lost trajectories
+
+             recall	        - recall = percentage of detected targets
+             precision	    - precision = percentage of correctly detected targets
+             FAR		    - number of false alarms per frame
+             falsepositives - number of false positives (FP)
+             missed         - number of missed targets (FN)
+    """
+    def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\
+                min_height = 25, max_occlusion = 2, cls="car",\
+                n_frames=[], seqs=[], n_sequences=0):
+        # get number of sequences and
+        # get number of frames per sequence from test mapping
+        # (created while extracting the benchmark)
+        self.gt_path = os.path.join(gt_path, "../labels")
+        self.n_frames = n_frames
+        self.sequence_name = seqs
+        self.n_sequences = n_sequences
+
+        self.cls = cls  # class to evaluate, i.e. pedestrian or car
+
+        self.result_path = result_path
+
+        # statistics and numbers for evaluation
+        self.n_gt = 0  # number of ground truth detections minus ignored false negatives and true positives
+        self.n_igt = 0  # number of ignored ground truth detections
+        self.n_gts = [
+        ]  # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE
+        self.n_igts = [
+        ]  # number of ground ignored truth detections PER SEQUENCE
+        self.n_gt_trajectories = 0
+        self.n_gt_seq = []
+        self.n_tr = 0  # number of tracker detections minus ignored tracker detections
+        self.n_trs = [
+        ]  # number of tracker detections minus ignored tracker detections PER SEQUENCE
+        self.n_itr = 0  # number of ignored tracker detections
+        self.n_itrs = []  # number of ignored tracker detections PER SEQUENCE
+        self.n_igttr = 0  # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored
+        self.n_tr_trajectories = 0
+        self.n_tr_seq = []
+        self.MOTA = 0
+        self.MOTP = 0
+        self.MOTAL = 0
+        self.MODA = 0
+        self.MODP = 0
+        self.MODP_t = []
+        self.recall = 0
+        self.precision = 0
+        self.F1 = 0
+        self.FAR = 0
+        self.total_cost = 0
+        self.itp = 0  # number of ignored true positives
+        self.itps = []  # number of ignored true positives PER SEQUENCE
+        self.tp = 0  # number of true positives including ignored true positives!
+        self.tps = [
+        ]  # number of true positives including ignored true positives PER SEQUENCE
+        self.fn = 0  # number of false negatives WITHOUT ignored false negatives
+        self.fns = [
+        ]  # number of false negatives WITHOUT ignored false negatives PER SEQUENCE
+        self.ifn = 0  # number of ignored false negatives
+        self.ifns = []  # number of ignored false negatives PER SEQUENCE
+        self.fp = 0  # number of false positives
+        # a bit tricky, the number of ignored false negatives and ignored true positives 
+        # is subtracted, but if both tracker detection and ground truth detection
+        # are ignored this number is added again to avoid double counting
+        self.fps = []  # above PER SEQUENCE
+        self.mme = 0
+        self.fragments = 0
+        self.id_switches = 0
+        self.MT = 0
+        self.PT = 0
+        self.ML = 0
+
+        self.min_overlap = min_overlap  # minimum bounding box overlap for 3rd party metrics
+        self.max_truncation = max_truncation  # maximum truncation of an object for evaluation
+        self.max_occlusion = max_occlusion  # maximum occlusion of an object for evaluation
+        self.min_height = min_height  # minimum height of an object for evaluation
+        self.n_sample_points = 500
+
+        # this should be enough to hold all groundtruth trajectories
+        # is expanded if necessary and reduced in any case
+        self.gt_trajectories = [[] for x in range(self.n_sequences)]
+        self.ign_trajectories = [[] for x in range(self.n_sequences)]
+
+    def loadGroundtruth(self):
+        try:
+            self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True)
+        except IOError:
+            return False
+        return True
+
+    def loadTracker(self):
+        try:
+            if not self._loadData(
+                    self.result_path, cls=self.cls, loading_groundtruth=False):
+                return False
+        except IOError:
+            return False
+        return True
+
+    def _loadData(self,
+                  root_dir,
+                  cls,
+                  min_score=-1000,
+                  loading_groundtruth=False):
+        """
+            Generic loader for ground truth and tracking data.
+            Use loadGroundtruth() or loadTracker() to load this data.
+            Loads detections in KITTI format from textfiles.
+        """
+        # construct objectDetections object to hold detection data
+        t_data = tData()
+        data = []
+        eval_2d = True
+        eval_3d = True
+
+        seq_data = []
+        n_trajectories = 0
+        n_trajectories_seq = []
+        for seq, s_name in enumerate(self.sequence_name):
+            i = 0
+            filename = os.path.join(root_dir, "%s.txt" % s_name)
+            f = open(filename, "r")
+
+            f_data = [
+                [] for x in range(self.n_frames[seq])
+            ]  # current set has only 1059 entries, sufficient length is checked anyway
+            ids = []
+            n_in_seq = 0
+            id_frame_cache = []
+            for line in f:
+                # KITTI tracking benchmark data format:
+                # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry)
+                line = line.strip()
+                fields = line.split(" ")
+                # classes that should be loaded (ignored neighboring classes)
+                if "car" in cls.lower():
+                    classes = ["car", "van"]
+                elif "pedestrian" in cls.lower():
+                    classes = ["pedestrian", "person_sitting"]
+                else:
+                    classes = [cls.lower()]
+                classes += ["dontcare"]
+                if not any([s for s in classes if s in fields[2].lower()]):
+                    continue
+                # get fields from table
+                t_data.frame = int(float(fields[0]))  # frame
+                t_data.track_id = int(float(fields[1]))  # id
+                t_data.obj_type = fields[
+                    2].lower()  # object type [car, pedestrian, cyclist, ...]
+                t_data.truncation = int(
+                    float(fields[3]))  # truncation [-1,0,1,2]
+                t_data.occlusion = int(
+                    float(fields[4]))  # occlusion  [-1,0,1,2]
+                t_data.obs_angle = float(fields[5])  # observation angle [rad]
+                t_data.x1 = float(fields[6])  # left   [px]
+                t_data.y1 = float(fields[7])  # top    [px]
+                t_data.x2 = float(fields[8])  # right  [px]
+                t_data.y2 = float(fields[9])  # bottom [px]
+                t_data.h = float(fields[10])  # height [m]
+                t_data.w = float(fields[11])  # width  [m]
+                t_data.l = float(fields[12])  # length [m]
+                t_data.X = float(fields[13])  # X [m]
+                t_data.Y = float(fields[14])  # Y [m]
+                t_data.Z = float(fields[15])  # Z [m]
+                t_data.yaw = float(fields[16])  # yaw angle [rad]
+                if not loading_groundtruth:
+                    if len(fields) == 17:
+                        t_data.score = -1
+                    elif len(fields) == 18:
+                        t_data.score = float(fields[17])  # detection score
+                    else:
+                        logger.info("file is not in KITTI format")
+                        return
+
+                # do not consider objects marked as invalid
+                if t_data.track_id is -1 and t_data.obj_type != "dontcare":
+                    continue
+
+                idx = t_data.frame
+                # check if length for frame data is sufficient
+                if idx >= len(f_data):
+                    print("extend f_data", idx, len(f_data))
+                    f_data += [[] for x in range(max(500, idx - len(f_data)))]
+                try:
+                    id_frame = (t_data.frame, t_data.track_id)
+                    if id_frame in id_frame_cache and not loading_groundtruth:
+                        logger.info(
+                            "track ids are not unique for sequence %d: frame %d"
+                            % (seq, t_data.frame))
+                        logger.info(
+                            "track id %d occurred at least twice for this frame"
+                            % t_data.track_id)
+                        logger.info("Exiting...")
+                        #continue # this allows to evaluate non-unique result files
+                        return False
+                    id_frame_cache.append(id_frame)
+                    f_data[t_data.frame].append(copy.copy(t_data))
+                except:
+                    print(len(f_data), idx)
+                    raise
+
+                if t_data.track_id not in ids and t_data.obj_type != "dontcare":
+                    ids.append(t_data.track_id)
+                    n_trajectories += 1
+                    n_in_seq += 1
+
+                # check if uploaded data provides information for 2D and 3D evaluation
+                if not loading_groundtruth and eval_2d is True and (
+                        t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or
+                        t_data.y2 == -1):
+                    eval_2d = False
+                if not loading_groundtruth and eval_3d is True and (
+                        t_data.X == -1000 or t_data.Y == -1000 or
+                        t_data.Z == -1000):
+                    eval_3d = False
+
+            # only add existing frames
+            n_trajectories_seq.append(n_in_seq)
+            seq_data.append(f_data)
+            f.close()
+
+        if not loading_groundtruth:
+            self.tracker = seq_data
+            self.n_tr_trajectories = n_trajectories
+            self.eval_2d = eval_2d
+            self.eval_3d = eval_3d
+            self.n_tr_seq = n_trajectories_seq
+            if self.n_tr_trajectories == 0:
+                return False
+        else:
+            # split ground truth and DontCare areas
+            self.dcareas = []
+            self.groundtruth = []
+            for seq_idx in range(len(seq_data)):
+                seq_gt = seq_data[seq_idx]
+                s_g, s_dc = [], []
+                for f in range(len(seq_gt)):
+                    all_gt = seq_gt[f]
+                    g, dc = [], []
+                    for gg in all_gt:
+                        if gg.obj_type == "dontcare":
+                            dc.append(gg)
+                        else:
+                            g.append(gg)
+                    s_g.append(g)
+                    s_dc.append(dc)
+                self.dcareas.append(s_dc)
+                self.groundtruth.append(s_g)
+            self.n_gt_seq = n_trajectories_seq
+            self.n_gt_trajectories = n_trajectories
+        return True
+
+    def boxoverlap(self, a, b, criterion="union"):
+        """
+            Compute the overlap of two axis-aligned boxes given in KITTI format.
+
+            Both a and b must expose the attributes x1, y1, x2 and y2.
+
+            criterion selects the normalisation (case-insensitive):
+              - "union": overlap = (a inter b) / (a union b)
+              - "a":     overlap = (a inter b) / area(a), where b is expected
+                         to be a dontcare area
+
+            Returns 0. when the boxes do not intersect (the criterion is not
+            inspected in that case). Raises TypeError for any other criterion.
+        """
+        # extent of the intersection rectangle along each axis
+        inter_w = min(a.x2, b.x2) - max(a.x1, b.x1)
+        inter_h = min(a.y2, b.y2) - max(a.y1, b.y1)
+
+        # touching or disjoint boxes have no overlap at all
+        if inter_w <= 0. or inter_h <= 0.:
+            return 0.
+
+        inter = inter_w * inter_h
+        area_a = (a.x2 - a.x1) * (a.y2 - a.y1)
+        area_b = (b.x2 - b.x1) * (b.y2 - b.y1)
+
+        mode = criterion.lower()
+        if mode == "union":
+            # intersection over union
+            return inter / float(area_a + area_b - inter)
+        if mode == "a":
+            # intersection over the area of the first box
+            return float(inter) / float(area_a)
+        raise TypeError("Unkown type for criterion")
+
+    def compute3rdPartyMetrics(self):
+        """
+            Computes the metrics defined in
+                - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance: The CLEAR MOT Metrics
+                  MOTA, MOTAL, MOTP
+                - Nevatia 2008: Global Data Association for Multi-Object Tracking Using Network Flows
+                  MT/PT/ML
+
+            The method works in three stages:
+              1. For every frame of every sequence, ground truth boxes are
+                 associated with tracker boxes by the Hungarian method, using
+                 1 - boxoverlap as cost and self.min_overlap as gate. The
+                 frame results are accumulated into the counters on self
+                 (tp, fn, fp, itp, ifn, n_gt, n_tr, n_igt, n_itr, n_igttr,
+                 total_cost, MODP_t) and into the per-sequence lists.
+              2. The association history of every ground truth trajectory is
+                 scanned to count id switches and fragments and to classify
+                 the trajectory as mostly tracked / partly tracked / mostly
+                 lost.
+              3. Recall, precision, F1, FAR, MOTA, MODA, MOTP, MOTAL and MODP
+                 are derived from the accumulated counters.
+
+            Returns:
+                bool: always True.
+
+            Raises:
+                NameError: if one of the per-frame sanity checks on the
+                    TP/FP/FN bookkeeping fails.
+        """
+        # construct Munkres object for Hungarian Method association
+        hm = Munkres()
+        max_cost = 1e9
+
+        # go through all frames and associate ground truth and tracker results
+        # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections
+        fr, ids = 0, 0
+        for seq_idx in range(len(self.groundtruth)):
+            seq_gt = self.groundtruth[seq_idx]
+            seq_dc = self.dcareas[seq_idx]  # don't care areas
+            seq_tracker = self.tracker[seq_idx]
+            seq_trajectories = defaultdict(list)
+            seq_ignored = defaultdict(list)
+
+            # statistics over the current sequence, check the corresponding
+            # variable comments in __init__ to get their meaning
+            seqtp = 0
+            seqitp = 0
+            seqfn = 0
+            seqifn = 0
+            seqfp = 0
+            seqigt = 0
+            seqitr = 0
+
+            last_ids = [[], []]
+            n_gts = 0
+            n_trs = 0
+
+            for f in range(len(seq_gt)):
+                g = seq_gt[f]
+                dc = seq_dc[f]
+
+                t = seq_tracker[f]
+                # counting total number of ground truth and tracker objects
+                self.n_gt += len(g)
+                self.n_tr += len(t)
+
+                n_gts += len(g)
+                n_trs += len(t)
+
+                # use hungarian method to associate, using boxoverlap 0..1 as cost
+                # build cost matrix
+                cost_matrix = []
+                this_ids = [[], []]
+                for gg in g:
+                    # save current ids
+                    this_ids[0].append(gg.track_id)
+                    this_ids[1].append(-1)
+                    gg.tracker = -1
+                    gg.id_switch = 0
+                    gg.fragmentation = 0
+                    cost_row = []
+                    for tt in t:
+                        # overlap == 1 is cost ==0
+                        c = 1 - self.boxoverlap(gg, tt)
+                        # gating for boxoverlap
+                        if c <= self.min_overlap:
+                            cost_row.append(c)
+                        else:
+                            cost_row.append(max_cost)  # = 1e9
+                    cost_matrix.append(cost_row)
+                    # all ground truth trajectories are initially not associated
+                    # extend groundtruth trajectories lists (merge lists)
+                    seq_trajectories[gg.track_id].append(-1)
+                    seq_ignored[gg.track_id].append(False)
+
+                # compare by value: "is 0" tests object identity, which is
+                # not a reliable way to compare integers
+                if len(g) == 0:
+                    cost_matrix = [[]]
+                # associate
+                association_matrix = hm.compute(cost_matrix)
+
+                # tmp variables for sanity checks and MODP computation
+                tmptp = 0
+                tmpfp = 0
+                tmpfn = 0
+                tmpc = 0  # this will sum up the overlaps for all true positives
+                tmpcs = [0] * len(
+                    g)  # this will save the overlaps for all true positives
+                # the reason is that some true positives might be ignored
+                # later such that the corresponding overlaps can
+                # be subtracted from tmpc for MODP computation
+
+                # mapping for tracker ids and ground truth ids
+                for row, col in association_matrix:
+                    # apply gating on boxoverlap
+                    c = cost_matrix[row][col]
+                    if c < max_cost:
+                        g[row].tracker = t[col].track_id
+                        this_ids[1][row] = t[col].track_id
+                        t[col].valid = True
+                        g[row].distance = c
+                        self.total_cost += 1 - c
+                        tmpc += 1 - c
+                        tmpcs[row] = 1 - c
+                        seq_trajectories[g[row].track_id][-1] = t[col].track_id
+
+                        # true positives are only valid associations
+                        self.tp += 1
+                        tmptp += 1
+                    else:
+                        g[row].tracker = -1
+                        self.fn += 1
+                        tmpfn += 1
+
+                # associate tracker and DontCare areas
+                # ignore tracker in neighboring classes
+                nignoredtracker = 0  # number of ignored tracker detections
+                ignoredtrackers = dict()  # will associate the track_id with -1
+                # if it is not ignored and 1 if it is
+                # ignored;
+                # this is used to avoid double counting ignored
+                # cases, see the next loop
+
+                for tt in t:
+                    ignoredtrackers[tt.track_id] = -1
+                    # ignore detection if it belongs to a neighboring class or is
+                    # smaller or equal to the minimum height
+
+                    tt_height = abs(tt.y1 - tt.y2)
+                    if ((self.cls == "car" and tt.obj_type == "van") or
+                        (self.cls == "pedestrian" and
+                         tt.obj_type == "person_sitting") or
+                            tt_height <= self.min_height) and not tt.valid:
+                        nignoredtracker += 1
+                        tt.ignored = True
+                        ignoredtrackers[tt.track_id] = 1
+                        continue
+                    for d in dc:
+                        overlap = self.boxoverlap(tt, d, "a")
+                        if overlap > 0.5 and not tt.valid:
+                            tt.ignored = True
+                            nignoredtracker += 1
+                            ignoredtrackers[tt.track_id] = 1
+                            break
+
+                # check for ignored FN/TP (truncation or neighboring object class)
+                ignoredfn = 0  # the number of ignored false negatives
+                nignoredtp = 0  # the number of ignored true positives
+                nignoredpairs = 0  # the number of ignored pairs, i.e. a true positive
+                # which is ignored but where the associated tracker
+                # detection has already been ignored
+
+                gi = 0
+                for gg in g:
+                    if gg.tracker < 0:
+                        if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\
+                                or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"):
+                            seq_ignored[gg.track_id][-1] = True
+                            gg.ignored = True
+                            ignoredfn += 1
+
+                    elif gg.tracker >= 0:
+                        if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\
+                                or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"):
+
+                            seq_ignored[gg.track_id][-1] = True
+                            gg.ignored = True
+                            nignoredtp += 1
+
+                            # if the associated tracker detection is already ignored,
+                            # we want to avoid double counting ignored detections
+                            if ignoredtrackers[gg.tracker] > 0:
+                                nignoredpairs += 1
+
+                            # for computing MODP, the overlaps from ignored detections
+                            # are subtracted
+                            tmpc -= tmpcs[gi]
+                    gi += 1
+
+                # the below might be confusing, check the comments in __init__
+                # to see what the individual statistics represent
+
+                # correct TP by number of ignored TP due to truncation
+                # ignored TP are shown as tracked in visualization
+                tmptp -= nignoredtp
+
+                # count the number of ignored true positives
+                self.itp += nignoredtp
+
+                # adjust the number of ground truth objects considered
+                self.n_gt -= (ignoredfn + nignoredtp)
+
+                # count the number of ignored ground truth objects
+                self.n_igt += ignoredfn + nignoredtp
+
+                # count the number of ignored tracker objects
+                self.n_itr += nignoredtracker
+
+                # count the number of ignored pairs, i.e. associated tracker and
+                # ground truth objects that are both ignored
+                self.n_igttr += nignoredpairs
+
+                # false negatives = associated gt bboxes exceeding association threshold + non-associated gt bboxes
+                tmpfn += len(g) - len(association_matrix) - ignoredfn
+                self.fn += len(g) - len(association_matrix) - ignoredfn
+                self.ifn += ignoredfn
+
+                # false positives = tracker bboxes - associated tracker bboxes
+                # mismatches (mme_t)
+                tmpfp += len(
+                    t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
+                self.fp += len(
+                    t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
+
+                # update sequence data
+                seqtp += tmptp
+                seqitp += nignoredtp
+                seqfp += tmpfp
+                seqfn += tmpfn
+                seqifn += ignoredfn
+                seqigt += ignoredfn + nignoredtp
+                seqitr += nignoredtracker
+
+                # sanity checks
+                # - the number of true positives minus ignored true positives
+                #   should be greater or equal to 0
+                # - the number of false negatives should be greater or equal to 0
+                # - the number of false positives needs to be greater or equal to 0
+                #   otherwise ignored detections might be counted double
+                # - the number of counted true positives (plus ignored ones)
+                #   and the number of counted false negatives (plus ignored ones)
+                #   should match the total number of ground truth objects
+                # - the number of counted true positives (plus ignored ones)
+                #   and the number of counted false positives
+                #   plus the number of ignored tracker detections should
+                #   match the total number of tracker detections; note that
+                #   nignoredpairs is subtracted here to avoid double counting
+                #   of ignored detections in nignoredtp and nignoredtracker
+                if tmptp < 0:
+                    print(tmptp, nignoredtp)
+                    raise NameError("Something went wrong! TP is negative")
+                if tmpfn < 0:
+                    print(tmpfn,
+                          len(g),
+                          len(association_matrix), ignoredfn, nignoredpairs)
+                    raise NameError("Something went wrong! FN is negative")
+                if tmpfp < 0:
+                    print(tmpfp,
+                          len(t), tmptp, nignoredtracker, nignoredtp,
+                          nignoredpairs)
+                    raise NameError("Something went wrong! FP is negative")
+                # the two checks below compare counts by value; an identity
+                # test ("is not") can report equal integers as different
+                if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:
+                    print("seqidx", seq_idx)
+                    print("frame ", f)
+                    print("TP    ", tmptp)
+                    print("FN    ", tmpfn)
+                    print("FP    ", tmpfp)
+                    print("nGT   ", len(g))
+                    print("nAss  ", len(association_matrix))
+                    print("ign GT", ignoredfn)
+                    print("ign TP", nignoredtp)
+                    raise NameError(
+                        "Something went wrong! nGroundtruth is not TP+FN")
+                if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(
+                        t):
+                    print(seq_idx, f, len(t), tmptp, tmpfp)
+                    print(len(association_matrix), association_matrix)
+                    raise NameError(
+                        "Something went wrong! nTracker is not TP+FP")
+
+                # check for id switches or fragmentations
+                for i, tt in enumerate(this_ids[0]):
+                    if tt in last_ids[0]:
+                        idx = last_ids[0].index(tt)
+                        tid = this_ids[1][i]
+                        lid = last_ids[1][idx]
+                        if tid != lid and lid != -1 and tid != -1:
+                            if g[i].truncation < self.max_truncation:
+                                g[i].id_switch = 1
+                                ids += 1
+                        if tid != lid and lid != -1:
+                            if g[i].truncation < self.max_truncation:
+                                g[i].fragmentation = 1
+                                fr += 1
+
+                # save current index
+                last_ids = this_ids
+                # compute MODP_t (defaults to 1 for frames without true positives)
+                MODP_t = 1
+                if tmptp != 0:
+                    MODP_t = tmpc / float(tmptp)
+                self.MODP_t.append(MODP_t)
+
+            # store the association and ignore histories of this sequence
+            self.gt_trajectories[seq_idx] = seq_trajectories
+            self.ign_trajectories[seq_idx] = seq_ignored
+
+            # gather statistics for "per sequence" statistics.
+            self.n_gts.append(n_gts)
+            self.n_trs.append(n_trs)
+            self.tps.append(seqtp)
+            self.itps.append(seqitp)
+            self.fps.append(seqfp)
+            self.fns.append(seqfn)
+            self.ifns.append(seqifn)
+            self.n_igts.append(seqigt)
+            self.n_itrs.append(seqitr)
+
+        # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories
+        n_ignored_tr_total = 0
+        for seq_idx, (
+                seq_trajectories, seq_ignored
+        ) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)):
+            if len(seq_trajectories) == 0:
+                continue
+            tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5
+            n_ignored_tr = 0
+            for g, ign_g in zip(seq_trajectories.values(),
+                                seq_ignored.values()):
+                # all frames of this gt trajectory are ignored
+                if all(ign_g):
+                    n_ignored_tr += 1
+                    n_ignored_tr_total += 1
+                    continue
+                # all frames of this gt trajectory are not assigned to any detections
+                if all([this == -1 for this in g]):
+                    tmpML += 1
+                    self.ML += 1
+                    continue
+                # compute tracked frames in trajectory
+                last_id = g[0]
+                # first detection (necessary to be in gt_trajectories) is always tracked
+                tracked = 1 if g[0] >= 0 else 0
+                lgt = 0 if ign_g[0] else 1
+                for f in range(1, len(g)):
+                    if ign_g[f]:
+                        last_id = -1
+                        continue
+                    lgt += 1
+                    if last_id != g[f] and last_id != -1 and g[f] != -1 and g[
+                            f - 1] != -1:
+                        tmpId_switches += 1
+                        self.id_switches += 1
+                    if f < len(g) - 1 and g[f - 1] != g[
+                            f] and last_id != -1 and g[f] != -1 and g[f +
+                                                                      1] != -1:
+                        tmpFragments += 1
+                        self.fragments += 1
+                    if g[f] != -1:
+                        tracked += 1
+                        last_id = g[f]
+                # handle last frame; tracked state is handled in for loop (g[f]!=-1)
+                # (the len(g) > 1 guard ensures f was set by the loop above)
+                if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[
+                        f] != -1 and not ign_g[f]:
+                    tmpFragments += 1
+                    self.fragments += 1
+
+                # compute MT/PT/ML
+                tracking_ratio = tracked / float(len(g) - sum(ign_g))
+                if tracking_ratio > 0.8:
+                    tmpMT += 1
+                    self.MT += 1
+                elif tracking_ratio < 0.2:
+                    tmpML += 1
+                    self.ML += 1
+                else:  # 0.2 <= tracking_ratio <= 0.8
+                    tmpPT += 1
+                    self.PT += 1
+
+        if (self.n_gt_trajectories - n_ignored_tr_total) == 0:
+            self.MT = 0.
+            self.PT = 0.
+            self.ML = 0.
+        else:
+            self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total)
+            self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total)
+            self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total)
+
+        # precision/recall etc.
+        if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0:
+            self.recall = 0.
+            self.precision = 0.
+        else:
+            self.recall = self.tp / float(self.tp + self.fn)
+            self.precision = self.tp / float(self.fp + self.tp)
+        if (self.recall + self.precision) == 0:
+            self.F1 = 0.
+        else:
+            self.F1 = 2. * (self.precision * self.recall) / (
+                self.precision + self.recall)
+        if sum(self.n_frames) == 0:
+            self.FAR = "n/a"
+        else:
+            self.FAR = self.fp / float(sum(self.n_frames))
+
+        # compute CLEARMOT
+        if self.n_gt == 0:
+            self.MOTA = -float("inf")
+            self.MODA = -float("inf")
+        else:
+            self.MOTA = 1 - (self.fn + self.fp + self.id_switches
+                             ) / float(self.n_gt)
+            self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt)
+        if self.tp == 0:
+            self.MOTP = float("inf")
+        else:
+            self.MOTP = self.total_cost / float(self.tp)
+        if self.n_gt != 0:
+            if self.id_switches == 0:
+                self.MOTAL = 1 - (self.fn + self.fp + self.id_switches
+                                  ) / float(self.n_gt)
+            else:
+                self.MOTAL = 1 - (self.fn + self.fp +
+                                  math.log10(self.id_switches)
+                                  ) / float(self.n_gt)
+        else:
+            self.MOTAL = -float("inf")
+        if sum(self.n_frames) == 0:
+            self.MODP = "n/a"
+        else:
+            self.MODP = sum(self.MODP_t) / float(sum(self.n_frames))
+        return True
+
+    def createSummary(self):
+        """
+            Render the accumulated evaluation statistics as a text table.
+
+            The table starts with an 80 column "=" banner, contains one
+            printEntry() row per statistic (groups are separated by blank
+            lines) and ends with an 80 column "=" rule.
+
+            The per-ground-truth ratios are computed into local variables, so
+            the counters stored on self (fp, fn, id_switches) are not modified
+            and the method returns the same text on every call. When
+            self.n_gt is 0 the ratios are reported as "n/a" instead of raising
+            ZeroDivisionError.
+
+            Returns:
+                str: the formatted summary.
+        """
+        entry = self.printEntry
+
+        # derive the ratios without touching the raw counters on self
+        if self.n_gt == 0:
+            fp_ratio = fn_ratio = id_switch_ratio = "n/a"
+        else:
+            fp_ratio = self.fp / self.n_gt
+            fn_ratio = self.fn / self.n_gt
+            id_switch_ratio = self.id_switches / self.n_gt
+
+        rows = [
+            "tracking evaluation summary".center(80, "="),
+            entry("Multiple Object Tracking Accuracy (MOTA)", self.MOTA),
+            entry("Multiple Object Tracking Precision (MOTP)", self.MOTP),
+            entry("Multiple Object Tracking Accuracy (MOTAL)", self.MOTAL),
+            entry("Multiple Object Detection Accuracy (MODA)", self.MODA),
+            entry("Multiple Object Detection Precision (MODP)", self.MODP),
+            "",
+            entry("Recall", self.recall),
+            entry("Precision", self.precision),
+            entry("F1", self.F1),
+            entry("False Alarm Rate", self.FAR),
+            "",
+            entry("Mostly Tracked", self.MT),
+            entry("Partly Tracked", self.PT),
+            entry("Mostly Lost", self.ML),
+            "",
+            # raw counts
+            entry("True Positives", self.tp),
+            entry("Ignored True Positives", self.itp),
+            entry("False Positives", self.fp),
+            entry("False Negatives", self.fn),
+            entry("ID-switches", self.id_switches),
+            # counts normalised by the number of ground truth objects
+            entry("False Positives Ratio", fp_ratio),
+            entry("False Negatives Ratio", fn_ratio),
+            # NOTE(review): labelled as a ratio, but the value printed is the
+            # raw self.ifn counter; kept to leave the table unchanged
+            entry("Ignored False Negatives Ratio", self.ifn),
+            # NOTE(review): these two rows show the ratios rather than counts;
+            # kept so that the rendered table stays unchanged
+            entry("Missed Targets", fn_ratio),
+            entry("ID-switches", id_switch_ratio),
+            entry("Fragmentations", self.fragments),
+            "",
+            entry("Ground Truth Objects (Total)", self.n_gt + self.n_igt),
+            entry("Ignored Ground Truth Objects", self.n_igt),
+            entry("Ground Truth Trajectories", self.n_gt_trajectories),
+            "",
+            entry("Tracker Objects (Total)", self.n_tr),
+            entry("Ignored Tracker Objects", self.n_itr),
+            entry("Tracker Trajectories", self.n_tr_trajectories),
+            "=" * 80,
+        ]
+        return "\n".join(rows)
+
+    def printEntry(self, key, val, width=(70, 10)):
+        """
+            Format one "key value" row of the summary table.
+
+            key is left-justified to width[0] columns. val is right-aligned in
+            width[1] columns: with "%d" if it is exactly an int, with "%f" if
+            it is exactly a float, and via its string form otherwise.
+        """
+        label = key.ljust(width[0])
+        kind = type(val)
+        # exact type comparison on purpose: subclasses fall through to "%s"
+        if kind == int:
+            return label + "%*d" % (width[1], val)
+        if kind == float:
+            return label + "%*f" % (width[1], val)
+        return label + ("%s" % val).rjust(width[1])
+
+    def saveToStats(self, save_summary):
+        """
+            Create the evaluation summary and optionally write it to disk.
+
+            When save_summary is truthy the summary text is written to
+            "summary_<cls>.txt" inside self.result_path, replacing any
+            existing file.
+
+            Returns:
+                str: the summary text produced by createSummary().
+        """
+        summary = self.createSummary()
+        if save_summary:
+            filename = os.path.join(self.result_path,
+                                    "summary_%s.txt" % self.cls)
+            # the context manager closes the file even if write() fails
+            with open(filename, "w+") as dump:
+                dump.write(summary)
+        return summary
+
+
+class KITTIMOTMetric(Metric):
+    """
+    Metric wrapper that runs the KITTI tracking evaluation.
+
+    Sequences are registered one by one with update(); accumulate() then
+    builds the evaluator, loads tracker results and ground truth, and stores
+    the textual summary, which log() prints and get_results() returns.
+    """
+
+    def __init__(self, save_summary=True):
+        # save_summary is forwarded to the evaluator's saveToStats()
+        self.save_summary = save_summary
+        self.MOTEvaluator = KITTIEvaluation
+        self.result_root = None
+        self.reset()
+
+    def reset(self):
+        """Forget all registered sequences and the last summary."""
+        self.seqs = []
+        self.n_sequences = 0
+        self.n_frames = []
+        self.strsummary = ''
+
+    @staticmethod
+    def _max_frame_index(path, current_max=0):
+        """
+        Return the largest first-column integer found in the file at path,
+        or current_max if that is larger. Blank lines are skipped.
+        """
+        with open(path, "r") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                current_max = max(current_max, int(line.split(" ")[0]))
+        return current_max
+
+    def update(self, data_root, seq, data_type, result_root, result_filename):
+        """
+        Register one sequence for evaluation.
+
+        The ground truth file "<data_root>/../labels/<seq>.txt" and the result
+        file result_filename are scanned for the largest frame index (the
+        first space-separated field of each line); that index plus one is
+        recorded as the frame count of the sequence.
+        """
+        assert data_type == 'kitti', "data_type should 'kitti'"
+        self.result_root = result_root
+        self.gt_path = data_root
+        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
+        max_frame = self._max_frame_index(gt_path)
+        max_frame = self._max_frame_index(result_filename, max_frame)
+        self.n_frames.append(max_frame + 1)
+        self.seqs.append(seq)
+        self.n_sequences += 1
+
+    def accumulate(self):
+        """
+        Run the evaluation over all registered sequences.
+
+        Returns None when the tracker results cannot be loaded or when the
+        evaluation finishes, and False when ground truth and tracker results
+        cover a different number of sequences. On success the summary is
+        stored in self.strsummary.
+
+        Raises:
+            ValueError: if the ground truth cannot be loaded.
+        """
+        logger.info("Processing Result for KITTI Tracking Benchmark")
+        e = self.MOTEvaluator(
+            result_path=self.result_root,
+            gt_path=self.gt_path,
+            n_frames=self.n_frames,
+            seqs=self.seqs,
+            n_sequences=self.n_sequences)
+        # only the loading itself is guarded, so that a failure of the log
+        # calls below can no longer be mistaken for a loading failure
+        try:
+            tracker_loaded = e.loadTracker()
+        except Exception as exc:
+            logger.info("Caught exception while loading result data. %s" % exc)
+            return
+        if not tracker_loaded:
+            return
+        logger.info("Loading Results - Success")
+        # the evaluated class is the evaluator's cls attribute
+        logger.info("Evaluate Object Class: %s" % e.cls.upper())
+        if not e.loadGroundtruth():
+            raise ValueError("Ground truth not found.")
+        logger.info("Loading Groundtruth - Success")
+        # sanity checks (lengths are compared by value, not by identity)
+        if len(e.groundtruth) != len(e.tracker):
+            logger.info(
+                "The uploaded data does not provide results for every sequence.")
+            return False
+        logger.info("Loaded %d Sequences." % len(e.groundtruth))
+        logger.info("Start Evaluation...")
+
+        if e.compute3rdPartyMetrics():
+            self.strsummary = e.saveToStats(self.save_summary)
+        else:
+            logger.info(
+                "There seem to be no true positives or false positives at all in the submitted data."
+            )
+
+    def log(self):
+        """Print the summary produced by the last accumulate() call."""
+        print(self.strsummary)
+
+    def get_results(self):
+        """Return the summary produced by the last accumulate() call."""
+        return self.strsummary
diff --git a/rtdetr_paddle/ppdet/metrics/munkres.py b/rtdetr_paddle/ppdet/metrics/munkres.py
new file mode 100644
index 0000000..fbd4a92
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/munkres.py
@@ -0,0 +1,428 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+"""
+This code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py
+"""
+
+import sys
+
+__all__ = ['Munkres', 'make_cost_matrix']
+
+
+class Munkres:
+    """
+    Calculate the Munkres solution to the classical assignment problem.
+    See the module documentation for usage.
+    """
+
+    def __init__(self):
+        """Create a new instance with empty solver state."""
+        self.C = None  # working cost matrix; assigned by compute()
+        self.row_covered = []  # one cover flag per row
+        self.col_covered = []  # one cover flag per column
+        self.n = 0  # side length of the padded square matrix
+        self.Z0_r = 0  # row of the primed zero recorded by step 4
+        self.Z0_c = 0  # column of the primed zero recorded by step 4
+        self.marked = None  # mask matrix: 1 = starred zero, 2 = primed zero
+        self.path = None  # scratch rows holding (row, col) pairs for step 5
+
+    def make_cost_matrix(profit_matrix, inversion_function):
+        """
+        **DEPRECATED**
+
+        Please use the module function ``make_cost_matrix()``.
+        """
+        # Inside this body the bare name resolves to the module-level function.
+        return make_cost_matrix(profit_matrix, inversion_function)
+
+    make_cost_matrix = staticmethod(make_cost_matrix)
+
+    def pad_matrix(self, matrix, pad_value=0):
+        """
+        Pad a possibly non-square matrix to make it square.
+
+        :Parameters:
+            matrix : list of lists
+                matrix to pad
+
+            pad_value : int
+                value used for every cell that padding adds
+
+        :rtype: list of lists
+        :return: a new, possibly padded, matrix
+        """
+        max_columns = 0
+        total_rows = len(matrix)
+
+        for row in matrix:
+            max_columns = max(max_columns, len(row))
+
+        total_rows = max(max_columns, total_rows)
+
+        new_matrix = []
+        for row in matrix:
+            row_len = len(row)
+            new_row = row[:]
+            if total_rows > row_len:
+                # Row too short. Pad it with the requested value.
+                new_row += [pad_value] * (total_rows - row_len)
+            new_matrix += [new_row]
+
+        while len(new_matrix) < total_rows:
+            new_matrix += [[pad_value] * total_rows]
+
+        return new_matrix
+
+    def compute(self, cost_matrix):
+        """
+        Compute the indexes for the lowest-cost pairings between rows and
+        columns in the database. Returns a list of (row, column) tuples
+        that can be used to traverse the matrix.
+
+        :Parameters:
+            cost_matrix : list of lists
+                The cost matrix. If this cost matrix is not square, it
+                will be padded with zeros, via a call to ``pad_matrix()``.
+                (This method does *not* modify the caller's matrix. It
+                operates on a copy of the matrix.) An empty matrix
+                yields an empty result.
+
+                **WARNING**: This code handles square and rectangular
+                matrices. It does *not* handle irregular matrices.
+
+        :rtype: list
+        :return: A list of ``(row, column)`` tuples that describe the lowest
+                 cost path through the matrix
+        """
+        self.C = self.pad_matrix(cost_matrix)
+        self.n = len(self.C)
+        self.original_length = len(cost_matrix)
+        # Guard the width lookup so an empty matrix does not raise IndexError.
+        self.original_width = len(cost_matrix[0]) if cost_matrix else 0
+        self.row_covered = [False for i in range(self.n)]
+        self.col_covered = [False for i in range(self.n)]
+        self.Z0_r = 0
+        self.Z0_c = 0
+        self.path = self.__make_matrix(self.n * 2, 0)
+        self.marked = self.__make_matrix(self.n, 0)
+
+        steps = {
+            1: self.__step1,
+            2: self.__step2,
+            3: self.__step3,
+            4: self.__step4,
+            5: self.__step5,
+            6: self.__step6
+        }
+
+        # Run the state machine until a step returns a number that is not in
+        # the table (step 3 returns 7 when done). Checking membership rather
+        # than catching KeyError keeps a KeyError raised inside a step from
+        # being mistaken for normal termination.
+        step = 1
+        while step in steps:
+            step = steps[step]()
+
+        # Look for the starred columns
+        # (restricted to the caller's original, unpadded shape)
+        results = []
+        for i in range(self.original_length):
+            for j in range(self.original_width):
+                if self.marked[i][j] == 1:
+                    results += [(i, j)]
+
+        return results
+
+    def __copy_matrix(self, matrix):  # return an exact (deep) copy of `matrix`
+        import copy  # not imported at module level, so import it locally
+        return copy.deepcopy(matrix)
+
+    def __make_matrix(self, n, val):
+        """Build a fresh *n* x *n* list of lists with every cell set to val."""
+        # Each row is a distinct list, so rows never alias one another.
+        return [
+            [val] * n for _ in range(n)
+        ]
+
+    def __step1(self):
+        """
+        Row reduction: subtract the minimum of every row from each entry
+        of that row (in place), then continue with Step 2.
+        """
+        size = self.n
+        for r in range(size):
+            current = self.C[r]
+            # The smallest entry of the row becomes zero after the
+            # subtraction below.
+            lowest = min(current)
+            for c in range(size):
+                current[c] -= lowest
+
+        return 2
+
+    def __step2(self):
+        """
+        Greedy initial starring: scan the matrix in row-major order and
+        star every zero whose row and column hold no star yet. The cover
+        flags serve as scratch markers and are reset before Step 3.
+        """
+        size = self.n
+        for r in range(size):
+            for c in range(size):
+                taken = self.row_covered[r] or self.col_covered[c]
+                if taken or self.C[r][c] != 0:
+                    continue
+                self.marked[r][c] = 1
+                self.col_covered[c] = True
+                self.row_covered[r] = True
+
+        self.__clear_covers()
+        return 3
+
+    def __step3(self):
+        """
+        Cover every column that holds a starred zero and count the stars.
+        When at least n stars exist the assignment is complete and 7
+        (DONE) is returned; otherwise Step 4 is next.
+        """
+        size = self.n
+        starred = 0
+        for r in range(size):
+            for c in range(size):
+                if self.marked[r][c] != 1:
+                    continue
+                self.col_covered[c] = True
+                starred += 1
+
+        # 7 means "done": it is outside the step numbers 1..6.
+        if starred >= size:
+            return 7
+
+        return 4
+
+    def __step4(self):
+        """
+        Find a noncovered zero and prime it. If there is no starred zero
+        in the row containing this primed zero, Go to Step 5. Otherwise,
+        cover this row and uncover the column containing the starred
+        zero. Continue in this manner until there are no uncovered zeros
+        left, then Go to Step 6 (which looks up the smallest uncovered value).
+        """
+        step = 0
+        done = False
+        row = -1
+        col = -1
+        star_col = -1
+        while not done:
+            (row, col) = self.__find_a_zero()
+            if row < 0:  # no uncovered zero is left
+                done = True
+                step = 6
+            else:
+                self.marked[row][col] = 2  # 2 marks a primed zero
+                star_col = self.__find_star_in_row(row)
+                if star_col >= 0:
+                    col = star_col
+                    self.row_covered[row] = True
+                    self.col_covered[col] = False
+                else:
+                    done = True
+                    self.Z0_r = row  # remembered as the start cell of step 5
+                    self.Z0_c = col
+                    step = 5
+
+        return step
+
+    def __step5(self):
+        """
+        Construct a series of alternating primed and starred zeros as
+        follows. Let Z0 represent the uncovered primed zero found in Step 4.
+        Let Z1 denote the starred zero in the column of Z0 (if any).
+        Let Z2 denote the primed zero in the row of Z1 (there will always
+        be one). Continue until the series terminates at a primed zero
+        that has no starred zero in its column. Unstar each starred zero
+        of the series, star each primed zero of the series, erase all
+        primes and uncover every line in the matrix. Return to Step 3
+        """
+        count = 0  # index of the last cell written into `path`
+        path = self.path
+        path[count][0] = self.Z0_r  # the series starts at Z0 from step 4
+        path[count][1] = self.Z0_c
+        done = False
+        while not done:
+            row = self.__find_star_in_col(path[count][1])
+            if row >= 0:
+                count += 1
+                path[count][0] = row
+                path[count][1] = path[count - 1][1]  # same column as before
+            else:
+                done = True  # no star in this column: the series ends here
+
+            if not done:
+                col = self.__find_prime_in_row(path[count][0])
+                count += 1
+                path[count][0] = path[count - 1][0]  # same row as the star
+                path[count][1] = col
+
+        self.__convert_path(path, count)
+        self.__clear_covers()
+        self.__erase_primes()
+        return 3
+
+    def __step6(self):
+        """
+        Add the smallest uncovered value to every entry of each covered
+        row and subtract it from every entry of each uncovered column.
+        Stars, primes and covers are untouched; continue with Step 4.
+        """
+        delta = self.__find_smallest()
+        for r in range(self.n):
+            row_is_covered = self.row_covered[r]
+            for c in range(self.n):
+                if row_is_covered:
+                    self.C[r][c] += delta
+                if not self.col_covered[c]:
+                    self.C[r][c] -= delta
+        return 4
+
+    def __find_smallest(self):
+        """Return the smallest value on no covered row/column (at most 2e9)."""
+        smallest = 2e9  # sys.maxint
+        for r in range(self.n):
+            for c in range(self.n):
+                covered = self.row_covered[r] or self.col_covered[c]
+                if not covered and self.C[r][c] < smallest:
+                    smallest = self.C[r][c]
+        return smallest
+
+    def __find_a_zero(self):
+        """Find an uncovered 0: the last one in the first row that has any."""
+        row = -1
+        col = -1
+        i = 0
+        n = self.n
+        done = False
+
+        while not done:
+            j = 0
+            while True:
+                if (self.C[i][j] == 0) and \
+                   (not self.row_covered[i]) and \
+                   (not self.col_covered[j]):
+                    row = i
+                    col = j
+                    done = True  # the rest of this row is still scanned
+                j += 1
+                if j >= n:
+                    break
+            i += 1
+            if i >= n:
+                done = True
+
+        return (row, col)  # (-1, -1) when no uncovered zero exists
+
+    def __find_star_in_row(self, row):
+        """
+        Return the column index of the first starred zero (mark 1)
+        found in ``row``. The scan runs left to right; -1 is returned
+        when the row holds no star.
+        """
+        for idx in range(self.n):
+            if self.marked[row][idx] == 1:
+                return idx
+
+        # Every column was checked without finding a star.
+        return -1
+
+    def __find_star_in_col(self, col):
+        """
+        Return the row index of the first starred zero (mark 1) found
+        in column ``col``. The scan runs top to bottom; -1 is returned
+        when the column holds no star.
+        """
+        for idx in range(self.n):
+            if self.marked[idx][col] == 1:
+                return idx
+
+        # Every row was checked without finding a star.
+        return -1
+
+    def __find_prime_in_row(self, row):
+        """
+        Return the column index of the first primed zero (mark 2)
+        found in ``row``. The scan runs left to right; -1 is returned
+        when the row holds no prime.
+        """
+        for idx in range(self.n):
+            if self.marked[row][idx] == 2:
+                return idx
+
+        # Every column was checked without finding a prime.
+        return -1
+
+    def __convert_path(self, path, count):
+        """Flip the first ``count + 1`` path cells: star (1) -> 0, else -> 1."""
+        for k in range(count + 1):
+            r, c = path[k][0], path[k][1]
+            was_starred = self.marked[r][c] == 1
+            self.marked[r][c] = 0 if was_starred else 1
+
+    def __clear_covers(self):
+        """Reset the cover flag of every row and every column to False."""
+        for idx in range(self.n):
+            self.row_covered[idx] = False
+            self.col_covered[idx] = False
+
+    def __erase_primes(self):
+        """Reset every primed cell (mark 2) of the mask matrix back to 0."""
+        for marks in self.marked[:self.n]:
+            for c in range(self.n):
+                if marks[c] == 2:
+                    marks[c] = 0
+
+
+def make_cost_matrix(profit_matrix, inversion_function):
+    """
+    Create a cost matrix from a profit matrix by calling
+    'inversion_function' to invert each value. The inversion
+    function must take one numeric argument (of any type) and return
+    another numeric argument which is presumed to be the cost inverse
+    of the original profit.
+
+    Call it like this:
+
+    .. python::
+
+        cost_matrix = make_cost_matrix(matrix, inversion_func)
+
+    For example:
+
+    .. python::
+
+        cost_matrix = make_cost_matrix(matrix, lambda x : sys.maxsize - x)
+
+    :Parameters:
+        profit_matrix : list of lists
+            The matrix to convert from a profit to a cost matrix
+
+        inversion_function : function
+            The function to use to invert each entry in the profit matrix
+
+    :rtype: list of lists
+    :return: The converted matrix
+    """
+    return [
+        [inversion_function(profit) for profit in profit_row]
+        for profit_row in profit_matrix
+    ]
diff --git a/rtdetr_paddle/ppdet/metrics/pose3d_metrics.py b/rtdetr_paddle/ppdet/metrics/pose3d_metrics.py
new file mode 100644
index 0000000..ea21de9
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/pose3d_metrics.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+from paddle.distributed import ParallelEnv
+import os
+import json
+from collections import defaultdict, OrderedDict
+import numpy as np
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['Pose3DEval']
+
+
+class AverageMeter(object):  # running weighted average of a scalar metric
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0  # most recent value passed to update()
+        self.avg = 0  # running mean: sum / count
+        self.sum = 0  # weighted sum of all values seen so far
+        self.count = 0  # total weight (number of samples) seen so far
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count if self.count else 0  # no 0-division
+
+
+def mean_per_joint_position_error(pred, gt, has_3d_joints):
+    """
+    Per-sample mPJPE of pelvis-centred joints; pelvis = mean of joints 2, 3.
+    """
+    valid = has_3d_joints == 1
+    target = gt[valid][:, :, :3]
+    estimate = pred[valid]
+
+    with paddle.no_grad():
+        target = target - ((target[:, 2, :] + target[:, 3, :]) / 2)[:, None, :]
+        estimate = estimate - (
+            (estimate[:, 2, :] + estimate[:, 3, :]) / 2)[:, None, :]
+        distance = paddle.sqrt(((estimate - target)**2).sum(axis=-1))
+        per_sample = distance.mean(axis=-1).numpy()
+        return per_sample
+
+
+def compute_similarity_transform(S1, S2):
+    """Computes a similarity transform (sR, t) that takes
+    a set of 3D points S1 (3 x N) closest to a set of 3D points S2,
+    where R is an 3x3 rotation matrix, t 3x1 translation, s scale.
+    i.e. solves the orthogonal Procrustes problem.
+    """
+    # Work on (dim x N) arrays; remember whether the inputs were (N x dim).
+    needs_transpose = S1.shape[0] not in (2, 3)
+    if needs_transpose:
+        S1 = S1.T
+        S2 = S2.T
+    assert (S2.shape[1] == S1.shape[1])
+
+    # 1. Centre both point sets on their means.
+    mean_src = S1.mean(axis=1, keepdims=True)
+    mean_dst = S2.mean(axis=1, keepdims=True)
+    src = S1 - mean_src
+    dst = S2 - mean_dst
+
+    # 2. Total squared spread of the centred source; used for the scale.
+    spread = np.sum(src**2)
+
+    # 3. Cross-covariance (outer product) of the two centred sets.
+    cross = src.dot(dst.T)
+
+    # 4. The rotation maximising trace(R'K) is built from the singular
+    # vectors of the cross-covariance.
+    U, _, Vh = np.linalg.svd(cross)
+    V = Vh.T
+    # Flip the last axis when needed so that det(R) = 1 (no reflection).
+    fix = np.eye(U.shape[0])
+    fix[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
+    # Assemble the rotation.
+    rot = V.dot(fix.dot(U.T))
+
+    # 5. Scale.
+    scale = np.trace(rot.dot(cross)) / spread
+
+    # 6. Translation.
+    shift = mean_dst - scale * (rot.dot(mean_src))
+
+    # 7. Apply the transform to the (uncentred) source points.
+    aligned = scale * rot.dot(S1) + shift
+
+    if needs_transpose:
+        aligned = aligned.T
+
+    return aligned
+
+
+def compute_similarity_transform_batch(S1, S2):
+    """Apply compute_similarity_transform to each (S1[i], S2[i]) pair."""
+    aligned = np.zeros_like(S1)
+    for idx in range(len(S1)):
+        aligned[idx] = compute_similarity_transform(S1[idx], S2[idx])
+    return aligned
+
+
+def reconstruction_error(S1, S2, reduction='mean'):
+    """Procrustes-align S1 to S2, then return the mean per-joint distance."""
+    aligned = compute_similarity_transform_batch(S1, S2)
+    per_sample = np.sqrt(((aligned - S2)**2).sum(axis=-1)).mean(axis=-1)
+    if reduction == 'mean':
+        return per_sample.mean()
+    if reduction == 'sum':
+        return per_sample.sum()
+    return per_sample
+
+
+def all_gather(data):
+    # With a world size of 1 the input is returned untouched.
+    if paddle.distributed.get_world_size() != 1:
+        gathered = []
+        paddle.distributed.all_gather(gathered, data)
+        return paddle.concat(gathered, 0)
+    return data
+
+
+class Pose3DEval(object):  # accumulates mPJPE / PA-mPJPE over update() calls
+    def __init__(self, output_eval, save_prediction_only=False):
+        super(Pose3DEval, self).__init__()
+        self.output_eval = output_eval
+        self.res_file = os.path.join(output_eval, "pose3d_results.json")
+        self.save_prediction_only = save_prediction_only
+        self.reset()
+
+    def reset(self):
+        self.PAmPJPE = AverageMeter()  # Procrustes-aligned error meter
+        self.mPJPE = AverageMeter()  # pelvis-centred error meter
+        self.eval_results = {}
+
+    def get_human36m_joints(self, input):
+        J24_TO_J14 = paddle.to_tensor(
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
+        J24_TO_J17 = paddle.to_tensor(  # NOTE(review): built but never used
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19])
+        return paddle.index_select(input, J24_TO_J14, axis=1)
+
+    def update(self, inputs, outputs):
+        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
+                                                           .local_rank))
+        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
+                                                                .local_rank))
+        pred_3d_joints = all_gather(outputs['pose3d'])
+        if gt_3d_joints.shape[1] == 24:  # reduce 24-joint input to 14 joints
+            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
+        if pred_3d_joints.shape[1] == 24:
+            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
+        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
+                                                  has_3d_joints).mean()
+        # NOTE(review): unlike mPJPE, this is not filtered by has_3d_joints.
+        PAmPJPE_val = reconstruction_error(
+            pred_3d_joints.numpy(),
+            gt_3d_joints[:, :, :3].numpy(),
+            reduction=None).mean()
+        count = int(np.sum(has_3d_joints.numpy()))
+        self.PAmPJPE.update(PAmPJPE_val * 1000., count)  # scaled by 1000
+        self.mPJPE.update(mPJPE_val * 1000., count)
+
+    def accumulate(self):
+        if self.save_prediction_only:
+            logger.info(f'The pose3d result is saved to {self.res_file} '
+                        'and do not evaluate the model.')
+            return
+        # Stored negated; log() prints the absolute values.
+        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]
+
+    def log(self):
+        if self.save_prediction_only:
+            return
+        stats_names = ['mPJPE', 'PAmPJPE']
+        num_values = len(stats_names)
+        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
+        print('|---' * (num_values + 1) + '|')
+
+        print(' '.join([
+            '| {:.3f}'.format(abs(value))
+            for value in self.eval_results['pose3d']
+        ]) + ' |')
+
+    def get_results(self):
+        return self.eval_results
diff --git a/rtdetr_paddle/ppdet/metrics/widerface_utils.py b/rtdetr_paddle/ppdet/metrics/widerface_utils.py
new file mode 100644
index 0000000..2f64bf6
--- /dev/null
+++ b/rtdetr_paddle/ppdet/metrics/widerface_utils.py
@@ -0,0 +1,391 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import cv2
+import numpy as np
+from collections import OrderedDict
+
+import paddle
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['face_eval_run', 'lmk2out']
+
+
+def face_eval_run(model,
+                  image_dir,
+                  gt_file,
+                  pred_dir='output/pred',
+                  eval_mode='widerface',
+                  multi_scale=False):
+    """Detect faces on every image listed in gt_file and save the results."""
+    with open(gt_file, 'r') as f:
+        gt_lines = f.readlines()
+    imid2path = []
+    pos_gt = 0
+    while pos_gt < len(gt_lines):
+        name_gt = gt_lines[pos_gt].strip('\n\t').split()[0]
+        imid2path.append(name_gt)
+        pos_gt += 1
+        n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0])
+        pos_gt += 1 + n_gt  # skip the count line and its n_gt box lines
+    logger.info('The ground truth file load {} images'.format(len(imid2path)))
+
+    dets_dist = OrderedDict()
+    for iter_id, im_path in enumerate(imid2path):
+        image_path = os.path.join(image_dir, im_path)
+        if eval_mode == 'fddb':
+            image_path += '.jpg'
+        assert os.path.exists(image_path), image_path
+        image = cv2.imread(image_path)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        if multi_scale:
+            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
+            det0 = detect_face(model, image, shrink)
+            det1 = flip_test(model, image, shrink)
+            [det2, det3] = multi_scale_test(model, image, max_shrink)
+            det4 = multi_scale_test_pyramid(model, image, max_shrink)
+            det = np.vstack((det0, det1, det2, det3, det4))
+            dets = bbox_vote(det)
+        else:
+            dets = detect_face(model, image, 1)
+        if eval_mode == 'widerface':
+            save_widerface_bboxes(image_path, dets, pred_dir)
+        else:
+            dets_dist[im_path] = dets
+        if iter_id % 100 == 0:
+            logger.info('Test iter {}'.format(iter_id))
+    if eval_mode == 'fddb':
+        save_fddb_bboxes(dets_dist, pred_dir)
+    logger.info("Finish evaluation.")
+
+
+def detect_face(model, image, shrink):
+    """Run model on image scaled by shrink; rows: x1, y1, x2, y2, score."""
+    height, width = image.shape[0], image.shape[1]
+    if shrink != 1:
+        height, width = int(height * shrink), int(width * shrink)
+        image = cv2.resize(image, (width, height))
+
+    blob = face_img_process(image)
+    shape_arr = np.asarray([[height, width]])
+    scale_arr = np.asarray([[shrink, shrink]])
+    feed = {
+        "image": paddle.to_tensor(
+            blob, dtype='float32'),
+        "im_shape": paddle.to_tensor(
+            shape_arr, dtype='float32'),
+        "scale_factor": paddle.to_tensor(
+            scale_arr, dtype='float32')
+    }
+    model.eval()
+    raw = model(feed)['bbox'].numpy()
+
+    # A single-element output is handled as "nothing detected".
+    if np.prod(raw.shape) == 1:
+        logger.info("No face detected")
+        return np.array([[0, 0, 0, 0, 0]])
+
+    # Columns 2..5 of the model output are used as xmin, ymin, xmax, ymax
+    # and column 1 as the score.
+    # layout: xmin, ymin, xmax, ymax, score
+    score = raw[:, 1]
+    corners = (raw[:, 2], raw[:, 3], raw[:, 4], raw[:, 5])
+    det = np.column_stack(corners + (score, ))
+    return det
+
+
+def flip_test(model, image, shrink):
+    """Detect on the horizontally flipped image and map the boxes back."""
+    mirrored = cv2.flip(image, 1)
+    flipped_dets = detect_face(model, mirrored, shrink)
+    width = image.shape[1]
+    restored = np.zeros(flipped_dets.shape)
+    # x-coordinates are mirrored and swapped; y and score copy over.
+    restored[:, 0] = width - flipped_dets[:, 2]
+    restored[:, 2] = width - flipped_dets[:, 0]
+    restored[:, [1, 3, 4]] = flipped_dets[:, [1, 3, 4]]
+    return restored
+
+
+def multi_scale_test(model, image, max_shrink):
+    """Detect at a shrunken and at enlarged scales; return (det_s, det_b)."""
+    st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink  # big faces only
+    det_s = detect_face(model, image, st)
+    index = np.where(
+        np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1)
+        > 30)[0]
+    det_s = det_s[index, :]
+    # Enlarge one times
+    bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
+    det_b = detect_face(model, image, bt)
+
+    # Enlarge small image x times for small faces
+    if max_shrink > 2:
+        bt *= 2
+        while bt < max_shrink:
+            det_b = np.vstack((det_b, detect_face(model, image, bt)))
+            bt *= 2
+        det_b = np.vstack((det_b, detect_face(model, image, max_shrink)))
+
+    # Enlarged images are only used to detect small faces.
+    if bt > 1:
+        index = np.where(
+            np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
+                       det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
+        det_b = det_b[index, :]
+    # Shrunken images are only used to detect big faces.
+    else:
+        index = np.where(
+            np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
+                       det_b[:, 3] - det_b[:, 1] + 1) > 30)[0]
+        det_b = det_b[index, :]
+    return det_s, det_b
+
+
+def multi_scale_test_pyramid(model, image, max_shrink):
+    """Detect over an image pyramid (0.25x plus scales up to max_shrink)."""
+    det_b = detect_face(model, image, 0.25)
+    index = np.where(
+        np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
+        > 30)[0]
+    det_b = det_b[index, :]
+
+    st = [0.75, 1.25, 1.5, 1.75]
+    for i in range(len(st)):
+        if st[i] <= max_shrink:
+            det_temp = detect_face(model, image, st[i])
+            # Enlarged images are only used to detect small faces.
+            if st[i] > 1:
+                index = np.where(
+                    np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
+                               det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
+                det_temp = det_temp[index, :]
+            # Shrunken images are only used to detect big faces.
+            else:
+                index = np.where(
+                    np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
+                               det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
+                det_temp = det_temp[index, :]
+            det_b = np.vstack((det_b, det_temp))
+    return det_b
+
+
+def to_chw(image):
+    """
+    Reorder a 3-D HWC array to CHW; any other rank is returned unchanged.
+    Args:
+        image (np.array): an image with HWC layout.
+    """
+    # Axes (H, W, C) -> (C, H, W); like swapaxes, this returns a view.
+    if image.ndim != 3:
+        return image
+    chw = np.transpose(image, (2, 0, 1))
+    return chw
+
+
+def face_img_process(image,
+                     mean=[104., 117., 123.],
+                     std=[127.502231, 127.502231, 127.502231]):
+    # Per-channel normalisation of a CHW float32 copy, plus a batch axis.
+    chw = to_chw(np.array(image)).astype('float32')
+    offset = np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
+    divisor = np.array(std)[:, np.newaxis, np.newaxis].astype('float32')
+    chw -= offset
+    chw /= divisor
+    batch = np.array([chw])
+    return batch
+
+
+def get_shrink(height, width):
+    """
+    Args:
+        height (int): image height.
+        width (int): image width.
+    """
+    # avoid out of memory: two caps on the enlargement factor
+    cap_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
+    cap_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
+
+    def get_round(x, loc):
+        # Truncate (not round) the decimal text of x to `loc` places.
+        text = str(x)
+        if '.' not in text:
+            return None
+        whole, frac = text.split('.')
+        if len(frac) < 3:
+            return x
+        return float(whole + '.' + frac[0:loc])
+
+    max_shrink = get_round(min(cap_v1, cap_v2), 2) - 0.3
+    if 1.5 <= max_shrink < 2:
+        max_shrink -= 0.1
+    elif 2 <= max_shrink < 3:
+        max_shrink -= 0.2
+    elif 3 <= max_shrink < 4:
+        max_shrink -= 0.3
+    elif 4 <= max_shrink < 5:
+        max_shrink -= 0.4
+    elif max_shrink >= 5:
+        max_shrink -= 0.5
+    elif max_shrink <= 0.1:
+        max_shrink = 0.1
+
+    # The first value never exceeds 1; the second may.
+    shrink = max_shrink if max_shrink < 1 else 1
+    return shrink, max_shrink
+
+
+def bbox_vote(det):
+    """Merge overlapping boxes (IoU >= 0.3) into score-weighted averages.
+
+    `det` rows are xmin, ymin, xmax, ymax, score. At most 750 rows with a
+    score >= 0.01 are returned.
+    """
+    order = det[:, 4].ravel().argsort()[::-1]
+    det = det[order, :]
+    kept = []  # accumulated output rows; stacked once at the end
+    if det.shape[0] == 0:
+        kept.append(np.array([[10, 10, 20, 20, 0.002]]))
+    while det.shape[0] > 0:
+        # IOU
+        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
+        xx1 = np.maximum(det[0, 0], det[:, 0])
+        yy1 = np.maximum(det[0, 1], det[:, 1])
+        xx2 = np.minimum(det[0, 2], det[:, 2])
+        yy2 = np.minimum(det[0, 3], det[:, 3])
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        o = inter / (area[0] + area[:] - inter)
+
+        # nms
+        merge_index = np.where(o >= 0.3)[0]
+        det_accu = det[merge_index, :]
+        det = np.delete(det, merge_index, 0)
+        if merge_index.shape[0] <= 1:
+            # A lone box is kept only when it is the last one remaining.
+            if det.shape[0] == 0:
+                kept.append(det_accu)
+            continue
+        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
+        max_score = np.max(det_accu[:, 4])
+        det_accu_sum = np.zeros((1, 5))
+        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
+                                      axis=0) / np.sum(det_accu[:, -1:])
+        det_accu_sum[:, 4] = max_score
+        kept.append(det_accu_sum)
+    dets = np.vstack(kept)[0:750, :]
+    keep_index = np.where(dets[:, 4] >= 0.01)[0]
+    dets = dets[keep_index, :]
+    return dets
+
+
+def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
+    """Write name, box count and `x y w h score` rows for one image."""
+    image_name = image_path.split('/')[-1]
+    image_class = image_path.split('/')[-2]
+    odir = os.path.join(output_dir, image_class)
+    os.makedirs(odir, exist_ok=True)  # no check-then-create race
+
+    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
+    # The context manager closes the file even if a write fails.
+    with open(ofname, 'w') as f:
+        f.write('{:s}\n'.format(image_class + '/' + image_name))
+        f.write('{:d}\n'.format(bboxes_scores.shape[0]))
+        for box_score in bboxes_scores:
+            xmin, ymin, xmax, ymax, score = box_score
+            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
+                xmax - xmin + 1), (ymax - ymin + 1), score))
+    logger.info("The predicted result is saved as {}".format(ofname))
+
+
+def save_fddb_bboxes(bboxes_scores,
+                     output_dir,
+                     output_fname='pred_fddb_res.txt'):
+    """Write all detections to one text file and return that file's path."""
+    os.makedirs(output_dir, exist_ok=True)
+    predict_file = os.path.join(output_dir, output_fname)
+    with open(predict_file, 'w') as f:  # closed (and flushed) on exit
+        for image_path, dets in bboxes_scores.items():  # py3: no iteritems()
+            f.write('{:s}\n'.format(image_path))
+            f.write('{:d}\n'.format(dets.shape[0]))
+            for box_score in dets:
+                xmin, ymin, xmax, ymax, score = box_score
+                width, height = xmax - xmin, ymax - ymin
+                f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
+                        .format(xmin, ymin, width, height, score))
+    logger.info("The predicted result is saved as {}".format(predict_file))
+    return predict_file
+
+
+def lmk2out(results, is_bbox_normalized=False):
+    """
+    Args:
+        results: a list of dicts, each should include: `bbox`, `im_id`,
+                 `face_index`, `prior_boxes`, `landmark` and `im_shape`.
+        is_bbox_normalized: whether or not landmark is normalized.
+    """
+    xywh_res = []
+    for t in results:
+        bboxes = t['bbox'][0]
+        lengths = t['bbox'][1][0]
+        im_ids = np.array(t['im_id'][0]).flatten()
+        if bboxes is None or bboxes.shape == (1, 1):  # None checked first
+            continue
+        face_index = t['face_index'][0]
+        prior_box = t['prior_boxes'][0]
+        predict_lmk = t['landmark'][0]
+        prior = np.reshape(prior_box, (-1, 4))
+        predictlmk = np.reshape(predict_lmk, (-1, 10))
+
+        k = 0
+        for a in range(len(lengths)):
+            num = lengths[a]
+            im_id = int(im_ids[a])
+            for i in range(num):
+                score = bboxes[k][1]
+                theindex = face_index[i][0]
+                me_prior = prior[theindex, :]
+                lmk_pred = predictlmk[theindex, :]
+                prior_w = me_prior[2] - me_prior[0]
+                prior_h = me_prior[3] - me_prior[1]
+                prior_w_center = (me_prior[2] + me_prior[0]) / 2
+                prior_h_center = (me_prior[3] + me_prior[1]) / 2
+                lmk_decode = np.zeros((10))
+                for j in [0, 2, 4, 6, 8]:
+                    lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center
+                for j in [1, 3, 5, 7, 9]:
+                    lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center
+                im_shape = t['im_shape'][0][a].tolist()
+                image_h, image_w = int(im_shape[0]), int(im_shape[1])
+                if is_bbox_normalized:
+                    lmk_decode = lmk_decode * np.array([
+                        image_w, image_h, image_w, image_h, image_w, image_h,
+                        image_w, image_h, image_w, image_h
+                    ])
+                lmk_res = {
+                    'image_id': im_id,
+                    'landmark': lmk_decode,
+                    'score': score,
+                }
+                xywh_res.append(lmk_res)
+                k += 1
+    return xywh_res
diff --git a/rtdetr_paddle/ppdet/modeling/__init__.py b/rtdetr_paddle/ppdet/modeling/__init__.py
new file mode 100644
index 0000000..9c29c8c
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import warnings
+warnings.filterwarnings(
+    action='ignore', category=DeprecationWarning, module='ops')
+
+
+from .ops import *
+from .backbones import *
+from .heads import *
+from .losses import *
+from .architectures import *
+from .post_process import *
+from .layers import *
+from .transformers import *
diff --git a/rtdetr_paddle/ppdet/modeling/architectures/__init__.py b/rtdetr_paddle/ppdet/modeling/architectures/__init__.py
new file mode 100644
index 0000000..318b760
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/architectures/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from .meta_arch import *
+from .detr import *
diff --git a/rtdetr_paddle/ppdet/modeling/architectures/detr.py b/rtdetr_paddle/ppdet/modeling/architectures/detr.py
new file mode 100644
index 0000000..7839a12
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/architectures/detr.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from .meta_arch import BaseArch
+from ppdet.core.workspace import register, create
+
+__all__ = ['DETR']
+# Deformable DETR, DINO use the same architecture as DETR
+
+
+@register
+class DETR(BaseArch):
+    """Detector wiring: backbone -> optional neck -> transformer -> detr_head.
+
+    ``from_config`` builds the sub-modules; ``post_process`` is listed in
+    ``__inject__``. ``with_mask`` adds a 'mask' entry to the predictions and
+    ``exclude_post_process`` returns the head outputs without post-processing.
+    """
+    __category__ = 'architecture'
+    __inject__ = ['post_process']
+    __shared__ = ['with_mask', 'exclude_post_process']
+
+    def __init__(self, backbone, transformer='DETRTransformer',
+                 detr_head='DETRHead', neck=None,
+                 post_process='DETRPostProcess', with_mask=False,
+                 exclude_post_process=False):
+        super(DETR, self).__init__()
+        self.backbone = backbone
+        self.transformer = transformer
+        self.detr_head = detr_head
+        self.neck = neck
+        self.post_process = post_process
+        self.with_mask = with_mask
+        self.exclude_post_process = exclude_post_process
+
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        """Create backbone, neck, transformer and head from ``cfg``.
+
+        The transformer receives the neck's output shape when a neck was
+        created and the backbone's otherwise; the head always receives the
+        backbone's output shape. Returns a dict of the created modules.
+        """
+        backbone = create(cfg['backbone'])
+        shape_kwargs = {'input_shape': backbone.out_shape}
+        neck = None
+        if cfg['neck']:
+            neck = create(cfg['neck'], **shape_kwargs)
+        if neck is not None:
+            shape_kwargs = {'input_shape': neck.out_shape}
+        transformer = create(cfg['transformer'], **shape_kwargs)
+        # The head is sized from the transformer's hidden_dim and nhead.
+        detr_head = create(
+            cfg['detr_head'],
+            hidden_dim=transformer.hidden_dim,
+            nhead=transformer.nhead,
+            input_shape=backbone.out_shape)
+        return dict(
+            backbone=backbone,
+            transformer=transformer,
+            detr_head=detr_head,
+            neck=neck)
+
+    def _forward(self):
+        """Run the pipeline; return losses when training, else predictions."""
+        feats = self.backbone(self.inputs)
+        if self.neck is not None:
+            feats = self.neck(feats)
+
+        pad_mask = self.inputs.get('pad_mask', None)
+        out_transformer = self.transformer(feats, pad_mask, self.inputs)
+
+        if self.training:
+            losses = self.detr_head(out_transformer, feats, self.inputs)
+            # Total loss sums every entry whose key does not contain 'log'.
+            total = paddle.add_n(
+                [val for key, val in losses.items() if 'log' not in key])
+            losses.update({'loss': total})
+            return losses
+
+        preds = self.detr_head(out_transformer, feats)
+        if self.exclude_post_process:
+            bbox, bbox_num, mask = preds
+        else:
+            bbox, bbox_num, mask = self.post_process(
+                preds, self.inputs['im_shape'], self.inputs['scale_factor'],
+                paddle.shape(self.inputs['image'])[2:])
+
+        output = {'bbox': bbox, 'bbox_num': bbox_num}
+        if self.with_mask:
+            output['mask'] = mask
+        return output
+
+    def get_loss(self):
+        """Training entry point; delegates to ``_forward``."""
+        return self._forward()
+
+    def get_pred(self):
+        """Inference entry point; delegates to ``_forward``."""
+        return self._forward()
diff --git a/rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py b/rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py
new file mode 100644
index 0000000..370b2b1
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py
@@ -0,0 +1,132 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import typing
+
+from ppdet.core.workspace import register
+from ppdet.modeling.post_process import nms
+
+__all__ = ['BaseArch']
+
+
+@register
+class BaseArch(nn.Layer):
+    """Base class for detection architectures.
+
+    ``forward`` stores the batch in ``self.inputs`` and calls ``get_loss``
+    when training or ``get_pred`` otherwise; subclasses implement both.
+    """
+
+    def __init__(self, data_format='NCHW', use_extra_data=False):
+        super(BaseArch, self).__init__()
+        self.data_format = data_format
+        self.inputs = {}
+        self.fuse_norm = False
+        self.use_extra_data = use_extra_data
+
+    def load_meanstd(self, cfg_transform):
+        """Set ``self.scale``/``self.bias`` from the first NormalizeImage item.
+
+        The hard-coded default mean/std and a scale of 1 are used when
+        ``cfg_transform`` contains no 'NormalizeImage' entry.
+        """
+        scale = 1.
+        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+        for item in cfg_transform:
+            if 'NormalizeImage' not in item:
+                continue
+            norm_cfg = item['NormalizeImage']
+            mean = np.array(norm_cfg['mean'], dtype=np.float32)
+            std = np.array(norm_cfg['std'], dtype=np.float32)
+            if norm_cfg.get('is_scale', True):
+                scale = 1. / 255.
+            break
+        # Shape broadcasts against the channel axis (last for NHWC, else 1).
+        shape = (1, 1, 1, 3) if self.data_format == 'NHWC' else (1, 3, 1, 1)
+        self.scale = paddle.to_tensor(scale / std).reshape(shape)
+        self.bias = paddle.to_tensor(-mean / std).reshape(shape)
+
+    def forward(self, inputs):
+        """Run the model on ``inputs`` and return losses or predictions."""
+        if self.data_format == 'NHWC':
+            inputs['image'] = paddle.transpose(inputs['image'], [0, 2, 3, 1])
+
+        if not self.fuse_norm:
+            self.inputs = inputs
+        else:
+            # Apply the scale/bias normalization inside the model.
+            self.inputs['image'] = inputs['image'] * self.scale + self.bias
+            self.inputs['im_shape'] = inputs['im_shape']
+            self.inputs['scale_factor'] = inputs['scale_factor']
+
+        self.model_arch()
+
+        if self.training:
+            return self.get_loss()
+
+        # A sequence of input dicts means multi-scale testing.
+        if isinstance(inputs, typing.Sequence):
+            inputs_list = list(inputs)
+        else:
+            inputs_list = [inputs]
+
+        outs = []
+        for inp in inputs_list:
+            if self.fuse_norm:
+                self.inputs['image'] = inp['image'] * self.scale + self.bias
+                self.inputs['im_shape'] = inp['im_shape']
+                self.inputs['scale_factor'] = inp['scale_factor']
+            else:
+                self.inputs = inp
+            outs.append(self.get_pred())
+
+        if len(outs) > 1:
+            return self.merge_multi_scale_predictions(outs)
+        return outs[0]
+
+    def merge_multi_scale_predictions(self, outs):
+        """Merge per-scale boxes: run ``nms`` per class, keep ``keep_top_k``."""
+        if self.__class__.__name__ not in ('CascadeRCNN', 'FasterRCNN',
+                                           'MaskRCNN'):
+            raise Exception(
+                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
+            )
+        num_classes = self.bbox_head.num_classes
+        keep_top_k = self.bbox_post_process.nms.keep_top_k
+        nms_threshold = self.bbox_post_process.nms.nms_threshold
+
+        merged = paddle.concat([o['bbox'] for o in outs]).numpy()
+        per_class = []
+        for cls_id in range(num_classes):
+            selected = merged[:, 0] == cls_id
+            if np.count_nonzero(selected) == 0:
+                continue
+            kept = nms(merged[selected, 1:], nms_threshold)
+            labels = np.full((kept.shape[0], 1), cls_id)
+            per_class.append(np.concatenate([labels, kept], 1))
+        boxes = np.concatenate(per_class)
+        # Sort rows by column 1 (ascending) and keep the last keep_top_k.
+        boxes = np.concatenate(sorted(
+            boxes, key=lambda row: row[1])[-keep_top_k:]).reshape((-1, 6))
+        return {
+            'bbox': paddle.to_tensor(boxes),
+            'bbox_num': paddle.to_tensor(np.array([boxes.shape[0], ]))
+        }
+
+    def build_inputs(self, data, input_def):
+        """Map each name in ``input_def`` to the item of ``data`` at its index."""
+        return {key: data[idx] for idx, key in enumerate(input_def)}
+
+    def model_arch(self, ):
+        pass
+
+    def get_loss(self, ):
+        raise NotImplementedError("Should implement get_loss method!")
+
+    def get_pred(self, ):
+        raise NotImplementedError("Should implement get_pred method!")
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/__init__.py b/rtdetr_paddle/ppdet/modeling/backbones/__init__.py
new file mode 100644
index 0000000..2ea3991
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from .resnet import *
+from .darknet import *
+from .mobilenet_v1 import *
+from .mobilenet_v3 import *
+from .shufflenet_v2 import *
+from .swin_transformer import *
+from .lcnet import *
+from .cspresnet import *
+from .csp_darknet import *
+from .convnext import *
+from .vision_transformer import *
+from .mobileone import *
+from .trans_encoder import *
+from .focalnet import *
+from .vit_mae import *
+from .hgnet_v2 import *
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/convnext.py b/rtdetr_paddle/ppdet/modeling/backbones/convnext.py
new file mode 100644
index 0000000..476e12b
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/convnext.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+'''
+Modified from https://github.com/facebookresearch/ConvNeXt
+Copyright (c) Meta Platforms, Inc. and affiliates.
+All rights reserved.
+This source code is licensed under the license found in the
+LICENSE file in the root directory of this source tree.
+'''
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+
+import numpy as np
+
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+from .transformer_utils import DropPath, trunc_normal_, zeros_
+
+__all__ = ['ConvNeXt']
+
+
+class Block(nn.Layer):
+    r"""ConvNeXt residual block.
+
+    Pipeline: 7x7 depthwise conv -> permute to (N, H, W, C) -> LayerNorm ->
+    Linear (dim -> 4*dim) -> GELU -> Linear (4*dim -> dim) -> optional
+    per-channel scale -> permute back to (N, C, H, W) -> drop path, added to
+    the block input.
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale; the scale
+            is disabled when it is not positive. Default: 1e-6.
+    """
+
+    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+        super().__init__()
+        # Depthwise conv: groups equals the channel count.
+        self.dwconv = nn.Conv2D(dim, dim, kernel_size=7, padding=3, groups=dim)
+        self.norm = LayerNorm(dim, eps=1e-6)
+        # Pointwise (1x1) convs implemented as linear layers.
+        self.pwconv1 = nn.Linear(dim, 4 * dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(4 * dim, dim)
+
+        if layer_scale_init_value > 0:
+            self.gamma = self.create_parameter(
+                shape=(dim, ),
+                attr=ParamAttr(initializer=Constant(layer_scale_init_value)))
+        else:
+            self.gamma = None
+
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, x):
+        shortcut = x
+        x = self.dwconv(x).transpose([0, 2, 3, 1])
+        x = self.pwconv2(self.act(self.pwconv1(self.norm(x))))
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.transpose([0, 3, 1, 2])
+        return shortcut + self.drop_path(x)
+
+
+class LayerNorm(nn.Layer):
+    r"""Layer normalization over the channel axis for two input layouts.
+
+    ``channels_last`` (default) expects (batch_size, height, width, channels)
+    and uses ``F.layer_norm``; ``channels_first`` expects (batch_size,
+    channels, height, width) and normalizes over axis 1 manually.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+        param_shape = (normalized_shape, )
+        self.weight = self.create_parameter(
+            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
+        self.bias = self.create_parameter(
+            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))
+
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError
+        self.normalized_shape = param_shape
+
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight,
+                                self.bias, self.eps)
+        elif self.data_format == "channels_first":
+            mean = x.mean(1, keepdim=True)
+            var = (x - mean).pow(2).mean(1, keepdim=True)
+            normed = (x - mean) / paddle.sqrt(var + self.eps)
+            # Index the per-channel affine params so they broadcast over H, W.
+            return (self.weight[:, None, None] * normed +
+                    self.bias[:, None, None])
+
+
+@register
+@serializable
+class ConvNeXt(nn.Layer):
+    r"""ConvNeXt backbone (`A ConvNet for the 2020s`,
+    https://arxiv.org/pdf/2201.03545.pdf).
+
+    Args:
+        arch (str): Key of ``arch_settings`` selecting per-stage depths and
+            dims. Default: 'tiny'
+        in_chans (int): Number of input image channels. Default: 3
+        drop_path_rate (float): Largest stochastic depth rate; per-block
+            rates are spaced linearly from 0 to this value. Default: 0.
+        layer_scale_init_value (float): Init value for Layer Scale.
+            Default: 1e-6.
+        return_idx (list[int]): Indices of the stages whose features are
+            returned. Default: [1, 2, 3]
+        norm_output (bool): Whether to apply a LayerNorm to every returned
+            feature map. Default: True
+        pretrained (str): URL or local path of weights to load.
+            Default: None
+    """
+
+    # Number of blocks (depths) and channels (dims) of the four stages.
+    arch_settings = {
+        'tiny': {
+            'depths': [3, 3, 9, 3],
+            'dims': [96, 192, 384, 768]
+        },
+        'small': {
+            'depths': [3, 3, 27, 3],
+            'dims': [96, 192, 384, 768]
+        },
+        'base': {
+            'depths': [3, 3, 27, 3],
+            'dims': [128, 256, 512, 1024]
+        },
+        'large': {
+            'depths': [3, 3, 27, 3],
+            'dims': [192, 384, 768, 1536]
+        },
+        'xlarge': {
+            'depths': [3, 3, 27, 3],
+            'dims': [256, 512, 1024, 2048]
+        },
+    }
+
+    def __init__(self, arch='tiny', in_chans=3, drop_path_rate=0.,
+                 layer_scale_init_value=1e-6, return_idx=[1, 2, 3],
+                 norm_output=True, pretrained=None, ):
+        super().__init__()
+        setting = self.arch_settings[arch]
+        depths, dims = setting['depths'], setting['dims']
+
+        # Stem (stride-4 conv) followed by three stride-2 downsampling layers.
+        self.downsample_layers = nn.LayerList()
+        stem = nn.Sequential(
+            nn.Conv2D(in_chans, dims[0], kernel_size=4, stride=4),
+            LayerNorm(dims[0], eps=1e-6, data_format="channels_first"))
+        self.downsample_layers.append(stem)
+        for i in range(3):
+            self.downsample_layers.append(nn.Sequential(
+                LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
+                nn.Conv2D(dims[i], dims[i + 1], kernel_size=2, stride=2)))
+
+        # Four stages of residual blocks; drop-path rates increase linearly.
+        self.stages = nn.LayerList()
+        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
+        cur = 0
+        for i in range(4):
+            blocks = [
+                Block(dim=dims[i], drop_path=dp_rates[cur + j],
+                      layer_scale_init_value=layer_scale_init_value)
+                for j in range(depths[i])
+            ]
+            self.stages.append(nn.Sequential(*blocks))
+            cur += depths[i]
+
+        # NOTE: the list default of return_idx is stored as-is, not copied.
+        self.return_idx = return_idx
+        self.dims = [dims[i] for i in return_idx]
+
+        self.norm_output = norm_output
+        if norm_output:
+            self.norms = nn.LayerList([
+                LayerNorm(c, eps=1e-6, data_format="channels_first")
+                for c in self.dims
+            ])
+
+        self.apply(self._init_weights)
+
+        if pretrained is not None:
+            if 'http' in pretrained:
+                # Treated as a URL: resolve it to a downloaded weights path.
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:
+                # Treated as a local file path.
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _init_weights(self, m):
+        """Apply ``trunc_normal_`` to weights and ``zeros_`` to biases."""
+        if isinstance(m, (nn.Conv2D, nn.Linear)):
+            trunc_normal_(m.weight)
+            zeros_(m.bias)
+
+    def forward_features(self, x):
+        """Return the (optionally normalized) features of ``return_idx``."""
+        stage_outs = []
+        for i in range(4):
+            x = self.stages[i](self.downsample_layers[i](x))
+            stage_outs.append(x)
+
+        feats = [stage_outs[i] for i in self.return_idx]
+        if self.norm_output:
+            feats = [self.norms[i](f) for i, f in enumerate(feats)]
+
+        return feats
+
+    def forward(self, x):
+        """Extract features from ``x['image']``."""
+        return self.forward_features(x['image'])
+
+    @property
+    def out_shape(self):
+        """One ShapeSpec per returned stage, carrying its channel count."""
+        return [ShapeSpec(channels=c) for c in self.dims]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py b/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
new file mode 100644
index 0000000..4c225d1
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
@@ -0,0 +1,404 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.initializer import conv_init_
+from ..shape_spec import ShapeSpec
+
+__all__ = [
+    'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
+]
+
+
+class BaseConv(nn.Layer):
+    """Conv2D -> BatchNorm2D -> SiLU (computed as ``x * sigmoid(x)``).
+
+    Padding is ``(ksize - 1) // 2`` and the BatchNorm affine parameters use
+    ``L2Decay(0.0)``. ``act`` is accepted but not read here: the activation
+    is always the sigmoid-gated form applied in ``forward``.
+    """
+
+    def __init__(self, in_channels, out_channels, ksize, stride, groups=1,
+                 bias=False, act="silu"):
+        super(BaseConv, self).__init__()
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=(ksize - 1) // 2,
+            groups=groups,
+            bias_attr=bias)
+        self.bn = nn.BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self._init_weights()
+
+    def _init_weights(self):
+        """Initialize the convolution with ``conv_init_``."""
+        conv_init_(self.conv)
+
+    def forward(self, x):
+        # 'x * F.sigmoid(x)' is used in place of a 'silu' op.
+        normed = self.bn(self.conv(x))
+        return normed * F.sigmoid(normed)
+
+
+class DWConv(nn.Layer):
+    """Depthwise-separable convolution.
+
+    A depthwise BaseConv (groups == in_channels) followed by a 1x1 pointwise
+    BaseConv.
+
+    Args:
+        in_channels (int): Input channels; also the depthwise group count.
+        out_channels (int): Output channels of the pointwise conv.
+        ksize (int): Kernel size of the depthwise conv.
+        stride (int): Stride of the depthwise conv. Default: 1
+        bias (bool): Forwarded to both convs as ``bias``. Default: False
+        act (str): Forwarded to both convs as ``act``. Default: "silu"
+    """
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, bias=False,
+                 act="silu"):
+        super(DWConv, self).__init__()
+        # Depthwise stage: one group per input channel, channel count kept.
+        self.dw_conv = BaseConv(
+            in_channels, in_channels, ksize=ksize, stride=stride,
+            groups=in_channels, bias=bias, act=act)
+        # Pointwise stage: 1x1 conv that sets the output channel count.
+        self.pw_conv = BaseConv(
+            in_channels, out_channels, ksize=1, stride=1, groups=1, bias=bias,
+            act=act)
+
+    def forward(self, x):
+        depthwise_out = self.dw_conv(x)
+        return self.pw_conv(depthwise_out)
+
+
+class Focus(nn.Layer):
+    """Focus width and height information into channel space, used in YOLOX.
+
+    The input is split into four interleaved sub-grids (even/odd rows and
+    columns) that are concatenated along the channel axis and passed to a
+    BaseConv expecting ``in_channels * 4`` channels.
+
+    Args:
+        in_channels (int): Channels of the input feature map.
+        out_channels (int): Channels produced by the conv.
+        ksize (int): Conv kernel size. Default: 3
+        stride (int): Conv stride. Default: 1
+        bias (bool), act (str): Forwarded to BaseConv.
+    """
+
+    def __init__(self, in_channels, out_channels, ksize=3, stride=1,
+                 bias=False, act="silu"):
+        super(Focus, self).__init__()
+        self.conv = BaseConv(
+            in_channels * 4, out_channels, ksize=ksize, stride=stride,
+            bias=bias, act=act)
+
+    def forward(self, inputs):
+        # inputs [bs, C, H, W] -> stacked patches [bs, 4C, H/2, W/2]
+        # Channel order: top-left, bottom-left, top-right, bottom-right.
+        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
+        patches = [inputs[:, :, row::2, col::2] for row, col in offsets]
+        return self.conv(paddle.concat(patches, 1))
+
+
+class BottleNeck(nn.Layer):
+    """1x1 BaseConv followed by a 3x3 conv, with an optional residual add.
+
+    Args:
+        in_channels (int): Input channels.
+        out_channels (int): Output channels.
+        shortcut (bool): Request a residual connection; it is only used when
+            ``in_channels == out_channels``. Default: True
+        expansion (float): Hidden width as a fraction of ``out_channels``.
+        depthwise (bool): Use DWConv instead of BaseConv for the 3x3 conv.
+        bias (bool), act (str): Forwarded to the conv layers.
+    """
+
+    def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5,
+                 depthwise=False, bias=False, act="silu"):
+        super(BottleNeck, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        conv2_cls = DWConv if depthwise else BaseConv
+        self.conv2 = conv2_cls(
+            hidden_channels, out_channels, ksize=3, stride=1, bias=bias,
+            act=act)
+        self.add_shortcut = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        out = self.conv2(self.conv1(x))
+        return out + x if self.add_shortcut else out
+
+
+class SPPLayer(nn.Layer):
+    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX.
+
+    A 1x1 conv halves the channels; the result and its stride-1 max-pooled
+    versions (one per entry of ``kernel_sizes``) are concatenated on the
+    channel axis and fused by a second 1x1 conv.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13),
+                 bias=False, act="silu"):
+        super(SPPLayer, self).__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        pools = [
+            nn.MaxPool2D(kernel_size=ks, stride=1, padding=ks // 2)
+            for ks in kernel_sizes
+        ]
+        self.maxpoolings = nn.LayerList(pools)
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        branches = [x] + [pool(x) for pool in self.maxpoolings]
+        return self.conv2(paddle.concat(branches, axis=1))
+
+
+class SPPFLayer(nn.Layer):
+    """Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn
+    Jocher, equivalent to SPP(k=(5, 9, 13)).
+
+    A 1x1 conv halves the channels, then one stride-1 max pooling is applied
+    three times in sequence. The conv output and the three pooled maps are
+    concatenated on the channel axis and fused by a second 1x1 conv.
+    """
+
+    def __init__(self, in_channels, out_channels, ksize=5, bias=False,
+                 act='silu'):
+        super(SPPFLayer, self).__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.maxpooling = nn.MaxPool2D(
+            kernel_size=ksize, stride=1, padding=ksize // 2)
+        # Four branches are concatenated: conv1 output plus three poolings.
+        self.conv2 = BaseConv(
+            hidden_channels * 4, out_channels, ksize=1, stride=1, bias=bias,
+            act=act)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        pooled = [x]
+        # Each pooling consumes the previous pooling's output.
+        for _ in range(3):
+            pooled.append(self.maxpooling(pooled[-1]))
+        return self.conv2(paddle.concat(pooled, axis=1))
+
+
+class CSPLayer(nn.Layer):
+    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5.
+
+    The input goes through two parallel 1x1 convs; the first branch is
+    further processed by ``num_blocks`` BottleNecks. Both branches are
+    concatenated on the channel axis and fused by a third 1x1 conv.
+
+    Args:
+        in_channels (int): Input channels.
+        out_channels (int): Output channels.
+        num_blocks (int): Number of BottleNeck blocks. Default: 1
+        shortcut (bool): Forwarded to every BottleNeck. Default: True
+        expansion (float): Branch width as a fraction of ``out_channels``.
+            Default: 0.5
+        depthwise (bool): Forwarded to every BottleNeck. Default: False
+        bias (bool), act (str): Forwarded to the conv layers.
+    """
+
+    def __init__(self, in_channels, out_channels, num_blocks=1, shortcut=True,
+                 expansion=0.5, depthwise=False, bias=False, act="silu"):
+        super(CSPLayer, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        # Inner BottleNecks keep the branch width (expansion=1.0).
+        blocks = [
+            BottleNeck(hidden_channels, hidden_channels, shortcut=shortcut,
+                       expansion=1.0, depthwise=depthwise, bias=bias, act=act)
+            for _ in range(num_blocks)
+        ]
+        self.bottlenecks = nn.Sequential(*blocks)
+        self.conv3 = BaseConv(
+            hidden_channels * 2, out_channels, ksize=1, stride=1, bias=bias,
+            act=act)
+
+    def forward(self, x):
+        """Fuse the bottleneck branch with the plain 1x1 branch."""
+        main_branch = self.bottlenecks(self.conv1(x))
+        side_branch = self.conv2(x)
+        merged = paddle.concat([main_branch, side_branch], axis=1)
+        return self.conv3(merged)
+
+
+@register
+@serializable
+class CSPDarkNet(nn.Layer):
+    """
+    CSPDarkNet backbone.
+    Args:
+        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
+            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
+        depth_mult (float): Depth multiplier, multiply number of blocks in
+            CSPLayer, default as 1.0.
+        width_mult (float): Width multiplier, multiply number of channels in
+            each layer, default as 1.0.
+        depthwise (bool): Whether to use depth-wise conv layer.
+        act (str): Activation function type, default as 'silu'.
+        return_idx (list): Index of stages whose feature maps are returned.
+    """
+
+    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
+
+    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
+    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
+    arch_settings = {
+        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, True, True]],
+    }
+
+    def __init__(self,
+                 arch='X',
+                 depth_mult=1.0,
+                 width_mult=1.0,
+                 depthwise=False,
+                 act='silu',
+                 trt=False,
+                 return_idx=[2, 3, 4]):
+        super(CSPDarkNet, self).__init__()
+        self.arch = arch
+        self.return_idx = return_idx
+        Conv = DWConv if depthwise else BaseConv
+        arch_setting = self.arch_settings[arch]
+        base_channels = int(arch_setting[0][0] * width_mult)
+
+        # Note: differences between the latest YOLOv5 and the original YOLOX
+        # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX)
+        # 2. use SPPF(in YOLOv5) or SPP(in YOLOX)
+        # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer
+        # 4. whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX
+        if arch in ['P5', 'P6']:
+            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size)
+            self.stem = Conv(
+                3, base_channels, ksize=6, stride=2, bias=False, act=act)
+            spp_kernal_sizes = 5
+        elif arch in ['X']:
+            # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes)
+            self.stem = Focus(
+                3, base_channels, ksize=3, stride=1, bias=False, act=act)
+            spp_kernal_sizes = (5, 9, 13)
+        else:
+            raise AttributeError("Unsupported arch type: {}".format(arch))
+
+        _out_channels = [base_channels]
+        layers_num = 1
+        self.csp_dark_blocks = []
+
+        for i, (in_channels, out_channels, num_blocks, shortcut,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * width_mult)
+            out_channels = int(out_channels * width_mult)
+            _out_channels.append(out_channels)
+            num_blocks = max(round(num_blocks * depth_mult), 1)
+            stage = []
+
+            conv_layer = self.add_sublayer(
+                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
+                Conv(
+                    in_channels, out_channels, 3, 2, bias=False, act=act))
+            stage.append(conv_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['X']:
+                # in YOLOX use SPPLayer
+                spp_layer = self.add_sublayer(
+                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
+                    SPPLayer(
+                        out_channels,
+                        out_channels,
+                        kernel_sizes=spp_kernal_sizes,
+                        bias=False,
+                        act=act))
+                stage.append(spp_layer)
+                layers_num += 1
+
+            csp_layer = self.add_sublayer(
+                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
+                CSPLayer(
+                    out_channels,
+                    out_channels,
+                    num_blocks=num_blocks,
+                    shortcut=shortcut,
+                    depthwise=depthwise,
+                    bias=False,
+                    act=act))
+            stage.append(csp_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['P5', 'P6']:
+                # in latest YOLOv5 use SPPFLayer instead of SPPLayer
+                sppf_layer = self.add_sublayer(
+                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
+                    SPPFLayer(
+                        out_channels,
+                        out_channels,
+                        ksize=5,
+                        bias=False,
+                        act=act))
+                stage.append(sppf_layer)
+                layers_num += 1
+
+            self.csp_dark_blocks.append(nn.Sequential(*stage))
+
+        self._out_channels = [_out_channels[i] for i in self.return_idx]
+        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        outputs = []
+        x = self.stem(x)
+        for i, layer in enumerate(self.csp_dark_blocks):
+            x = layer(x)
+            if i + 1 in self.return_idx:
+                outputs.append(x)
+        return outputs
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(
+                channels=c, stride=s)
+            for c, s in zip(self._out_channels, self.strides)
+        ]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py b/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
new file mode 100644
index 0000000..5268ec8
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant
+
+from ppdet.modeling.ops import get_act_fn
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):  # Conv2D (no bias) -> BatchNorm2D -> activation
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 act=None):  # act: None/str/dict goes through get_act_fn; anything else is used as given
+        super(ConvBNLayer, self).__init__()
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias_attr=False)  # no conv bias; the BatchNorm that follows has its own bias
+
+        self.bn = nn.BatchNorm2D(
+            ch_out,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),  # L2Decay coefficient 0.0 on the BN weight
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))  # and on the BN bias
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act  # presumably get_act_fn(None) yields a pass-through callable — verify in ppdet.modeling.ops
+
+    def forward(self, x):  # applies conv, then bn, then the activation
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.act(x)  # always called, so self.act must be callable even when act=None
+
+        return x
+
+
+class RepVggBlock(nn.Layer):  # 3x3 conv-bn + 1x1 conv-bn branches that can be fused into one 3x3 conv
+    def __init__(self, ch_in, ch_out, act='relu', alpha=False):  # alpha=True adds a learnable scale on the 1x1 branch
+        super(RepVggBlock, self).__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=None)
+        self.conv2 = ConvBNLayer(
+            ch_in, ch_out, 1, stride=1, padding=0, act=None)
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+        if alpha:
+            self.alpha = self.create_parameter(
+                shape=[1],
+                attr=ParamAttr(initializer=Constant(value=1.)),
+                dtype="float32")
+        else:
+            self.alpha = None
+
+    def forward(self, x):  # uses the fused self.conv once convert_to_deploy() has created it
+        if hasattr(self, 'conv'):
+            y = self.conv(x)
+        else:
+            if self.alpha is not None:  # test presence, not value: a learned alpha of 0 must still scale the branch
+                y = self.conv1(x) + self.alpha * self.conv2(x)
+            else:
+                y = self.conv1(x) + self.conv2(x)
+        y = self.act(y)
+        return y
+
+    def convert_to_deploy(self):  # fuse both branches into self.conv and drop conv1/conv2
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        kernel, bias = self.get_equivalent_kernel_bias()  # NOTE(review): reads conv1/conv2, so a second call fails after they are deleted
+        self.conv.weight.set_value(kernel)
+        self.conv.bias.set_value(bias)
+        self.__delattr__('conv1')
+        self.__delattr__('conv2')
+
+    def get_equivalent_kernel_bias(self):  # returns (kernel, bias) of the single 3x3 conv equal to the two branches
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        if self.alpha is not None:  # same presence test as forward, so the fused conv matches training-time output
+            return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
+                kernel1x1), bias3x3 + self.alpha * bias1x1
+        else:
+            return kernel3x3 + self._pad_1x1_to_3x3_tensor(
+                kernel1x1), bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):  # pad by 1 on each side of the last two dims; None maps to 0
+        if kernel1x1 is None:
+            return 0
+        else:
+            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):  # fold the branch's BatchNorm statistics into its conv weight; returns (kernel, bias)
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.bn._mean
+        running_var = branch.bn._variance
+        gamma = branch.bn.weight
+        beta = branch.bn.bias
+        eps = branch.bn._epsilon
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape((-1, 1, 1, 1))  # per-output-channel scale, broadcast over the kernel
+        return kernel * t, beta - running_mean * gamma / std
+
+
+class BasicBlock(nn.Layer):  # ConvBNLayer(3x3) followed by RepVggBlock, with an optional residual add
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 act='relu',
+                 shortcut=True,  # when True, forward returns x + block(x)
+                 use_alpha=False):  # forwarded to RepVggBlock as alpha
+        super(BasicBlock, self).__init__()
+        assert ch_in == ch_out  # input and output widths must be equal
+        self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
+        self.shortcut = shortcut
+
+    def forward(self, x):  # conv1 -> conv2, plus the input when shortcut is enabled
+        y = self.conv1(x)
+        y = self.conv2(y)
+        if self.shortcut:
+            return paddle.add(x, y)
+        else:
+            return y
+
+
+class EffectiveSELayer(nn.Layer):
+    """ Effective Squeeze-Excitation: rescales the input by act(fc(mean over axes 2 and 3)).
+    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
+    """
+
+    def __init__(self, channels, act='hardsigmoid'):  # act: None/str/dict goes through get_act_fn, else used as given
+        super(EffectiveSELayer, self).__init__()
+        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)  # 1x1 conv keeping the channel count
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+
+    def forward(self, x):  # returns x multiplied by the gate computed from its own mean
+        x_se = x.mean((2, 3), keepdim=True)  # assumes NCHW so axes 2, 3 are spatial — TODO confirm
+        x_se = self.fc(x_se)
+        return x * self.act(x_se)  # keepdim=True above lets the gate broadcast against x
+
+
+class CSPResStage(nn.Layer):  # optional stride-2 conv, two 1x1 branches (one through n blocks), concat, optional attention, 1x1 conv
+    def __init__(self,
+                 block_fn,  # called as block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)
+                 ch_in,
+                 ch_out,
+                 n,  # number of block_fn instances in the second branch
+                 stride,
+                 act='relu',
+                 attn='eca',  # only truth-tested: any truthy value enables EffectiveSELayer
+                 use_alpha=False):
+        super(CSPResStage, self).__init__()
+
+        ch_mid = (ch_in + ch_out) // 2
+        if stride == 2:
+            self.conv_down = ConvBNLayer(
+                ch_in, ch_mid, 3, stride=2, padding=1, act=act)
+        else:
+            self.conv_down = None  # NOTE(review): conv1/conv2 below still take ch_mid input channels in this case
+        self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
+        self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
+        self.blocks = nn.Sequential(*[
+            block_fn(
+                ch_mid // 2,
+                ch_mid // 2,
+                act=act,
+                shortcut=True,
+                use_alpha=use_alpha) for i in range(n)
+        ])
+        if attn:
+            self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
+        else:
+            self.attn = None
+
+        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
+
+    def forward(self, x):  # returns the stage output produced by conv3
+        if self.conv_down is not None:
+            x = self.conv_down(x)
+        y1 = self.conv1(x)  # branch 1: 1x1 conv only
+        y2 = self.blocks(self.conv2(x))  # branch 2: 1x1 conv then the n blocks
+        y = paddle.concat([y1, y2], axis=1)  # join the two branches along axis 1
+        if self.attn is not None:
+            y = self.attn(y)
+        y = self.conv3(y)
+        return y
+
+
+@register
+@serializable
+class CSPResNet(nn.Layer):  # stem followed by len(channels) - 1 CSPResStage stages; forward returns selected stage outputs
+    __shared__ = ['width_mult', 'depth_mult', 'trt']
+
+    def __init__(self,
+                 layers=[3, 6, 6, 3],  # BasicBlock count per stage, before depth_mult
+                 channels=[64, 128, 256, 512, 1024],  # stem width followed by each stage's output width, before width_mult
+                 act='swish',
+                 return_idx=[1, 2, 3],  # 0-based indices into self.stages
+                 depth_wise=False,  # not referenced in this class
+                 use_large_stem=False,  # True builds a three-conv stem instead of two
+                 width_mult=1.0,
+                 depth_mult=1.0,
+                 trt=False,  # forwarded to get_act_fn
+                 use_checkpoint=False,  # recompute each stage during training (see forward)
+                 use_alpha=False,  # forwarded to every CSPResStage
+                 **args):  # extra keyword arguments are accepted and ignored
+        super(CSPResNet, self).__init__()
+        self.use_checkpoint = use_checkpoint
+        channels = [max(round(c * width_mult), 1) for c in channels]  # scaled widths, at least 1
+        layers = [max(round(l * depth_mult), 1) for l in layers]  # scaled block counts, at least 1
+        act = get_act_fn(
+            act, trt=trt) if act is None or isinstance(act,
+                                                       (str, dict)) else act  # resolved once and shared by all sublayers
+
+        if use_large_stem:
+            self.stem = nn.Sequential(
+                ('conv1', ConvBNLayer(
+                    3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
+                ('conv2', ConvBNLayer(
+                    channels[0] // 2,
+                    channels[0] // 2,
+                    3,
+                    stride=1,
+                    padding=1,
+                    act=act)), ('conv3', ConvBNLayer(
+                        channels[0] // 2,
+                        channels[0],
+                        3,
+                        stride=1,
+                        padding=1,
+                        act=act)))
+        else:
+            self.stem = nn.Sequential(
+                ('conv1', ConvBNLayer(
+                    3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
+                ('conv2', ConvBNLayer(
+                    channels[0] // 2,
+                    channels[0],
+                    3,
+                    stride=1,
+                    padding=1,
+                    act=act)))
+
+        n = len(channels) - 1  # one stage per consecutive pair of channel widths
+        self.stages = nn.Sequential(*[(str(i), CSPResStage(
+            BasicBlock,
+            channels[i],
+            channels[i + 1],
+            layers[i],
+            2,
+            act=act,
+            use_alpha=use_alpha)) for i in range(n)])
+
+        self._out_channels = channels[1:]
+        self._out_strides = [4 * 2**i for i in range(n)]  # 4, 8, 16, ... for stages 0, 1, 2, ...
+        self.return_idx = return_idx
+        if use_checkpoint:
+            paddle.seed(0)  # NOTE(review): reseeds paddle's global RNG as a side effect of construction
+
+    def forward(self, inputs):  # inputs is a dict read at key 'image'; returns a list of stage outputs
+        x = inputs['image']
+        x = self.stem(x)
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            if self.use_checkpoint and self.training:  # recompute only while training
+                x = paddle.distributed.fleet.utils.recompute(
+                    stage, x, **{"preserve_rng_state": True})
+            else:
+                x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+
+        return outs
+
+    @property
+    def out_shape(self):  # one ShapeSpec (channels, stride) per index in return_idx
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/darknet.py b/rtdetr_paddle/ppdet/modeling/backbones/darknet.py
new file mode 100755
index 0000000..c68c650
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/darknet.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.ops import batch_norm, mish
+from ..shape_spec import ShapeSpec
+
+__all__ = ['DarkNet', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 act="leaky",
+                 freeze_norm=False,
+                 data_format='NCHW',
+                 name=''):  # name is accepted but not referenced in this class
+        """
+        conv + bn + activation layer
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 1
+            groups (int): number of groups of conv layer, default 1
+            padding (int): padding size, default 0
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            act (str): activation function type, default 'leaky', which means leaky_relu
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(ConvBNLayer, self).__init__()
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            data_format=data_format,
+            bias_attr=False)  # no conv bias; a norm layer follows
+        self.batch_norm = batch_norm(
+            ch_out,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.act = act  # kept as a string and resolved on every forward call
+
+    def forward(self, inputs):  # conv -> norm -> activation
+        out = self.conv(inputs)
+        out = self.batch_norm(out)
+        if self.act == 'leaky':
+            out = F.leaky_relu(out, 0.1)  # 'leaky' means leaky_relu with negative slope 0.1
+        else:
+            out = getattr(F, self.act)(out)  # any other act is looked up by name in paddle.nn.functional
+        return out
+
+
+class DownSample(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=2,
+                 padding=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        downsample layer: a single ConvBNLayer with stride 2 by default
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 2
+            padding (int): padding size, default 1
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super(DownSample, self).__init__()
+
+        self.conv_bn_layer = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)  # act is left at ConvBNLayer's default
+        self.ch_out = ch_out
+
+    def forward(self, inputs):  # returns the output of the wrapped ConvBNLayer
+        out = self.conv_bn_layer(inputs)
+        return out
+
+
+class BasicBlock(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        BasicBlock layer of DarkNet: 1x1 conv to half the channels, 3x3 conv back, plus residual add
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super(BasicBlock, self).__init__()
+
+        assert ch_in == ch_out and (ch_in % 2) == 0, \
+            f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"
+        # example:
+        # --------------{conv1} --> {conv2}
+        # channel route: 10-->5 --> 5-->10
+        self.conv1 = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=int(ch_out / 2),
+            filter_size=1,
+            stride=1,
+            padding=0,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.conv2 = ConvBNLayer(
+            ch_in=int(ch_out / 2),
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+    def forward(self, inputs):  # returns inputs + conv2(conv1(inputs))
+        conv1 = self.conv1(inputs)
+        conv2 = self.conv2(conv1)
+        out = paddle.add(x=inputs, y=conv2)  # residual add; the assert in __init__ keeps channel counts equal
+        return out
+
+
+class Blocks(nn.Layer):
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 count,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None,  # NOTE(review): with the default None, sublayers are registered as 'None.1', 'None.2', ...
+                 data_format='NCHW'):
+        """
+        Blocks layer, which consists of `count` BasicBlock layers applied in sequence
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            count (int): number of BasicBlock layer
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            name (str): layer name, used as the prefix of the sublayer names
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(Blocks, self).__init__()
+
+        self.basicblock0 = BasicBlock(
+            ch_in,
+            ch_out,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.res_out_list = []  # plain list; each block is registered through add_sublayer below
+        for i in range(1, count):  # blocks 1..count-1; block 0 is self.basicblock0
+            block_name = '{}.{}'.format(name, i)
+            res_out = self.add_sublayer(
+                block_name,
+                BasicBlock(
+                    ch_out,
+                    ch_out,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    data_format=data_format))
+            self.res_out_list.append(res_out)
+        self.ch_out = ch_out
+
+    def forward(self, inputs):  # runs basicblock0, then every block in res_out_list in order
+        y = self.basicblock0(inputs)
+        for basic_block_i in self.res_out_list:
+            y = basic_block_i(y)
+        return y
+
+
+DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}  # depth -> BasicBlock count per stage (passed to Blocks as `count`)
+
+
+@register
+@serializable
+class DarkNet(nn.Layer):
+    __shared__ = ['norm_type', 'data_format']
+
+    def __init__(self,
+                 depth=53,
+                 freeze_at=-1,
+                 return_idx=[2, 3, 4],
+                 num_stages=5,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """Darknet, see https://pjreddie.com/darknet/yolo/
+
+        Args:
+            depth (int): depth of network, a key of DarkNet_cfg
+            freeze_at (int): freeze the backbone at which stage
+            return_idx (list): index of stages whose feature maps are returned
+            num_stages (int): number of stages to build, default 5
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(DarkNet, self).__init__()
+        self.depth = depth
+        self.freeze_at = freeze_at
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        self.stages = DarkNet_cfg[self.depth][0:num_stages]  # block counts of the first num_stages stages
+
+        self.conv0 = ConvBNLayer(
+            ch_in=3,
+            ch_out=32,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        self.downsample0 = DownSample(
+            ch_in=32,
+            ch_out=32 * 2,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        self._out_channels = []
+        self.darknet_conv_block_list = []
+        self.downsample_list = []
+        ch_in = [64, 128, 256, 512, 1024]  # channel width of each stage
+        for i, stage in enumerate(self.stages):
+            name = 'stage.{}'.format(i)
+            conv_block = self.add_sublayer(
+                name,
+                Blocks(
+                    int(ch_in[i]),
+                    int(ch_in[i]),
+                    stage,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    data_format=data_format,
+                    name=name))
+            self.darknet_conv_block_list.append(conv_block)
+            if i in return_idx:
+                self._out_channels.append(int(ch_in[i]))  # collected in stage order
+        for i in range(num_stages - 1):  # one DownSample between each pair of consecutive stages
+            down_name = 'stage.{}.downsample'.format(i)
+            downsample = self.add_sublayer(
+                down_name,
+                DownSample(
+                    ch_in=int(ch_in[i]),
+                    ch_out=int(ch_in[i + 1]),
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    data_format=data_format))
+            self.downsample_list.append(downsample)
+
+    def forward(self, inputs):  # inputs is a dict read at key 'image'; returns the outputs of the stages in return_idx
+        x = inputs['image']
+
+        out = self.conv0(x)
+        out = self.downsample0(out)
+        blocks = []
+        for i, conv_block_i in enumerate(self.darknet_conv_block_list):
+            out = conv_block_i(out)
+            if i == self.freeze_at:
+                out.stop_gradient = True  # stop gradients at this stage's output
+            if i in self.return_idx:
+                blocks.append(out)  # captured before the downsample that follows the stage
+            if i < self.num_stages - 1:
+                out = self.downsample_list[i](out)
+        return blocks
+
+    @property
+    def out_shape(self):  # one ShapeSpec per returned stage; only channels is passed, no stride
+        return [ShapeSpec(channels=c) for c in self._out_channels]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py b/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
new file mode 100644
index 0000000..54c2877
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
@@ -0,0 +1,720 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
+"""
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+from .transformer_utils import DropPath, Identity
+from .transformer_utils import add_parameter, to_2tuple
+from .transformer_utils import ones_, zeros_, trunc_normal_
+from .swin_transformer import Mlp
+
+__all__ = ['FocalNet']
+
+MODEL_cfg = {  # arch name -> hyper-parameter preset and pretrained-weights URL read by FocalNet.__init__
+    'focalnet_T_224_1k_srf': dict(
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.2,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',
+    ),
+    'focalnet_S_224_1k_srf': dict(
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.3,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',
+    ),
+    'focalnet_B_224_1k_srf': dict(
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        focal_levels=[2, 2, 2, 2],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',
+    ),
+    'focalnet_T_224_1k_lrf': dict(
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.2,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',
+    ),
+    'focalnet_S_224_1k_lrf': dict(
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.3,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',
+    ),
+    'focalnet_B_224_1k_lrf': dict(
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=False,
+        use_postln=False,
+        use_postln_in_modulation=False,
+        use_layerscale=False,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',
+    ),
+    'focalnet_L_384_22k_fl3': dict(
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[5, 5, 5, 5],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
+    ),
+    'focalnet_L_384_22k_fl4': dict(
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        focal_levels=[4, 4, 4, 4],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=True,  # the only preset in this table that enables modulator normalization
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
+    ),
+    'focalnet_XL_384_22k_fl3': dict(
+        embed_dim=256,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[5, 5, 5, 5],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
+    ),
+    'focalnet_XL_384_22k_fl4': dict(
+        embed_dim=256,
+        depths=[2, 2, 18, 2],
+        focal_levels=[4, 4, 4, 4],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=False,
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
+    ),
+    'focalnet_H_224_22k_fl3': dict(
+        embed_dim=352,
+        depths=[2, 2, 18, 2],
+        focal_levels=[3, 3, 3, 3],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=True,  # enabled only by the two H presets in this table
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
+    ),
+    'focalnet_H_224_22k_fl4': dict(
+        embed_dim=352,
+        depths=[2, 2, 18, 2],
+        focal_levels=[4, 4, 4, 4],
+        focal_windows=[3, 3, 3, 3],
+        drop_path_rate=0.5,
+        use_conv_embed=True,
+        use_postln=True,
+        use_postln_in_modulation=True,  # enabled only by the two H presets in this table
+        use_layerscale=True,
+        normalize_modulator=False,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
+    ),
+}
+
+
+class FocalModulation(nn.Layer):
+    """ Focal modulation: a query is multiplied by a gated sum of multi-level conv context.
+    Args:
+        dim (int): Number of input channels.
+        proj_drop (float, optional): Dropout ratio applied after the output projection. Default: 0.0
+        focal_level (int): Number of focal levels, i.e. stacked depthwise convs. Default: 2
+        focal_window (int): Kernel size of the first focal level. Default: 7
+        focal_factor (int): Kernel-size increment from one level to the next. Default: 2
+        use_postln_in_modulation (bool): Whether to apply LayerNorm before the output projection
+        normalize_modulator (bool): Whether to divide the aggregated context by (focal_level + 1)
+    """
+
+    def __init__(self,
+                 dim,
+                 proj_drop=0.,
+                 focal_level=2,
+                 focal_window=7,
+                 focal_factor=2,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False):
+        super().__init__()
+        self.dim = dim
+
+        # specific args for focalv3
+        self.focal_level = focal_level
+        self.focal_window = focal_window
+        self.focal_factor = focal_factor
+        self.use_postln_in_modulation = use_postln_in_modulation
+        self.normalize_modulator = normalize_modulator
+
+        self.f = nn.Linear(
+            dim, 2 * dim + (self.focal_level + 1), bias_attr=True)  # query + context + (focal_level + 1) gates
+        self.h = nn.Conv2D(
+            dim,
+            dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            bias_attr=True)
+
+        self.act = nn.GELU()
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.focal_layers = nn.LayerList()
+
+        if self.use_postln_in_modulation:
+            self.ln = nn.LayerNorm(dim)
+
+        for k in range(self.focal_level):
+            kernel_size = self.focal_factor * k + self.focal_window  # level 0 uses focal_window itself
+            self.focal_layers.append(
+                nn.Sequential(
+                    nn.Conv2D(
+                        dim,
+                        dim,
+                        kernel_size=kernel_size,
+                        stride=1,
+                        groups=dim,  # groups == channels: depthwise convolution
+                        padding=kernel_size // 2,
+                        bias_attr=False),
+                    nn.GELU()))
+
+    def forward(self, x):
+        """ Modulate x with its own multi-level context; the output keeps the (B, H, W, C) layout.
+        Args:
+            x: input features with shape of (B, H, W, C)
+        """
+        _, _, _, C = x.shape
+        x = self.f(x)
+        x = x.transpose([0, 3, 1, 2])  # channels-first for the convolutions below
+        q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1)
+
+        ctx_all = 0
+        for l in range(self.focal_level):
+            ctx = self.focal_layers[l](ctx)  # each level runs on the previous level's output
+            ctx_all = ctx_all + ctx * gates[:, l:l + 1]
+        ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True))  # spatial mean of the last level
+        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]
+        if self.normalize_modulator:
+            ctx_all = ctx_all / (self.focal_level + 1)
+
+        x_out = q * self.h(ctx_all)
+        x_out = x_out.transpose([0, 2, 3, 1])  # back to channels-last
+        if self.use_postln_in_modulation:
+            x_out = self.ln(x_out)
+        x_out = self.proj(x_out)
+        x_out = self.proj_drop(x_out)
+        return x_out
+
+
+class FocalModulationBlock(nn.Layer):
+    """ Focal Modulation Block: a modulation branch and an MLP branch, each added residually.
+    Args:
+        dim (int): Number of input channels.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+        focal_level (int): number of focal levels
+        focal_window (int): focal kernel size of the first level
+        use_postln (bool): If True, norm1 / norm2 run after the modulation / MLP instead of before. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Initial value of both layerscale vectors. Default: 1e-4
+    """
+
+    def __init__(self,
+                 dim,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 focal_level=2,
+                 focal_window=9,
+                 use_postln=False,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False,
+                 use_layerscale=False,
+                 layerscale_value=1e-4):
+        super().__init__()
+        self.dim = dim
+        self.mlp_ratio = mlp_ratio
+        self.focal_window = focal_window
+        self.focal_level = focal_level
+        self.use_postln = use_postln
+        self.use_layerscale = use_layerscale
+
+        self.norm1 = norm_layer(dim)
+        self.modulation = FocalModulation(
+            dim,
+            proj_drop=drop,
+            focal_level=self.focal_level,
+            focal_window=self.focal_window,
+            use_postln_in_modulation=use_postln_in_modulation,
+            normalize_modulator=normalize_modulator)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+        self.H = None  # spatial size of the token grid; must be assigned by the caller before forward()
+        self.W = None
+
+        self.gamma_1 = 1.0  # plain scalars unless use_layerscale swaps in per-channel values via add_parameter
+        self.gamma_2 = 1.0
+        if self.use_layerscale:
+            self.gamma_1 = add_parameter(self,
+                                         layerscale_value * paddle.ones([dim]))
+            self.gamma_2 = add_parameter(self,
+                                         layerscale_value * paddle.ones([dim]))
+
+    def forward(self, x):
+        """ Returns a tensor of the same (B, H*W, C) size; reads self.H / self.W, which the caller sets.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+        if not self.use_postln:
+            x = self.norm1(x)
+        x = x.reshape([-1, H, W, C])
+
+        # FM
+        x = self.modulation(x).reshape([-1, H * W, C])
+        if self.use_postln:
+            x = self.norm1(x)
+
+        # residual add of the modulation branch, then the FFN (MLP) branch
+        x = shortcut + self.drop_path(self.gamma_1 * x)
+
+        if self.use_postln:
+            x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
+        else:
+            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+        return x
+
+
+class BasicLayer(nn.Layer):
+    """ A basic focal modulation layer for one stage: `depth` blocks plus an optional downsample.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float | Sequence[float] | np.ndarray): One rate for all blocks, or one per block. Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer class, built with embed_dim=2 * dim. Default: None
+        focal_level (int): Number of focal levels
+        focal_window (int): Focal window size at the first focal level
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value of layerscale
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_checkpoint (bool): Stored on the layer; forward() in this class does not read it. Default: False.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 focal_level=2,
+                 focal_window=9,
+                 use_conv_embed=False,
+                 use_layerscale=False,
+                 layerscale_value=1e-4,
+                 use_postln=False,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False,
+                 use_checkpoint=False):
+        super().__init__()
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # build blocks; any per-block sequence of rates is indexed, a scalar is shared by all blocks
+        self.blocks = nn.LayerList([
+            FocalModulationBlock(
+                dim=dim,
+                mlp_ratio=mlp_ratio,
+                drop=drop,
+                drop_path=drop_path[i]
+                if isinstance(drop_path, (list, tuple, np.ndarray)) else drop_path,
+                act_layer=nn.GELU,
+                norm_layer=norm_layer,
+                focal_level=focal_level,
+                focal_window=focal_window,
+                use_postln=use_postln,
+                use_postln_in_modulation=use_postln_in_modulation,
+                normalize_modulator=normalize_modulator,
+                use_layerscale=use_layerscale,
+                layerscale_value=layerscale_value) for i in range(depth)
+        ])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(
+                patch_size=2,
+                in_chans=dim,
+                embed_dim=2 * dim,
+                use_conv_embed=use_conv_embed,
+                norm_layer=norm_layer,
+                is_stem=False)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """ Run every block, then the optional downsample.
+        Args: x: Input feature, tensor size (B, H*W, C); H, W: spatial size of the token grid.
+        Returns: (x, H, W, x_down, Wh, Ww); the last three repeat the first three without a downsample.
+        """
+        for blk in self.blocks:
+            blk.H, blk.W = H, W  # blocks read the grid size from attributes, not from arguments
+            x = blk(x)
+
+        if self.downsample is not None:
+            x_reshaped = x.transpose([0, 2, 1]).reshape(
+                [x.shape[0], x.shape[-1], H, W])
+            x_down = self.downsample(x_reshaped)
+            x_down = x_down.flatten(2).transpose([0, 2, 1])
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2  # ceil(H / 2), ceil(W / 2)
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size; also the multiple the input is padded to. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+        use_conv_embed (bool): Use an overlapped conv (7x7/s4 stem, 3x3/s2 otherwise). Default: False
+        is_stem (bool): Is the stem block or not; only consulted when use_conv_embed is True.
+    """
+
+    def __init__(self,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 norm_layer=None,
+                 use_conv_embed=False,
+                 is_stem=False):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        if use_conv_embed:
+            # if we choose to use conv embedding, then we treat the stem and non-stem differently
+            if is_stem:
+                kernel_size = 7
+                padding = 2
+                stride = 4
+            else:
+                kernel_size = 3
+                padding = 1
+                stride = 2
+            self.proj = nn.Conv2D(
+                in_chans,
+                embed_dim,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding)
+        else:
+            self.proj = nn.Conv2D(
+                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        _, _, H, W = x.shape
+
+        # Pad on the right / bottom so that W and H become multiples of the patch size.
+        if W % self.patch_size[1] != 0:
+            # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
+            x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
+        # H and W are deliberately not updated: nothing below reads them, and the
+        # post-projection size is taken from x.shape.
+
+        x = self.proj(x)
+        if self.norm is not None:
+            _, _, Wh, Ww = x.shape
+            x = x.flatten(2).transpose([0, 2, 1])  # (B, Wh*Ww, C): the norm acts on the channel axis
+            x = self.norm(x)
+            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
+
+        return x
+
+
+@register
+@serializable
+class FocalNet(nn.Layer):
+    """ FocalNet backbone
+    Args:
+        arch (str): Key into MODEL_cfg. The preset replaces every argument marked (*) below.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode). -1 means not freezing any parameters.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): (*) Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): (*) Depths of each FocalNet Transformer stage.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop_rate (float): Dropout rate.
+        drop_path_rate (float): (*) Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        focal_levels (Sequence[int]): (*) Number of focal levels at four stages
+        focal_windows (Sequence[int]): (*) Focal window sizes at first focal level at four stages
+        use_conv_embed (bool): (*) Whether use overlapped convolution for patch embedding
+        use_layerscale (bool): (*) Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value of layerscale
+        use_postln (bool): (*) Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): (*) Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): (*) Whether use normalize in modulator
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        pretrained (str): URL or local path of weights to load; None falls back to the preset's URL.
+    """
+
+    def __init__(
+            self,
+            arch='focalnet_T_224_1k_srf',
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=-1,
+            patch_size=4,
+            in_chans=3,
+            embed_dim=96,
+            depths=[2, 2, 6, 2],
+            mlp_ratio=4.,
+            drop_rate=0.,
+            drop_path_rate=0.2,  # 0.5 better for large+ models
+            norm_layer=nn.LayerNorm,
+            patch_norm=True,
+            focal_levels=[2, 2, 2, 2],
+            focal_windows=[3, 3, 3, 3],
+            use_conv_embed=False,
+            use_layerscale=False,
+            layerscale_value=1e-4,
+            use_postln=False,
+            use_postln_in_modulation=False,
+            normalize_modulator=False,
+            use_checkpoint=False,
+            pretrained=None):
+        super(FocalNet, self).__init__()
+        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
+
+        embed_dim = MODEL_cfg[arch]['embed_dim']  # from here on the preset overrides the caller's values
+        depths = MODEL_cfg[arch]['depths']
+        drop_path_rate = MODEL_cfg[arch]['drop_path_rate']
+        focal_levels = MODEL_cfg[arch]['focal_levels']
+        focal_windows = MODEL_cfg[arch]['focal_windows']
+        use_conv_embed = MODEL_cfg[arch]['use_conv_embed']
+        use_layerscale = MODEL_cfg[arch]['use_layerscale']
+        use_postln = MODEL_cfg[arch]['use_postln']
+        use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation']
+        normalize_modulator = MODEL_cfg[arch]['normalize_modulator']
+        if pretrained is None:
+            pretrained = MODEL_cfg[arch]['pretrained']
+
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.num_layers = len(depths)
+        self.patch_norm = patch_norm
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None,
+            use_conv_embed=use_conv_embed,
+            is_stem=True)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth decay rule: one rate per block, rising linearly from 0 to drop_path_rate
+        dpr = np.linspace(0, drop_path_rate, sum(depths))
+
+        # build layers
+        self.layers = nn.LayerList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                mlp_ratio=mlp_ratio,
+                drop=drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchEmbed
+                if (i_layer < self.num_layers - 1) else None,
+                focal_level=focal_levels[i_layer],
+                focal_window=focal_windows[i_layer],
+                use_conv_embed=use_conv_embed,
+                use_layerscale=use_layerscale,
+                layerscale_value=layerscale_value,
+                use_postln=use_postln,
+                use_postln_in_modulation=use_postln_in_modulation,
+                normalize_modulator=normalize_modulator,
+                use_checkpoint=use_checkpoint)
+            self.layers.append(layer)
+
+        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_sublayer(layer_name, layer)
+
+        self.apply(self._init_weights)
+        self._freeze_stages()
+        if pretrained:
+            if 'http' in pretrained:  # URL: fetch the weights and use the returned local path
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:  # model in local path
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:  # any non-negative value freezes the patch embedding
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 2:  # additionally freezes layers[0] .. layers[frozen_stages - 2]
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.stop_gradient = True
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        x = self.patch_embed(x['image'])  # x is a dict; the image tensor is read from its 'image' key
+        B, _, Wh, Ww = x.shape
+        x = x.flatten(2).transpose([0, 2, 1])
+        x = self.pos_drop(x)
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)  # x_out: before downsample; x: input of the next stage
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose(
+                    (0, 3, 1, 2))
+                outs.append(out)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        out_strides = [4, 8, 16, 32]  # NOTE(review): hard-coded; assumes patch_size=4 and four stages - confirm
+        return [
+            ShapeSpec(
+                channels=self.num_features[i], stride=out_strides[i])
+            for i in self.out_indices
+        ]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py b/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
new file mode 100644
index 0000000..88f989a
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
@@ -0,0 +1,447 @@
+# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import KaimingNormal, Constant
+from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
+from paddle.regularizer import L2Decay
+from paddle import ParamAttr
+
+import copy
+
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+__all__ = ['PPHGNetV2']
+
+kaiming_normal_ = KaimingNormal()  # applied to Conv2D weights in PPHGNetV2._init_weights
+zeros_ = Constant(value=0.)  # constant-0 initializer (BatchNorm2D / Linear bias)
+ones_ = Constant(value=1.)  # constant-1 initializer (BatchNorm2D weight)
+
+
+class LearnableAffineBlock(nn.Layer):
+    """Learnable scalar affine transform ``scale * x + bias``.
+
+    ``scale`` and ``bias`` are shape-[1] parameters created with
+    ``ParamAttr(learning_rate=lr_mult * lab_lr)``.
+    """
+
+    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01):
+        super().__init__()
+        for name, init in (("scale", scale_value), ("bias", bias_value)):
+            param = self.create_parameter(
+                shape=[1, ],
+                default_initializer=Constant(value=init),
+                attr=ParamAttr(learning_rate=lr_mult * lab_lr))
+            setattr(self, name, param)
+            self.add_parameter(name, param)
+
+    def forward(self, x):
+        # Both parameters have shape [1].
+        return self.scale * x + self.bias
+
+
+class ConvBNAct(nn.Layer):
+    """Conv2D -> BatchNorm2D, optionally followed by ReLU and a LearnableAffineBlock.
+
+    A string ``padding`` is forwarded to the convolution unchanged; any other
+    value is replaced by ``(kernel_size - 1) // 2``. The affine block is only
+    created when both ``use_act`` and ``use_lab`` are true.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 groups=1,
+                 use_act=True,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.use_act = use_act
+        self.use_lab = use_lab
+        if not isinstance(padding, str):
+            padding = (kernel_size - 1) // 2
+        self.conv = Conv2D(
+            in_channels, out_channels, kernel_size, stride,
+            padding=padding, groups=groups,
+            weight_attr=ParamAttr(learning_rate=lr_mult), bias_attr=False)
+        # Both batch-norm parameters get an L2Decay(0.0) regularizer.
+        self.bn = BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult))
+        if use_act:
+            self.act = ReLU()
+            if use_lab:
+                self.lab = LearnableAffineBlock(lr_mult=lr_mult)
+
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        if not self.use_act:
+            return x
+        x = self.act(x)
+        return self.lab(x) if self.use_lab else x
+
+
+class LightConvBNAct(nn.Layer):
+    """Pointwise ConvBNAct (no activation) followed by a depthwise ConvBNAct.
+
+    ``stride`` and ``groups`` are accepted but not used by this block.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
+        # 1x1 projection to ``out_channels``.
+        self.conv1 = ConvBNAct(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            use_act=False,
+            **shared)
+        # One group per channel.
+        self.conv2 = ConvBNAct(
+            in_channels=out_channels, out_channels=out_channels,
+            kernel_size=kernel_size, groups=out_channels, use_act=True, **shared)
+
+    def forward(self, x):
+        return self.conv2(self.conv1(x))
+
+
+class StemBlock(nn.Layer):
+    """Stem of the network: two stride-2 3x3 convs around a two-branch middle.
+
+    After ``stem1`` the input is processed by a max-pool branch and by a
+    ``stem2a`` -> ``stem2b`` conv branch; both are concatenated on axis 1 and
+    passed through ``stem3`` and the 1x1 ``stem4``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
+        half = mid_channels // 2
+        self.stem1 = ConvBNAct(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=3,
+            stride=2,
+            **shared)
+        self.stem2a = ConvBNAct(
+            in_channels=mid_channels,
+            out_channels=half,
+            kernel_size=2,
+            stride=1,
+            padding="SAME",
+            **shared)
+        self.stem2b = ConvBNAct(
+            in_channels=half,
+            out_channels=mid_channels,
+            kernel_size=2,
+            stride=1,
+            padding="SAME",
+            **shared)
+        # Input is the concatenation of the pooled and the convolved branch.
+        self.stem3 = ConvBNAct(
+            in_channels=mid_channels * 2,
+            out_channels=mid_channels,
+            kernel_size=3,
+            stride=2,
+            **shared)
+        self.stem4 = ConvBNAct(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            **shared)
+        self.pool = nn.MaxPool2D(
+            kernel_size=2, stride=1, ceil_mode=True, padding="SAME")
+
+    def forward(self, x):
+        x = self.stem1(x)
+        conv_branch = self.stem2b(self.stem2a(x))
+        pool_branch = self.pool(x)
+        return self.stem4(self.stem3(paddle.concat([pool_branch, conv_branch], 1)))
+
+
+class HG_Block(nn.Layer):
+    """``layer_num`` stacked conv layers whose outputs are concatenated and fused.
+
+    The block input and every intermediate output are concatenated on axis 1,
+    then passed through two 1x1 ConvBNAct layers (``out_channels // 2``, then
+    ``out_channels``). With ``identity=True`` the block input is added to the
+    result, so the two must have compatible shapes.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 kernel_size=3,
+                 layer_num=6,
+                 identity=False,
+                 light_block=True,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.identity = identity
+
+        # Reference the layer class directly rather than eval()-ing its name.
+        block_cls = LightConvBNAct if light_block else ConvBNAct
+        self.layers = nn.LayerList()
+        for i in range(layer_num):
+            self.layers.append(
+                block_cls(
+                    in_channels=in_channels if i == 0 else mid_channels,
+                    out_channels=mid_channels,
+                    stride=1,
+                    kernel_size=kernel_size,
+                    use_lab=use_lab,
+                    lr_mult=lr_mult))
+        # feature aggregation
+        total_channels = in_channels + layer_num * mid_channels
+        self.aggregation_squeeze_conv = ConvBNAct(
+            in_channels=total_channels, out_channels=out_channels // 2,
+            kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult)
+        self.aggregation_excitation_conv = ConvBNAct(
+            in_channels=out_channels // 2, out_channels=out_channels,
+            kernel_size=1, stride=1, use_lab=use_lab, lr_mult=lr_mult)
+
+    def forward(self, x):
+        identity = x
+        output = [x]
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+        x = paddle.concat(output, axis=1)
+        x = self.aggregation_squeeze_conv(x)
+        x = self.aggregation_excitation_conv(x)
+        if self.identity:
+            x += identity
+        return x
+
+
+class HG_Stage(nn.Layer):
+    """Optional stride-2 depthwise downsample followed by ``block_num`` HG_Blocks.
+
+    Only the first block maps ``in_channels`` to ``out_channels``; the others keep
+    ``out_channels`` and are built with ``identity=True``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 block_num,
+                 layer_num=6,
+                 downsample=True,
+                 light_block=True,
+                 kernel_size=3,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.downsample = downsample
+        if downsample:
+            self.downsample = ConvBNAct(
+                in_channels=in_channels, out_channels=in_channels,
+                kernel_size=3, stride=2, groups=in_channels,
+                use_act=False, use_lab=use_lab, lr_mult=lr_mult)
+
+        blocks_list = [
+            HG_Block(
+                in_channels=in_channels if i == 0 else out_channels,
+                mid_channels=mid_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                layer_num=layer_num,
+                identity=i != 0,
+                light_block=light_block,
+                use_lab=use_lab,
+                lr_mult=lr_mult) for i in range(block_num)
+        ]
+        self.blocks = nn.Sequential(*blocks_list)
+
+    def forward(self, x):
+        # ``self.downsample`` is either the falsy flag or the conv layer built above.
+        if self.downsample:
+            x = self.downsample(x)
+        return self.blocks(x)
+
+
+def _freeze_norm(m: nn.BatchNorm2D):
+    """Build a fresh, non-trainable BatchNorm2D with the same ``_num_features`` as ``m``.
+
+    The replacement uses global statistics; nothing else is copied from ``m``.
+    """
+    def frozen_attr():
+        return ParamAttr(learning_rate=0., regularizer=L2Decay(0.), trainable=False)
+
+    norm = nn.BatchNorm2D(
+        m._num_features, weight_attr=frozen_attr(), bias_attr=frozen_attr(),
+        use_global_stats=True)
+    for param in norm.parameters():
+        param.stop_gradient = True
+    return norm
+
+
+def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
+    """Recursively swap each BatchNorm2D under ``model`` for ``reset_func(bn)``; return the root."""
+    if isinstance(model, nn.BatchNorm2D):
+        return reset_func(model)
+    for name, child in model.named_children():
+        replacement = reset_bn(child, reset_func)
+        if replacement is not child:
+            setattr(model, name, replacement)
+    return model
+
+
+@register
+@serializable
+class PPHGNetV2(nn.Layer):
+    """PPHGNetV2 backbone: a StemBlock followed by the HG_Stages of the chosen `arch`.
+    Args:
+        arch: str. Key into `arch_configs` ('L' or 'X'); selects stem and stage settings.
+        use_lab: boolean. Whether to use LearnableAffineBlock in network.
+        lr_mult_list: list. Learning-rate multipliers: entry 0 for the stem, i + 1 for stage i.
+        return_idx: list. Indices of the stages whose outputs `forward` returns.
+        freeze_stem_only: boolean. If False, stages 0..freeze_at are frozen as well as the stem.
+        freeze_at: int. The stem is frozen when >= 0; nothing is frozen by this option otherwise.
+        freeze_norm: boolean. Whether to replace every BatchNorm2D via reset_bn.
+    """
+
+    arch_configs = {
+        'L': {
+            'stem_channels': [3, 32, 48],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [48, 48, 128, 1, False, False, 3, 6],
+                "stage2": [128, 96, 512, 1, True, False, 3, 6],
+                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
+                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
+            }
+        },
+        'X': {
+            'stem_channels': [3, 32, 64],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [64, 64, 128, 1, False, False, 3, 6],
+                "stage2": [128, 128, 512, 2, True, False, 3, 6],
+                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
+                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
+            }
+        }
+    }
+
+    def __init__(self,
+                 arch,
+                 use_lab=False,
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 return_idx=[1, 2, 3],
+                 freeze_stem_only=True,
+                 freeze_at=0,
+                 freeze_norm=True):
+        super().__init__()
+        self.use_lab = use_lab
+        self.return_idx = return_idx
+
+        stem_channels = self.arch_configs[arch]['stem_channels']
+        stage_config = self.arch_configs[arch]['stage_config']
+
+        self._out_strides = [4, 8, 16, 32]
+        self._out_channels = [stage_config[k][2] for k in stage_config]  # entry 2 is out_channels
+
+        # stem
+        self.stem = StemBlock(
+            in_channels=stem_channels[0],
+            mid_channels=stem_channels[1],
+            out_channels=stem_channels[2],
+            use_lab=use_lab,
+            lr_mult=lr_mult_list[0])
+
+        # stages
+        self.stages = nn.LayerList()
+        for i, k in enumerate(stage_config):
+            in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[
+                k]
+            self.stages.append(
+                HG_Stage(
+                    in_channels,
+                    mid_channels,
+                    out_channels,
+                    block_num,
+                    layer_num,
+                    downsample,
+                    light_block,
+                    kernel_size,
+                    use_lab,
+                    lr_mult=lr_mult_list[i + 1]))
+
+        if freeze_at >= 0:  # the stem is always frozen in this case
+            self._freeze_parameters(self.stem)
+            if not freeze_stem_only:
+                for i in range(min(freeze_at + 1, len(self.stages))):
+                    self._freeze_parameters(self.stages[i])
+
+        if freeze_norm:
+            reset_bn(self, reset_func=_freeze_norm)  # swaps each BatchNorm2D for a frozen replacement
+
+        self._init_weights()
+
+    def _freeze_parameters(self, m):
+        for p in m.parameters():
+            p.stop_gradient = True
+
+    def _init_weights(self):
+        for m in self.sublayers():
+            if isinstance(m, nn.Conv2D):
+                kaiming_normal_(m.weight)
+            elif isinstance(m, (nn.BatchNorm2D)):
+                ones_(m.weight)
+                zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                zeros_(m.bias)
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        x = self.stem(x)
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            x = stage(x)
+            if idx in self.return_idx:  # only the selected stage outputs are returned
+                outs.append(x)
+        return outs
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py b/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
new file mode 100644
index 0000000..76da139
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
@@ -0,0 +1,271 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, Conv2D
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['LCNet']
+
+NET_CONFIG = {
+    "blocks2":
+    # each row: [kernel size, in channels, out channels, stride, use_se]
+    [[3, 16, 32, 1, False], ],
+    "blocks3": [
+        [3, 32, 64, 2, False],
+        [3, 64, 64, 1, False],
+    ],
+    "blocks4": [
+        [3, 64, 128, 2, False],
+        [3, 128, 128, 1, False],
+    ],
+    "blocks5": [
+        [3, 128, 256, 2, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+    ],
+    "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]
+}
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round ``v`` to a multiple of ``divisor``, clamped below by ``min_value`` (default: ``divisor``)."""
+    floor = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    new_v = max(floor, rounded)
+    # Step up by one ``divisor`` when the result fell below 90% of ``v``.
+    return new_v + divisor if new_v < 0.9 * v else new_v
+
+
+class ConvBNLayer(nn.Layer):
+    """Conv2D -> BatchNorm2D -> activation (``'hard_swish'`` or ``'relu6'``).
+
+    Raises:
+        ValueError: if ``act`` is not one of the two supported names.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 num_groups=1,
+                 act='hard_swish'):
+        super().__init__()
+        # Fail at construction: an unknown name used to leave ``self.act``
+        # undefined, which only surfaced as an AttributeError in ``forward``.
+        if act not in ('hard_swish', 'relu6'):
+            raise ValueError("unsupported act: {!r}".format(act))
+
+        self.conv = Conv2D(
+            in_channels=num_channels, out_channels=num_filters,
+            kernel_size=filter_size, stride=stride,
+            padding=(filter_size - 1) // 2, groups=num_groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()), bias_attr=False)
+        self.bn = nn.BatchNorm2D(
+            num_filters,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self.act = nn.Hardswish() if act == 'hard_swish' else nn.ReLU6()
+
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        return self.act(x)
+
+
+class DepthwiseSeparable(nn.Layer):
+    """Depthwise ConvBNLayer, optional SEModule, then a 1x1 pointwise ConvBNLayer.
+
+    The depthwise stage keeps ``num_channels``; only the pointwise stage changes width.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 dw_size=3,
+                 use_se=False,
+                 act='hard_swish'):
+        super().__init__()
+        self.use_se = use_se
+        # One group per input channel.
+        self.dw_conv = ConvBNLayer(
+            num_channels=num_channels, num_filters=num_channels,
+            filter_size=dw_size, stride=stride,
+            num_groups=num_channels, act=act)
+        if use_se:
+            self.se = SEModule(num_channels)
+        self.pw_conv = ConvBNLayer(
+            num_channels=num_channels, filter_size=1,
+            num_filters=num_filters, stride=1, act=act)
+
+    def forward(self, x):
+        x = self.dw_conv(x)
+        # ``self.se`` only exists when ``use_se`` was set at construction.
+        if self.use_se:
+            x = self.se(x)
+        return self.pw_conv(x)
+
+
+class SEModule(nn.Layer):
+    """Squeeze-and-excitation style channel gate.
+
+    Global average pool -> 1x1 conv (``channel // reduction``) -> ReLU -> 1x1 conv
+    -> hard-sigmoid; the result multiplies the input.
+    """
+
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        squeezed = channel // reduction
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel, out_channels=squeezed,
+            kernel_size=1, stride=1, padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
+            in_channels=squeezed, out_channels=channel,
+            kernel_size=1, stride=1, padding=0)
+        self.hardsigmoid = nn.Hardsigmoid()
+
+    def forward(self, x):
+        gate = self.avg_pool(x)
+        gate = self.conv1(gate)
+        gate = self.relu(gate)
+        gate = self.conv2(gate)
+        gate = self.hardsigmoid(gate)
+        # Re-weight the untouched input with the computed gate.
+        return paddle.multiply(x=x, y=gate)
+
+
+@register
+@serializable
+class LCNet(nn.Layer):  # conv1 followed by the blocks2..blocks6 stages from NET_CONFIG
+    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
+        super().__init__()
+        self.scale = scale
+        self.feature_maps = feature_maps
+
+        out_channels = []  # output widths of blocks3..blocks6, in that order
+
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            num_filters=make_divisible(16 * scale),
+            stride=2,
+            act=act)
+
+        self.blocks2 = nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act)
+            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"])
+        ])
+
+        self.blocks3 = nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act)
+            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"])
+        ])
+
+        out_channels.append(
+            make_divisible(NET_CONFIG["blocks3"][-1][2] * scale))
+
+        self.blocks4 = nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act)
+            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"])
+        ])
+
+        out_channels.append(
+            make_divisible(NET_CONFIG["blocks4"][-1][2] * scale))
+
+        self.blocks5 = nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act)
+            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"])
+        ])
+
+        out_channels.append(
+            make_divisible(NET_CONFIG["blocks5"][-1][2] * scale))
+
+        self.blocks6 = nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act)
+            for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"])
+        ])
+
+        out_channels.append(
+            make_divisible(NET_CONFIG["blocks6"][-1][2] * scale))
+        self._out_channels = [  # feature map id k selects the width of blocks{k + 1}
+            ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps
+        ]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        outs = []  # outputs of blocks3..blocks6, filtered by feature_maps below
+
+        x = self.conv1(x)
+        x = self.blocks2(x)
+        x = self.blocks3(x)
+        outs.append(x)
+        x = self.blocks4(x)
+        outs.append(x)
+        x = self.blocks5(x)
+        outs.append(x)
+        x = self.blocks6(x)
+        outs.append(x)
+        outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps]  # same idx + 2 rule as _out_channels
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]  # channels only; no stride is passed
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
new file mode 100644
index 0000000..a39435b
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
@@ -0,0 +1,402 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNet']
+
+
+class ConvBNLayer(nn.Layer):
+    """Conv2D -> BatchNorm2D -> optional ReLU / ReLU6 (any other ``act`` is a no-op).
+
+    Raises:
+        ValueError: if ``norm_type`` is neither ``'bn'`` nor ``'sync_bn'``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act='relu',
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):  # ``name`` is not used by this layer
+        super(ConvBNLayer, self).__init__()
+        # Fail at construction: other values used to leave ``self._batch_norm``
+        # undefined, which only surfaced as an AttributeError in ``forward``.
+        if norm_type not in ('sync_bn', 'bn'):
+            raise ValueError("unsupported norm_type: {!r}".format(norm_type))
+        self.act = act
+        self._conv = nn.Conv2D(
+            in_channels, out_channels,
+            kernel_size=kernel_size, stride=stride, padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(learning_rate=conv_lr, initializer=KaimingNormal(),
+                                  regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+        self._batch_norm = nn.BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(norm_decay)),
+            bias_attr=ParamAttr(regularizer=L2Decay(norm_decay)))
+
+    def forward(self, x):
+        x = self._batch_norm(self._conv(x))
+        if self.act == "relu":
+            return F.relu(x)
+        return F.relu6(x) if self.act == "relu6" else x
+
+
+class DepthwiseSeparable(nn.Layer):
+    """3x3 grouped ConvBNLayer followed by a 1x1 ConvBNLayer.
+
+    ``out_channels1``, ``out_channels2`` and ``num_groups`` are multiplied by
+    ``scale`` and truncated with ``int``; ``in_channels`` is used as given.
+    ``name`` may be omitted (None).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups,
+                 stride,
+                 scale,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+        # ``name`` defaults to None, for which the old ``name + "_dw"`` raised
+        # TypeError; only build the suffixed names when a prefix is given.
+        dw_name = None if name is None else name + "_dw"
+        sep_name = None if name is None else name + "_sep"
+        mid_channels = int(out_channels1 * scale)
+        shared = dict(
+            conv_lr=conv_lr, conv_decay=conv_decay,
+            norm_decay=norm_decay, norm_type=norm_type)
+
+        self._depthwise_conv = ConvBNLayer(
+            in_channels, mid_channels,
+            kernel_size=3, stride=stride, padding=1,
+            num_groups=int(num_groups * scale),
+            name=dw_name, **shared)
+        self._pointwise_conv = ConvBNLayer(
+            mid_channels, int(out_channels2 * scale),
+            kernel_size=1, stride=1, padding=0,
+            name=sep_name, **shared)
+
+    def forward(self, x):
+        x = self._depthwise_conv(x)
+        x = self._pointwise_conv(x)
+        return x
+
+
+class ExtraBlock(nn.Layer):
+    """1x1 ConvBNLayer followed by a 3x3 ConvBNLayer with ``stride``; both use ReLU6.
+
+    ``num_groups`` is applied to both convolutions and the channel counts are
+    passed through ``int``. ``name`` may be omitted (None).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups=1,
+                 stride=2,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ExtraBlock, self).__init__()
+
+        # ``name`` defaults to None, for which the old ``name + "_extra1"`` raised
+        # TypeError; only build the suffixed names when a prefix is given.
+        name1 = None if name is None else name + "_extra1"
+        name2 = None if name is None else name + "_extra2"
+        mid_channels = int(out_channels1)
+        shared = dict(
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type)
+
+        self.pointwise_conv = ConvBNLayer(
+            in_channels, mid_channels,
+            kernel_size=1, stride=1, padding=0,
+            name=name1, **shared)
+        self.normal_conv = ConvBNLayer(
+            mid_channels, int(out_channels2),
+            kernel_size=3, stride=stride, padding=1,
+            name=name2, **shared)
+
+    def forward(self, x):
+        x = self.pointwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 conv_decay=0.,
+                 scale=1,
+                 conv_learning_rate=1.0,
+                 feature_maps=[4, 6, 13],
+                 with_extra_blocks=False,
+                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
+                                      [64, 128]]):
+        super(MobileNet, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        self._out_channels = []
+
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=int(32 * scale),
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_lr=conv_learning_rate,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name="conv1")
+
+        self.dwsl = []
+        dws21 = self.add_sublayer(
+            "conv2_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(32 * scale),
+                out_channels1=32,
+                out_channels2=64,
+                num_groups=32,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_1"))
+        self.dwsl.append(dws21)
+        self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps)
+        dws22 = self.add_sublayer(
+            "conv2_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(64 * scale),
+                out_channels1=64,
+                out_channels2=128,
+                num_groups=64,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_2"))
+        self.dwsl.append(dws22)
+        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
+        # 1/4
+        dws31 = self.add_sublayer(
+            "conv3_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=128,
+                num_groups=128,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_1"))
+        self.dwsl.append(dws31)
+        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
+        dws32 = self.add_sublayer(
+            "conv3_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=256,
+                num_groups=128,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_2"))
+        self.dwsl.append(dws32)
+        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
+        # 1/8
+        dws41 = self.add_sublayer(
+            "conv4_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=256,
+                num_groups=256,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_1"))
+        self.dwsl.append(dws41)
+        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
+        dws42 = self.add_sublayer(
+            "conv4_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=512,
+                num_groups=256,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_2"))
+        self.dwsl.append(dws42)
+        self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
+        # 1/16
+        for i in range(5):
+            tmp = self.add_sublayer(
+                "conv5_" + str(i + 1),
+                sublayer=DepthwiseSeparable(
+                    in_channels=int(512 * scale),
+                    out_channels1=512,
+                    out_channels2=512,
+                    num_groups=512,
+                    stride=1,
+                    scale=scale,
+                    conv_lr=conv_learning_rate,
+                    conv_decay=conv_decay,
+                    norm_decay=norm_decay,
+                    norm_type=norm_type,
+                    name="conv5_" + str(i + 1)))
+            self.dwsl.append(tmp)
+            self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
+        dws56 = self.add_sublayer(
+            "conv5_6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(512 * scale),
+                out_channels1=512,
+                out_channels2=1024,
+                num_groups=512,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv5_6"))
+        self.dwsl.append(dws56)
+        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
+        # 1/32
+        dws6 = self.add_sublayer(
+            "conv6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(1024 * scale),
+                out_channels1=1024,
+                out_channels2=1024,
+                num_groups=1024,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv6"))
+        self.dwsl.append(dws6)
+        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_blocks = []
+            for i, block_filter in enumerate(self.extra_block_filters):
+                in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]
+                conv_extra = self.add_sublayer(
+                    "conv7_" + str(i + 1),
+                    sublayer=ExtraBlock(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        conv_lr=conv_learning_rate,
+                        conv_decay=conv_decay,
+                        norm_decay=norm_decay,
+                        norm_type=norm_type,
+                        name="conv7_" + str(i + 1)))
+                self.extra_blocks.append(conv_extra)
+                self._update_out_channels(
+                    block_filter[1],
+                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        if feature_idx in feature_maps:  # record channels of selected stages only
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Collect the outputs of stages whose 1-based index is in feature_maps."""
+        outs = []
+        y = self.conv1(inputs['image'])
+        for idx, block in enumerate(self.dwsl, 1):
+            y = block(y)
+            if idx in self.feature_maps:
+                outs.append(y)
+        if not self.with_extra_blocks:
+            return outs
+        # Extra blocks continue from the last collected map; if none was
+        # collected (outs[-1] used to raise IndexError) use the last stage output.
+        y = outs[-1] if outs else y
+        for idx, block in enumerate(self.extra_blocks, len(self.dwsl) + 1):
+            y = block(y)
+            if idx in self.feature_maps:
+                outs.append(y)
+        return outs
+
+    @property
+    def out_shape(self):  # one ShapeSpec per entry recorded in _out_channels
+        return [ShapeSpec(channels=c) for c in self._out_channels]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
new file mode 100644
index 0000000..2bd8856
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
@@ -0,0 +1,478 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNetV3']
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Snap `v` to a multiple of `divisor`, clamped from below by `min_value`."""
+    floor = divisor if min_value is None else min_value
+    rounded = max(floor, int(v + divisor / 2) // divisor * divisor)
+    # Step up one multiple when rounding lost more than 10% of `v`.
+    shortfall = rounded < 0.9 * v
+    return rounded + divisor if shortfall else rounded
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D -> BatchNorm2D -> optional relu/relu6/hard_swish."""
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act=None,
+                 lr_mult=1.,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=""):
+        super(ConvBNLayer, self).__init__()
+        if norm_type not in ('sync_bn', 'bn'):  # self.bn would be undefined
+            raise ValueError("Unsupported norm_type: {}".format(norm_type))
+        self.act = act
+        self.conv = nn.Conv2D(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+
+        # Frozen norm: zero lr, non-trainable scale/shift, global statistics.
+        norm_lr = 0. if freeze_norm else lr_mult
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=not freeze_norm)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=not freeze_norm)
+        self.bn = nn.BatchNorm2D(
+            out_c,
+            weight_attr=param_attr,
+            bias_attr=bias_attr,
+            use_global_stats=True if freeze_norm else None)
+        if freeze_norm:
+            for param in self.bn.parameters():
+                param.stop_gradient = True
+
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        if self.act is None:
+            return x
+        if self.act == "relu":
+            return F.relu(x)
+        if self.act == "relu6":
+            return F.relu6(x)
+        if self.act == "hard_swish":
+            return F.hardswish(x)
+        raise NotImplementedError(
+            "The activation function is selected incorrectly.")
+
+
+class ResidualUnit(nn.Layer):
+    """Expand (1x1) -> depthwise (k x k) -> optional SE -> linear (1x1) block.
+
+    The input is added back onto the result when `stride == 1` and
+    `in_c == out_c`. With `return_list=True`, `forward` returns
+    `[expand_output, block_output]` instead of the block output alone.
+    """
+
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None,
+                 return_list=False,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_c == out_c
+        self.use_se = use_se
+        self.return_list = return_list
+
+        # Settings forwarded unchanged to every ConvBNLayer of this unit.
+        shared = dict(
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm)
+        self.expand_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=act,
+            name=name + "_expand",
+            **shared)
+        self.bottleneck_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            num_groups=mid_c,
+            act=act,
+            name=name + "_depthwise",
+            **shared)
+        if self.use_se:
+            self.mid_se = SEModule(
+                mid_c, lr_mult, conv_decay, name=name + "_se")
+        self.linear_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            name=name + "_linear",
+            **shared)
+
+    def forward(self, inputs):
+        # Keep the expand output; it is returned too when return_list is set.
+        expanded = self.expand_conv(inputs)
+        out = self.bottleneck_conv(expanded)
+        if self.use_se:
+            out = self.mid_se(out)
+        out = self.linear_conv(out)
+        if self.if_shortcut:
+            out = paddle.add(inputs, out)
+        return [expanded, out] if self.return_list else out
+
+
+class SEModule(nn.Layer):
+    """Channel gate: avg-pool -> 1x1 conv -> relu -> 1x1 conv -> hardsigmoid."""
+
+    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
+        super(SEModule, self).__init__()
+        def _attr():
+            # A separate ParamAttr/L2Decay pair is built per weight and bias.
+            return ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay))
+
+        squeezed = int(channel // reduction)
+        self.avg_pool = nn.AdaptiveAvgPool2D(1)
+        self.conv1 = nn.Conv2D(
+            in_channels=channel,
+            out_channels=squeezed,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=_attr(),
+            bias_attr=_attr())
+        self.conv2 = nn.Conv2D(
+            in_channels=squeezed,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=_attr(),
+            bias_attr=_attr())
+
+    def forward(self, inputs):
+        gate = F.relu(self.conv1(self.avg_pool(inputs)))
+        gate = F.hardsigmoid(self.conv2(gate), slope=0.2, offset=0.5)
+        return paddle.multiply(x=inputs, y=gate)
+
+
+class ExtraBlockDW(nn.Layer):
+    """1x1 conv -> strided 3x3 grouped conv -> 1x1 conv, all BN + relu6.
+
+    The 3x3 conv uses `ch_1` groups and maps `ch_1` to `ch_2` channels.
+    `name` is a prefix for the sub-layer names; `None` is treated as "".
+    """
+
+    def __init__(self,
+                 in_c,
+                 ch_1,
+                 ch_2,
+                 stride,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None):
+        super(ExtraBlockDW, self).__init__()
+        # The default name=None used to crash on `name + "_extra1"`.
+        if name is None:
+            name = ""
+        # Settings shared by all three ConvBNLayer instances.
+        shared = dict(
+            padding='SAME',
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm)
+        self.pointwise_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=ch_1,
+            filter_size=1,
+            stride=1,
+            name=name + "_extra1",
+            **shared)
+        self.depthwise_conv = ConvBNLayer(
+            in_c=ch_1,
+            out_c=ch_2,
+            filter_size=3,
+            stride=stride,
+            num_groups=int(ch_1),
+            name=name + "_extra2_dw",
+            **shared)
+        self.normal_conv = ConvBNLayer(
+            in_c=ch_2,
+            out_c=ch_2,
+            filter_size=1,
+            stride=1,
+            name=name + "_extra2_sep",
+            **shared)
+
+    def forward(self, inputs):
+        x = self.pointwise_conv(inputs)
+        x = self.depthwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNetV3(nn.Layer):  # backbone: conv1 + ResidualUnit stack (+ extras)
+    __shared__ = ['norm_type']
+
+    def __init__(
+            self,
+            scale=1.0,
+            model_name="large",
+            feature_maps=[6, 12, 15],
+            with_extra_blocks=False,
+            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
+            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+            conv_decay=0.0,
+            multiplier=1.0,
+            norm_type='bn',
+            norm_decay=0.0,
+            freeze_norm=False):
+        super(MobileNetV3, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        if norm_type == 'sync_bn' and freeze_norm:
+            raise ValueError(
+                "The norm_type should not be sync_bn when freeze_norm is True")
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        inplanes = 16
+        if model_name == "large":
+            self.cfg = [
+                # k: kernel, exp: expand ch, c: out ch, se: use SE, nl: act, s: stride
+                [3, 16, 16, False, "relu", 1],
+                [3, 64, 24, False, "relu", 2],
+                [3, 72, 24, False, "relu", 1],
+                [5, 72, 40, True, "relu", 2],  # RCNN output
+                [5, 120, 40, True, "relu", 1],
+                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
+                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
+                [3, 200, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 480, 112, True, "hard_swish", 1],
+                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 672, 160, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 960, 160, True, "hard_swish", 1],
+                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        elif model_name == "small":
+            self.cfg = [
+                # k: kernel, exp: expand ch, c: out ch, se: use SE, nl: act, s: stride
+                [3, 16, 16, True, "relu", 2],
+                [3, 72, 24, False, "relu", 2],  # RCNN output
+                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
+                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 120, 48, True, "hard_swish", 1],
+                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 576, 96, True, "hard_swish", 1],
+                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        else:
+            raise NotImplementedError(
+                "mode[{}_model] is not implemented!".format(model_name))
+
+        if multiplier != 1.0:  # scale c of cfg[-3] and exp/c of the last two rows
+            self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
+            self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
+            self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
+            self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
+            self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
+
+        self.conv1 = ConvBNLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            act="hard_swish",
+            lr_mult=lr_mult_list[0],
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="conv1")
+
+        self._out_channels = []
+        self.block_list = []
+        i = 0
+        inplanes = make_divisible(inplanes * scale)
+        for (k, exp, c, se, nl, s) in self.cfg:
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)  # one lr entry per 3 blocks, clamped
+            lr_mult = lr_mult_list[lr_idx]
+
+            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
+            return_list = self.with_extra_blocks and i + 2 in self.feature_maps
+
+            block = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=make_divisible(scale * c),
+                    filter_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=nl,
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    return_list=return_list,
+                    name="conv" + str(i + 2)))
+            self.block_list.append(block)
+            inplanes = make_divisible(scale * c)
+            i += 1  # from here on, i + 1 is this block's "conv<N>" number
+            self._update_out_channels(
+                make_divisible(scale * exp)
+                if return_list else inplanes, i + 1, feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_block_list = []
+            extra_out_c = make_divisible(scale * self.cfg[-1][1])  # scaled exp of the last cfg row
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]  # reused for every extra block below
+
+            conv_extra = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ConvBNLayer(
+                    in_c=inplanes,
+                    out_c=extra_out_c,
+                    filter_size=1,
+                    stride=1,
+                    padding=0,
+                    num_groups=1,
+                    act="hard_swish",
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    name="conv" + str(i + 2)))
+            self.extra_block_list.append(conv_extra)
+            i += 1
+            self._update_out_channels(extra_out_c, i + 1, feature_maps)
+
+            for j, block_filter in enumerate(self.extra_block_filters):
+                in_c = extra_out_c if j == 0 else self.extra_block_filters[j -
+                                                                           1][1]
+                conv_extra = self.add_sublayer(
+                    "conv" + str(i + 2),
+                    sublayer=ExtraBlockDW(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        stride=2,
+                        lr_mult=lr_mult,
+                        conv_decay=conv_decay,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name='conv' + str(i + 2)))
+                self.extra_block_list.append(conv_extra)
+                i += 1
+                self._update_out_channels(block_filter[1], i + 1, feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        if feature_idx in feature_maps:  # record channels of selected stages only
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        x = self.conv1(inputs['image'])
+        outs = []
+        for idx, block in enumerate(self.block_list):
+            x = block(x)
+            if idx + 2 in self.feature_maps:  # idx + 2 is the "conv<N>" number
+                if isinstance(x, list):  # return_list unit: [expand_out, block_out]
+                    outs.append(x[0])
+                    x = x[1]
+                else:
+                    outs.append(x)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        for i, block in enumerate(self.extra_block_list):
+            idx = i + len(self.block_list)  # numbering continues after block_list
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):  # one ShapeSpec per entry recorded in _out_channels
+        return [ShapeSpec(channels=c) for c in self._out_channels]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py b/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
new file mode 100644
index 0000000..e548bad
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
+Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
+The copyright of microsoft/Swin-Transformer is as follows:
+MIT License [see LICENSE for details]
+"""
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Normal, Constant
+
+from ppdet.modeling.ops import get_act_fn
+from ppdet.modeling.layers import ConvNormLayer
+
+
+class MobileOneBlock(nn.Layer):  # multi-branch depthwise stage + pointwise stage
+    def __init__(
+            self,
+            ch_in,
+            ch_out,
+            stride,
+            kernel_size,
+            conv_num=1,
+            norm_type='bn',
+            norm_decay=0.,
+            norm_groups=32,
+            bias_on=False,
+            lr_scale=1.,
+            freeze_norm=False,
+            initializer=Normal(
+                mean=0., std=0.01),
+            skip_quant=False,
+            act='relu', ):
+        super(MobileOneBlock, self).__init__()
+
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = (kernel_size - 1) // 2
+        self.k = conv_num  # number of parallel conv branches in each stage
+
+        self.depth_conv = nn.LayerList()
+        self.point_conv = nn.LayerList()
+        for _ in range(self.k):
+            self.depth_conv.append(
+                ConvNormLayer(
+                    ch_in,
+                    ch_in,
+                    kernel_size,
+                    stride=stride,
+                    groups=ch_in,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    norm_groups=norm_groups,
+                    bias_on=bias_on,
+                    lr_scale=lr_scale,
+                    freeze_norm=freeze_norm,
+                    initializer=initializer,
+                    skip_quant=skip_quant))
+            self.point_conv.append(
+                ConvNormLayer(
+                    ch_in,
+                    ch_out,
+                    1,
+                    stride=1,
+                    groups=1,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    norm_groups=norm_groups,
+                    bias_on=bias_on,
+                    lr_scale=lr_scale,
+                    freeze_norm=freeze_norm,
+                    initializer=initializer,
+                    skip_quant=skip_quant))
+        self.rbr_1x1 = ConvNormLayer(
+            ch_in,
+            ch_in,
+            1,
+            stride=self.stride,
+            groups=ch_in,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            norm_groups=norm_groups,
+            bias_on=bias_on,
+            lr_scale=lr_scale,
+            freeze_norm=freeze_norm,
+            initializer=initializer,
+            skip_quant=skip_quant)
+        self.rbr_identity_st1 = nn.BatchNorm2D(  # BN-only skip branch, or None
+            num_features=ch_in,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(
+                0.0))) if ch_in == ch_out and self.stride == 1 else None
+        self.rbr_identity_st2 = nn.BatchNorm2D(  # BN-only skip branch, or None
+            num_features=ch_out,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(
+                0.0))) if ch_in == ch_out and self.stride == 1 else None
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+
+    def forward(self, x):
+        if hasattr(self, "conv1") and hasattr(self, "conv2"):  # fused (deploy) path
+            y = self.act(self.conv2(self.act(self.conv1(x))))
+        else:
+            if self.rbr_identity_st1 is None:
+                id_out_st1 = 0
+            else:
+                id_out_st1 = self.rbr_identity_st1(x)
+
+            x1_1 = 0
+            for i in range(self.k):  # sum of the parallel depthwise branches
+                x1_1 += self.depth_conv[i](x)
+
+            x1_2 = self.rbr_1x1(x)
+            x1 = self.act(x1_1 + x1_2 + id_out_st1)
+
+            if self.rbr_identity_st2 is None:
+                id_out_st2 = 0
+            else:
+                id_out_st2 = self.rbr_identity_st2(x1)
+
+            x2_1 = 0
+            for i in range(self.k):  # sum of the parallel pointwise branches
+                x2_1 += self.point_conv[i](x1)
+            y = self.act(x2_1 + id_out_st2)
+
+        return y
+
+    def convert_to_deploy(self):  # fold all branches into conv1/conv2, then drop them
+        if not hasattr(self, 'conv1'):
+            self.conv1 = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_in,
+                kernel_size=self.kernel_size,
+                stride=self.stride,
+                padding=self.padding,
+                groups=self.ch_in,
+                bias_attr=ParamAttr(
+                    initializer=Constant(value=0.), learning_rate=1.))
+        if not hasattr(self, 'conv2'):
+            self.conv2 = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=1,
+                stride=1,
+                padding='SAME',
+                groups=1,
+                bias_attr=ParamAttr(
+                    initializer=Constant(value=0.), learning_rate=1.))
+
+        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
+        )
+        self.conv1.weight.set_value(conv1_kernel)
+        self.conv1.bias.set_value(conv1_bias)
+        self.conv2.weight.set_value(conv2_kernel)
+        self.conv2.bias.set_value(conv2_bias)
+        self.__delattr__('depth_conv')  # a second call would fail: branches are gone
+        self.__delattr__('point_conv')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity_st1'):
+            self.__delattr__('rbr_identity_st1')
+        if hasattr(self, 'rbr_identity_st2'):
+            self.__delattr__('rbr_identity_st2')
+
+    def get_equivalent_kernel_bias(self):  # -> (conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)
+        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
+        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
+            self.rbr_identity_st1, kernel_size=self.kernel_size)  # NOTE(review): kernel_size == 1 builds a dense identity here -- confirm
+
+        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
+        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
+            self.rbr_identity_st2, kernel_size=1)
+
+        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            st1_kernel1x1) + st1_kernelid
+
+        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
+
+        conv2_kernel = st2_kernel1x1 + st2_kernelid
+        conv2_bias = st2_bias1x1 + st2_biasid
+
+        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):  # pads by (kernel_size - 1) // 2 per side, despite the name
+        if kernel1x1 is None:
+            return 0
+        else:
+            padding_size = (self.kernel_size - 1) // 2
+            return nn.functional.pad(
+                kernel1x1,
+                [padding_size, padding_size, padding_size, padding_size])
+
+    def _fuse_bn_tensor(self, branch, kernel_size=3):  # -> (fused kernel, fused bias); (0, 0) for None
+        if branch is None:
+            return 0, 0
+
+        if isinstance(branch, nn.LayerList):  # fuse each branch, then sum them
+            fused_kernels = []
+            fused_bias = []
+            for block in branch:
+                kernel = block.conv.weight
+                running_mean = block.norm._mean
+                running_var = block.norm._variance
+                gamma = block.norm.weight
+                beta = block.norm.bias
+                eps = block.norm._epsilon
+
+                std = (running_var + eps).sqrt()
+                t = (gamma / std).reshape((-1, 1, 1, 1))  # per-output-channel scale
+
+                fused_kernels.append(kernel * t)
+                fused_bias.append(beta - running_mean * gamma / std)
+
+            return sum(fused_kernels), sum(fused_bias)
+
+        elif isinstance(branch, ConvNormLayer):
+            kernel = branch.conv.weight
+            running_mean = branch.norm._mean
+            running_var = branch.norm._variance
+            gamma = branch.norm.weight
+            beta = branch.norm.bias
+            eps = branch.norm._epsilon
+        else:
+            assert isinstance(branch, nn.BatchNorm2D)
+            input_dim = self.ch_in if kernel_size == 1 else 1  # kernel is [ch_in, input_dim, k, k]
+            kernel_value = paddle.zeros(
+                shape=[self.ch_in, input_dim, kernel_size, kernel_size],
+                dtype='float32')
+            if kernel_size > 1:
+                for i in range(self.ch_in):  # a single 1 at the kernel centre
+                    kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
+                        kernel_size - 1) // 2] = 1
+            elif kernel_size == 1:
+                for i in range(self.ch_in):  # identity over channels
+                    kernel_value[i, i % input_dim, 0, 0] = 1
+            else:
+                raise ValueError("Invalid kernel size recieved!")
+            kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
+            running_mean = branch._mean
+            running_var = branch._variance
+            gamma = branch.weight
+            beta = branch.bias
+            eps = branch._epsilon
+
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape((-1, 1, 1, 1))  # per-output-channel scale
+
+        return kernel * t, beta - running_mean * gamma / std
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py b/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
new file mode 100644
index 0000000..4afbb9b
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
@@ -0,0 +1,69 @@
+class NameAdapter(object):
+    """Translate backbone layer names to those used by pretrained weights."""
+
+    def __init__(self, model):
+        super(NameAdapter, self).__init__()
+        self.model = model
+
+    @property
+    def model_type(self):
+        # '' when the wrapped model does not define `_model_type`.
+        return getattr(self.model, '_model_type', '')
+
+    @property
+    def variant(self):
+        # '' when the wrapped model does not define `variant`.
+        return getattr(self.model, 'variant', '')
+
+    def fix_conv_norm_name(self, name):
+        """Return the norm-layer name that pairs with conv layer `name`."""
+        if self.model_type == 'SEResNeXt':
+            # SEResNeXt weights simply append a suffix to the conv name.
+            return name + "_bn"
+        if name == "conv1":
+            return "bn_" + name
+        # Drop the first three characters of `name` and prefix "bn".
+        return "bn" + name[3:]
+
+    def fix_shortcut_name(self, name):
+        """Return the shortcut (projection) layer name for `name`."""
+        if self.model_type != 'SEResNeXt':
+            return name
+        return 'conv' + name + '_prj'
+
+    def fix_bottleneck_name(self, name):
+        """Return (conv1, conv2, conv3, shortcut) names of a bottleneck."""
+        if self.model_type == 'SEResNeXt':
+            stem = 'conv' + name
+            return stem + '_x1', stem + '_x2', stem + '_x3', name
+        branches = tuple(name + "_branch2" + tag for tag in "abc")
+        return branches + (name + "_branch1", )
+
+    def fix_basicblock_name(self, name):
+        """Return (conv1, conv2, shortcut) names of a basic block."""
+        if self.model_type == 'SEResNeXt':
+            stem = 'conv' + name
+            return stem + '_x1', stem + '_x2', name
+        branches = tuple(name + "_branch2" + tag for tag in "ab")
+        return branches + (name + "_branch1", )
+
+    def fix_layer_warp_name(self, stage_num, count, i):
+        """Return the name of the `i`-th block (0-based) in stage `stage_num`.
+
+        `count` is the number of blocks in the stage; stage 4 with more than
+        ten blocks is named "a", "b1", "b2", ... instead of "a", "b", "c", ...
+        """
+        prefix = 'res' + str(stage_num)
+        if count > 10 and stage_num == 4:
+            suffix = "a" if i == 0 else "b" + str(i)
+        else:
+            suffix = chr(ord("a") + i)
+        conv_name = prefix + suffix
+        if self.model_type == 'SEResNeXt':
+            # SEResNeXt uses "<stage_num + 2>_<i + 1>" instead.
+            conv_name = str(stage_num + 2) + '_' + str(i + 1)
+        return conv_name
+
+    def fix_c1_stage_name(self):
+        """Return the stem conv name ("res_conv1" for ResNeXt models)."""
+        return "res_conv1" if self.model_type == 'ResNeXt' else "conv1"
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/resnet.py b/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
new file mode 100755
index 0000000..84e362a
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
@@ -0,0 +1,611 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import math
+from numbers import Integral
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Uniform
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+from paddle.vision.ops import DeformConv2D
+from .name_adapter import NameAdapter
+from ..shape_spec import ShapeSpec
+
+__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
+
+ResNet_cfg = {
+    18: [2, 2, 2, 2],
+    34: [3, 4, 6, 3],
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    152: [3, 8, 36, 3],
+}
+
+
+class ConvNormLayer(nn.Layer):
+    """Convolution + BatchNorm2D + optional activation.
+
+    With `dcn_v2=True` the convolution is a `DeformConv2D` whose offsets and
+    (sigmoid) mask come from an extra zero-initialised `conv_offset` layer.
+    `act` is the name of a function in `paddle.nn.functional`, or None.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride,
+                 groups=1,
+                 act=None,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 lr=1.0,
+                 dcn_v2=False):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn']
+        self.norm_type = norm_type
+        self.act = act
+        self.dcn_v2 = dcn_v2
+
+        pad = (filter_size - 1) // 2
+        if self.dcn_v2:
+            # Per kernel tap: two offset channels and one mask channel.
+            self.offset_channel = 2 * filter_size**2
+            self.mask_channel = filter_size**2
+
+            self.conv_offset = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=3 * filter_size**2,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=pad,
+                weight_attr=ParamAttr(initializer=Constant(0.)),
+                bias_attr=ParamAttr(initializer=Constant(0.)))
+            conv_cls, extra = DeformConv2D, dict(dilation=1)
+        else:
+            conv_cls, extra = nn.Conv2D, {}
+        self.conv = conv_cls(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=pad,
+            groups=groups,
+            weight_attr=ParamAttr(learning_rate=lr),
+            bias_attr=False,
+            **extra)
+
+        # Frozen norm: zero lr, non-trainable params, use_global_stats=True.
+        norm_lr = 0. if freeze_norm else lr
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=not freeze_norm)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=not freeze_norm)
+
+        if norm_type in ['sync_bn', 'bn']:
+            self.norm = nn.BatchNorm2D(
+                ch_out,
+                weight_attr=param_attr,
+                bias_attr=bias_attr,
+                use_global_stats=True if freeze_norm else None)
+        norm_params = self.norm.parameters()
+
+        if freeze_norm:
+            for param in norm_params:
+                param.stop_gradient = True
+
+    def forward(self, inputs):
+        if self.dcn_v2:
+            offset, mask = paddle.split(
+                self.conv_offset(inputs),
+                num_or_sections=[self.offset_channel, self.mask_channel],
+                axis=1)
+            out = self.conv(inputs, offset, mask=F.sigmoid(mask))
+        else:
+            out = self.conv(inputs)
+
+        if self.norm_type in ['bn', 'sync_bn']:
+            out = self.norm(out)
+        if self.act:
+            out = getattr(F, self.act)(out)
+        return out
+
+
+class SELayer(nn.Layer):
+    """Squeeze-and-excitation gate: pool -> Linear -> relu -> Linear -> sigmoid."""
+
+    def __init__(self, ch, reduction_ratio=16):
+        super(SELayer, self).__init__()
+        # Width of the hidden layer between the two Linear layers.
+        hidden = ch // reduction_ratio
+        self.pool = nn.AdaptiveAvgPool2D(1)
+        self.squeeze = self._fc(ch, hidden)
+        self.extract = self._fc(hidden, ch)
+
+    @staticmethod
+    def _fc(fan_in, fan_out):
+        # Linear layer with bias whose weights are initialised from
+        # Uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)).
+        bound = 1.0 / math.sqrt(fan_in)
+        return nn.Linear(
+            fan_in,
+            fan_out,
+            weight_attr=paddle.ParamAttr(initializer=Uniform(-bound, bound)),
+            bias_attr=True)
+
+    def forward(self, inputs):
+        gate = paddle.squeeze(self.pool(inputs), axis=[2, 3])
+        gate = F.relu(self.squeeze(gate))
+        gate = F.sigmoid(self.extract(gate))
+        # Re-insert the two squeezed axes before scaling the input.
+        gate = paddle.unsqueeze(gate, axis=[2, 3])
+        return gate * inputs
+
+
+class BasicBlock(nn.Layer):
+    """Residual block made of two 3x3 ConvNormLayers.
+
+    `shortcut=True` adds the input unchanged; otherwise the input goes
+    through a 1x1 ConvNormLayer (preceded by a 2x2 average pool when
+    `variant == 'd'` and `stride == 2`) before the addition.
+    `dcn_v2` only affects the second conv, and `std_senet=True` applies an
+    SELayer to the conv branch before the addition. `groups` and
+    `base_width` must keep their defaults (asserted).
+    """
+
+    expansion = 1
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BasicBlock, self).__init__()
+        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'
+
+        # Keyword arguments shared by every ConvNormLayer in this block.
+        common = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # The pool downsamples, so the 1x1 conv keeps stride 1.
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out,
+                        filter_size=1,
+                        stride=1,
+                        **common))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    filter_size=1,
+                    stride=stride,
+                    **common)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=stride,
+            act='relu',
+            **common)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=ch_out,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            act=None,
+            dcn_v2=dcn_v2,
+            **common)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(ch_out)
+
+    def forward(self, inputs):
+        residual = self.branch2a(inputs)
+        residual = self.branch2b(residual)
+        if self.std_senet:
+            residual = self.se(residual)
+
+        identity = inputs if self.shortcut else self.short(inputs)
+
+        out = paddle.add(x=residual, y=identity)
+        return F.relu(out)
+
+
+class BottleNeck(nn.Layer):
+    """1x1 -> 3x3 -> 1x1 residual block whose output has 4 * ch_out channels.
+
+    Variant 'a' strides in the first 1x1 conv; other variants stride in the
+    3x3 conv. `groups` and `base_width` set the inner width (ResNeXt style).
+    When `shortcut` is False the input is projected by a 1x1 ConvNormLayer,
+    preceded by a 2x2 average pool for variant 'd' with stride 2.
+    `dcn_v2` only affects the 3x3 conv, and `std_senet=True` applies an
+    SELayer to the conv branch before the addition.
+    """
+
+    expansion = 4
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=4,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BottleNeck, self).__init__()
+        # Only variant 'a' downsamples in the first conv.
+        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)
+
+        # ResNeXt
+        width = int(ch_out * (base_width / 64.)) * groups
+        # Channels produced by the block and by its projection shortcut.
+        out_ch = ch_out * self.expansion
+
+        # Keyword arguments shared by every ConvNormLayer in this block.
+        common = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        # 1x1 conv down to the inner width.
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=width,
+            filter_size=1,
+            stride=stride1,
+            groups=1,
+            act='relu',
+            **common)
+
+        # 3x3 conv, grouped and optionally deformable.
+        self.branch2b = ConvNormLayer(
+            ch_in=width,
+            ch_out=width,
+            filter_size=3,
+            stride=stride2,
+            groups=groups,
+            act='relu',
+            dcn_v2=dcn_v2,
+            **common)
+
+        # 1x1 conv up to the output width; no activation before the add.
+        self.branch2c = ConvNormLayer(
+            ch_in=width,
+            ch_out=out_ch,
+            filter_size=1,
+            stride=1,
+            groups=1,
+            **common)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # The pool downsamples, so the 1x1 conv keeps stride 1.
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=out_ch,
+                        filter_size=1,
+                        stride=1,
+                        **common))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=out_ch,
+                    filter_size=1,
+                    stride=stride,
+                    **common)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(out_ch)
+
+    def forward(self, inputs):
+        residual = inputs
+        for conv in (self.branch2a, self.branch2b, self.branch2c):
+            residual = conv(residual)
+
+        if self.std_senet:
+            residual = self.se(residual)
+
+        # Identity shortcut, or the projection built in __init__.
+        identity = inputs if self.shortcut else self.short(inputs)
+
+        out = paddle.add(x=residual, y=identity)
+        return F.relu(out)
+
+
+class Blocks(nn.Layer):
+    """One ResNet stage: `count` residual blocks applied in sequence.
+
+    The first block projects its shortcut and, except in stage 2, has
+    stride 2; the remaining blocks use stride 1 and an identity shortcut.
+    """
+
+    def __init__(self,
+                 block,
+                 ch_in,
+                 ch_out,
+                 count,
+                 name_adapter,
+                 stage_num,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(Blocks, self).__init__()
+        self.blocks = []
+        for i in range(count):
+            layer = self.add_sublayer(
+                name_adapter.fix_layer_warp_name(stage_num, count, i),
+                block(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    stride=2 if i == 0 and stage_num != 2 else 1,
+                    shortcut=i != 0,
+                    variant=variant, groups=groups, base_width=base_width,
+                    lr=lr, norm_type=norm_type, norm_decay=norm_decay,
+                    freeze_norm=freeze_norm, dcn_v2=dcn_v2,
+                    std_senet=std_senet))
+            self.blocks.append(layer)
+            if i == 0:
+                # Later blocks consume the first block's output width.
+                ch_in = ch_out * block.expansion
+
+    def forward(self, inputs):
+        out = inputs
+        for layer in self.blocks:
+            out = layer(out)
+        return out
+
+
+@register
+@serializable
+class ResNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 depth=50,
+                 ch_in=64,
+                 variant='b',
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
+                 groups=1,
+                 base_width=64,
+                 norm_type='bn',
+                 norm_decay=0,
+                 freeze_norm=True,
+                 freeze_at=0,
+                 return_idx=[0, 1, 2, 3],
+                 dcn_v2_stages=[-1],
+                 num_stages=4,
+                 std_senet=False,
+                 freeze_stem_only=False):
+        """
+        Residual Network, see https://arxiv.org/abs/1512.03385
+
+        Args:
+            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
+            ch_in (int): output channel of first stage, default 64
+            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
+            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
+                                 lower learning rate ratio is needed for pretrained model
+                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
+            groups (int): group convolution cardinality
+            base_width (int): base width of each group convolution
+            norm_type (str): normalization type, 'bn' or 'sync_bn'
+            norm_decay (float): weight decay for normalization layer weights
+            freeze_norm (bool): freeze normalization layers
+            freeze_at (int): freeze the stem and stages 0..freeze_at (inclusive);
+                             a negative value freezes nothing
+            return_idx (list): index of the stages whose feature maps are returned
+            dcn_v2_stages (list): index of stages who select deformable conv v2
+            num_stages (int): total num of stages
+            std_senet (bool): whether use senet, default False.
+            freeze_stem_only (bool): with freeze_at >= 0, freeze only the stem
+                                     and leave every residual stage trainable
+        """
+        super(ResNet, self).__init__()
+        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
+        assert num_stages >= 1 and num_stages <= 4
+        self.depth = depth
+        self.variant = variant
+        self.groups = groups
+        self.base_width = base_width
+        self.norm_type = norm_type
+        self.norm_decay = norm_decay
+        self.freeze_norm = freeze_norm
+        self.freeze_at = freeze_at
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+        assert max(return_idx) < num_stages, \
+            'the maximum return index must smaller than num_stages, ' \
+            'but received maximum return index is {} and num_stages ' \
+            'is {}'.format(max(return_idx), num_stages)
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        assert len(lr_mult_list) == 4, \
+            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
+        # A bare int is accepted as shorthand for a one-element list.
+        if isinstance(dcn_v2_stages, Integral):
+            dcn_v2_stages = [dcn_v2_stages]
+        assert max(dcn_v2_stages) < num_stages
+        self.dcn_v2_stages = dcn_v2_stages
+
+        block_nums = ResNet_cfg[depth]
+        na = NameAdapter(self)
+
+        conv1_name = na.fix_c1_stage_name()
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, conv1_name]]
+        self.conv1 = nn.Sequential()
+        for (c_in, c_out, k, s, _name) in conv_def:
+            self.conv1.add_sublayer(
+                _name,
+                ConvNormLayer(
+                    ch_in=c_in,
+                    ch_out=c_out,
+                    filter_size=k,
+                    stride=s,
+                    groups=1,
+                    act='relu',
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=1.0))
+
+        self.ch_in = ch_in
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+
+        self._out_channels = [block.expansion * v for v in ch_out_list]
+        self._out_strides = [4, 8, 16, 32]
+
+        self.res_layers = []
+        for i in range(num_stages):
+            lr_mult = lr_mult_list[i]
+            stage_num = i + 2
+            res_name = "res{}".format(stage_num)
+            res_layer = self.add_sublayer(
+                res_name,
+                Blocks(
+                    block,
+                    self.ch_in,
+                    ch_out_list[i],
+                    count=block_nums[i],
+                    name_adapter=na,
+                    stage_num=stage_num,
+                    variant=variant,
+                    groups=groups,
+                    base_width=base_width,
+                    lr=lr_mult,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    dcn_v2=(i in self.dcn_v2_stages),
+                    std_senet=std_senet))
+            self.res_layers.append(res_layer)
+            self.ch_in = self._out_channels[i]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            if not freeze_stem_only:
+                for i in range(min(freeze_at + 1, num_stages)):
+                    self._freeze_parameters(self.res_layers[i])
+
+    def _freeze_parameters(self, m):
+        for p in m.parameters():
+            p.stop_gradient = True
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        x = inputs['image']
+        conv1 = self.conv1(x)
+        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
+
+
+@register
+class Res5Head(nn.Layer):
+    """Head running a three-block fifth ResNet stage on its input features."""
+
+    def __init__(self, depth=50):
+        super(Res5Head, self).__init__()
+        feat_out = 512
+        if depth >= 50:
+            block, feat_in = BottleNeck, 1024
+        else:
+            block, feat_in = BasicBlock, 256
+        self.res5 = Blocks(
+            block, feat_in, feat_out, count=3,
+            name_adapter=NameAdapter(self), stage_num=5)
+        self.feat_out = feat_out * block.expansion
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=self.feat_out, stride=16)]
+
+    def forward(self, roi_feat, stage=0):
+        return self.res5(roi_feat)
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py b/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
new file mode 100644
index 0000000..ca7ebb9
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
@@ -0,0 +1,250 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
+from paddle.nn.initializer import KaimingNormal
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+from ppdet.modeling.ops import channel_shuffle
+
+__all__ = ['ShuffleNetV2']
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D + BatchNorm2D + optional activation named by `act`."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+            bias_attr=False)
+
+        self._batch_norm = BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        # "hard_swish" is mapped to the name that forward looks up on F.
+        self.act = 'hardswish' if act == "hard_swish" else act
+
+    def forward(self, inputs):
+        y = self._batch_norm(self._conv(inputs))
+        if not self.act:
+            return y
+        return getattr(F, self.act)(y)
+
+
+class InvertedResidual(nn.Layer):
+    """ShuffleNetV2 unit: half the channels pass through, half are convolved."""
+
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidual, self).__init__()
+        half = out_channels // 2
+        self._conv_pw = ConvBNLayer(
+            in_channels=in_channels // 2,
+            out_channels=half,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # groups == channels makes this 3x3 conv depthwise.
+        self._conv_dw = ConvBNLayer(
+            in_channels=half,
+            out_channels=half,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=half,
+            act=None)
+        self._conv_linear = ConvBNLayer(
+            in_channels=half,
+            out_channels=half,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        half = inputs.shape[1] // 2
+        x1, x2 = paddle.split(inputs, num_or_sections=[half, half], axis=1)
+        for conv in (self._conv_pw, self._conv_dw, self._conv_linear):
+            x2 = conv(x2)
+        return channel_shuffle(paddle.concat([x1, x2], axis=1), 2)
+
+
+class InvertedResidualDS(nn.Layer):
+    """ShuffleNetV2 unit with two conv branches, concatenated then shuffled."""
+
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidualDS, self).__init__()
+        half = out_channels // 2
+
+        # branch1: depthwise 3x3 -> 1x1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None)
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=half,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # branch2: 1x1 -> depthwise 3x3 -> 1x1
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=half,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=half,
+            out_channels=half,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=half,
+            act=None)
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=half,
+            out_channels=half,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        x1 = self._conv_linear_1(self._conv_dw_1(inputs))
+        x2 = inputs
+        for conv in (self._conv_pw_2, self._conv_dw_2, self._conv_linear_2):
+            x2 = conv(x2)
+        return channel_shuffle(paddle.concat([x1, x2], axis=1), 2)
+
+
+@register
+@serializable
+class ShuffleNetV2(nn.Layer):
+    """ShuffleNetV2 backbone; `feature_maps` lists 1-based layers (stem is 1)."""
+
+    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
+        super(ShuffleNetV2, self).__init__()
+        self.scale = scale
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        stage_repeats = [4, 8, 4]
+
+        # (scale, channels) pairs; only channels[1:5] are read below.
+        scale_table = (
+            (0.25, [-1, 24, 24, 48, 96, 512]),
+            (0.33, [-1, 24, 32, 64, 128, 512]),
+            (0.5, [-1, 24, 48, 96, 192, 1024]),
+            (1.0, [-1, 24, 116, 232, 464, 1024]),
+            (1.5, [-1, 24, 176, 352, 704, 1024]),
+            (2.0, [-1, 24, 244, 488, 976, 2048]), )
+        for known_scale, stage_out_channels in scale_table:
+            if scale == known_scale:
+                break
+        else:
+            raise NotImplementedError("This scale size:[" + str(scale) +
+                                      "] is not implemented!")
+        self._out_channels = []
+        self._feature_idx = 0
+        # 1. conv1
+        self._conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act)
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self._feature_idx += 1
+
+        # 2. bottleneck sequences
+        self._block_list = []
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            stage_in = stage_out_channels[stage_id + 1]
+            stage_out = stage_out_channels[stage_id + 2]
+            for i in range(num_repeat):
+                # The first unit of each stage is the stride-2 variant.
+                if i == 0:
+                    unit = InvertedResidualDS(
+                        in_channels=stage_in,
+                        out_channels=stage_out,
+                        stride=2,
+                        act=act)
+                else:
+                    unit = InvertedResidual(
+                        in_channels=stage_out,
+                        out_channels=stage_out,
+                        stride=1,
+                        act=act)
+                block = self.add_sublayer(
+                    name=str(stage_id + 2) + '_' + str(i + 1), sublayer=unit)
+                self._block_list.append(block)
+                self._feature_idx += 1
+                self._update_out_channels(stage_out, self._feature_idx,
+                                          self.feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        y = self._max_pool(self._conv1(inputs['image']))
+        outs = []
+        for layer_idx, unit in enumerate(self._block_list, start=2):
+            y = unit(y)
+            if layer_idx in self.feature_maps:
+                outs.append(y)
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py b/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
new file mode 100644
index 0000000..64aabab
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
@@ -0,0 +1,752 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+The copyright of microsoft/Swin-Transformer is as follows:
+MIT License [see LICENSE for details]
+"""
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+from .transformer_utils import DropPath, Identity
+from .transformer_utils import add_parameter, to_2tuple
+from .transformer_utils import ones_, zeros_, trunc_normal_
+
+__all__ = ['SwinTransformer']
+
+MODEL_cfg = {  # per-architecture hyper-parameters, looked up by SwinTransformer via its `arch` argument
+    # Presets keyed by arch name; the default 'pretrained' URLs are 22kto1k fine-tuned weights and can be overridden via SwinTransformer.pretrained in the config.
+    'swin_T_224': dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_S_224': dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_B_224': dict(
+        pretrain_img_size=224,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_L_224': dict(
+        pretrain_img_size=224,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_B_384': dict(
+        pretrain_img_size=384,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
+    ),
+    'swin_L_384': dict(
+        pretrain_img_size=384,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
+    ),
+}
+
+
+class Mlp(nn.Layer):  # feed-forward block: fc1 -> act -> dropout -> fc2 -> dropout
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features  # falls back to in_features when omitted (or falsy)
+        hidden_features = hidden_features or in_features  # same fallback for the hidden width
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()  # act_layer is a class/factory; it is instantiated with no arguments
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)  # one Dropout layer, reused after the activation and after fc2
+
+    def forward(self, x):  # applies the layers in the order listed on the class line and returns the result
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def window_partition(x, window_size):
+    """Split a feature map into non-overlapping window_size x window_size windows.
+    Args:
+        x: (B, H, W, C); the reshape below uses H // window_size and W // window_size, so H and W are expected to be multiples of window_size
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape  # B is not used afterwards; the batch axis is inferred through -1 in the reshape
+    x = x.reshape(
+        [-1, H // window_size, window_size, W // window_size, window_size, C])
+    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [-1, window_size, window_size, C])  # bring the two window-grid axes together, then fold them into axis 0
+    return windows
+
+
+def window_reverse(windows, window_size, H, W):
+    """Merge windows back into a feature map of spatial size (H, W).
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    _, _, _, C = windows.shape
+    B = int(windows.shape[0] / (H * W / window_size / window_size))  # NOTE(review): B is computed but never used below; the reshapes infer the batch axis via -1
+    x = windows.reshape(
+        [-1, H // window_size, W // window_size, window_size, window_size, C])
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])  # interleave grid and in-window axes, then flatten to H and W
+    return x
+
+
+class WindowAttention(nn.Layer):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5  # a qk_scale of None (or 0) falls back to head_dim ** -0.5
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = add_parameter(
+            self,
+            paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                          num_heads)))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(self.window_size[0])
+        coords_w = paddle.arange(self.window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+        coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
+        coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
+        relative_coords = coords_flatten_1 - coords_flatten_2  # 2, Wh*Ww, Wh*Ww: pairwise row/column offsets
+        relative_coords = relative_coords.transpose(
+            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[
+            0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1  # row offset becomes the major index so row + col is a unique row of the bias table
+        self.relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)  # single projection producing q, k and v together
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table)  # the table was created as zeros above; initialised here by the trunc_normal_ helper
+        self.softmax = nn.Softmax(axis=-1)
+
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        """
+        B_, N, C = x.shape  # B_ is only referenced by the commented-out line further down; reshapes use -1
+        qkv = self.qkv(x).reshape(
+            [-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
+                [2, 0, 3, 1, 4])  # 3, B_, nH, N, C // nH
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))  # B_, nH, N, N attention logits
+
+        index = self.relative_position_index.flatten()
+
+        relative_position_bias = paddle.index_select(
+            self.relative_position_bias_table, index)  # gather one table row per (query, key) pair
+        relative_position_bias = relative_position_bias.reshape([
+            self.window_size[0] * self.window_size[1],
+            self.window_size[0] * self.window_size[1], -1
+        ])  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.transpose(
+            [2, 0, 1])  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)  # leading axis of size 1 broadcasts over the window/batch axis
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.reshape([-1, nW, self.num_heads, N, N
+                                 ]) + mask.unsqueeze(1).unsqueeze(0)  # mask becomes 1, nW, 1, N, N: shared across batch and heads
+            attn = attn.reshape([-1, self.num_heads, N, N])
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        # x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
+        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])  # merge the heads back into the channel axis
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Layer):
+    """ Swin Transformer Block.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=7,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()  # no-op layer when the rate is 0
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+
+        self.H = None  # spatial size of the token grid; must be assigned by the caller before forward()
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Not arguments; the spatial resolution is read from self.H / self.W, which must be set before the call.
+            mask_matrix: Attention mask for cyclic shift.
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x  # kept for the first residual connection
+        x = self.norm1(x)
+        x = x.reshape([-1, H, W, C])
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size  # 0 when W is already a multiple
+        pad_b = (self.window_size - H % self.window_size) % self.window_size  # 0 when H is already a multiple
+        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
+                  data_format='NHWC')  # NOTE(review): presumably (begin, end) pairs per axis, so only the end of H and W grows - confirm against paddle F.pad
+        _, Hp, Wp, _ = x.shape  # padded spatial size
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = paddle.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None  # the mask is ignored by blocks that do not shift
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.reshape(
+            [x_windows.shape[0], self.window_size * self.window_size,
+             C])  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.reshape(
+            [x_windows.shape[0], self.window_size, self.window_size, C])
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp,
+                                   Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = paddle.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                axis=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :]  # crop the padding added above
+
+        x = x.reshape([-1, H * W, C])
+
+        # FFN
+        x = shortcut + self.drop_path(x)  # residual around the attention branch
+        x = x + self.drop_path(self.mlp(self.norm2(x)))  # residual around the MLP branch
+
+        return x
+
+
+class PatchMerging(nn.Layer):
+    r""" Patch Merging Layer: concatenates each 2x2 group of tokens (4*dim channels), normalizes, and projects to 2*dim.
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+        self.norm = norm_layer(4 * dim)  # applied before the reduction, on the concatenated 4*dim features
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        x = x.reshape([-1, H, W, C])
+
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            # paddle F.pad default data_format is 'NCHW'
+            x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')
+            H += H % 2  # round an odd H up to the next even number
+            W += W % 2  # same for W
+
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = paddle.concat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C
+
+        x = self.norm(x)
+        x = self.reduction(x)  # B H/2*W/2 2*dim (Linear maps 4*dim -> 2*dim)
+
+        return x
+
+
+class BasicLayer(nn.Layer):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of input channels.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): Local window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | np.ndarray, optional): Stochastic depth rate; only an np.ndarray is indexed per block, any other value is passed to every block unchanged. Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2  # used in forward() to build the shifted-window mask
+        self.depth = depth
+
+        # build blocks
+        self.blocks = nn.LayerList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,  # even blocks do not shift, odd blocks shift by half a window
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i]
+                if isinstance(drop_path, np.ndarray) else drop_path,
+                norm_layer=norm_layer) for i in range(depth)
+        ])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)  # downsample is a class, instantiated here
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size  # H rounded up to a multiple of window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size  # W rounded up to a multiple of window_size
+        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt  # label each of the 3 x 3 regions with its own id (0..8)
+
+                cnt += 1
+
+        mask_windows = window_partition(
+            img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.reshape(
+            [-1, self.window_size * self.window_size])
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)  # nW, N, N: non-zero where two positions carry different region ids
+        huns = -100.0 * paddle.ones_like(attn_mask)
+        attn_mask = huns * (attn_mask != 0).astype("float32")  # -100 for pairs from different regions, 0 otherwise
+
+        for blk in self.blocks:
+            blk.H, blk.W = H, W  # blocks read the spatial size from these attributes rather than from arguments
+            x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2  # halved size, rounding odd sizes up
+            return x, H, W, x_down, Wh, Ww  # (stage output, its H, W, downsampled output, its H, W)
+        else:
+            return x, H, W, x, H, W  # no downsample: the same tensor and size fill both halves of the tuple
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size  # stored in the form returned by to_2tuple; indexed as [0] (H) and [1] (W) in forward
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv2D(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)  # kernel == stride: one output position per patch
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):  # x is unpacked as (B, C, H, W); the result keeps that 4-D channel-first layout
+        # TODO # export dynamic shape
+        B, C, H, W = x.shape
+        # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1])
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])  # NOTE(review): presumably pads the end of W up to a multiple of the patch width - confirm pad order
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])  # NOTE(review): presumably pads the end of H up to a multiple of the patch height - confirm pad order
+
+        x = self.proj(x)
+        if self.norm is not None:
+            _, _, Wh, Ww = x.shape
+            x = x.flatten(2).transpose([0, 2, 1])  # to (B, Wh*Ww, embed_dim) so the norm acts on the channel axis
+            x = self.norm(x)
+            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])  # back to (B, embed_dim, Wh, Ww)
+
+        return x
+
+
+@register
+@serializable
+class SwinTransformer(nn.Layer):
+    """ Swin Transformer backbone
+    Args:
+        arch (str): Key into MODEL_cfg selecting the Swin Transformer variant, e.g. 'swin_T_224'. Default: 'swin_T_224'
+        pretrain_img_size (int | tuple(int)): Input image size. Default 224
+        patch_size (int | tuple(int)): Patch size. Default: 4
+        in_chans (int): Number of input image channels. Default: 3
+        embed_dim (int): Patch embedding dimension. Default: 96
+        depths (tuple(int)): Depth of each Swin Transformer layer.
+        num_heads (tuple(int)): Number of attention heads in different layers.
+        window_size (int): Window size. Default: 7
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
+        drop_rate (float): Dropout rate. Default: 0
+        attn_drop_rate (float): Attention dropout rate. Default: 0
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True
+    """
+
+    def __init__(self,
+                 arch='swin_T_224',
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),  # indices of the stages whose normalized outputs forward() returns
+                 frozen_stages=-1,  # thresholds are interpreted in _freeze_stages; a negative value freezes nothing
+                 pretrained=None):  # URL or local path; None falls back to MODEL_cfg[arch]['pretrained']
+        super(SwinTransformer, self).__init__()
+        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
+
+        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']  # NOTE: the arch preset overwrites the next five constructor arguments
+        embed_dim = MODEL_cfg[arch]['embed_dim']
+        depths = MODEL_cfg[arch]['depths']
+        num_heads = MODEL_cfg[arch]['num_heads']
+        window_size = MODEL_cfg[arch]['window_size']
+        if pretrained is None:
+            pretrained = MODEL_cfg[arch]['pretrained']
+
+        self.num_layers = len(depths)
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1]
+            ]
+
+            self.absolute_pos_embed = add_parameter(
+                self,
+                paddle.zeros((1, embed_dim, patches_resolution[0],
+                              patches_resolution[1])))
+            trunc_normal_(self.absolute_pos_embed)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = np.linspace(0, drop_path_rate,
+                          sum(depths))  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.LayerList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),  # channel width doubles at every stage
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # this stage's slice of the per-block rates
+                norm_layer=norm_layer,
+                downsample=PatchMerging
+                if (i_layer < self.num_layers - 1) else None)  # every stage except the last ends with PatchMerging
+            self.layers.append(layer)
+
+        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'  # looked up by the same name via getattr in forward()
+            self.add_sublayer(layer_name, layer)
+
+        self.apply(self._init_weights)
+        self._freeze_stages()
+        if pretrained:
+            if 'http' in pretrained:  # treated as a URL whenever the string contains 'http'
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:  # otherwise used as a local file path
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _freeze_stages(self):  # stops gradients for parts of the model according to self.frozen_stages
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.stop_gradient = True
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):  # freezes self.layers[0] .. self.layers[frozen_stages - 2]
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.stop_gradient = True
+
+    def _init_weights(self, m):  # passed to self.apply() in __init__; handles Linear and LayerNorm sublayers only
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:  # the isinstance test repeats the enclosing condition
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        """Forward function. `x` is indexed with the 'image' key; returns a list with one (B, C, H, W) tensor per stage in out_indices."""
+        x = self.patch_embed(x['image'])
+        B, _, Wh, Ww = x.shape
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
+        else:
+            x = x.flatten(2).transpose([0, 2, 1])  # (B, C, Wh, Ww) -> (B, Wh*Ww, C) token sequence
+        x = self.pos_drop(x)
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)  # x_out/H/W: this stage's output; x/Wh/Ww: input for the next stage
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
+                    (0, 3, 1, 2))  # tokens back to a channel-first feature map
+                outs.append(out)
+
+        return outs
+
+    @property
+    def out_shape(self):  # one ShapeSpec (channels and stride) per entry of out_indices
+        out_strides = [4, 8, 16, 32]  # NOTE(review): hard-coded per-stage strides; presumably assume patch_size=4 and four stages - confirm
+        return [
+            ShapeSpec(
+                channels=self.num_features[i], stride=out_strides[i])
+            for i in self.out_indices
+        ]
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py b/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
new file mode 100644
index 0000000..1a45e0f
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import ReLU, Swish, GELU
+import math
+
+from ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['TransEncoder']
+
+
+class BertEmbeddings(nn.Layer):
+    """Sum of word, position and token-type embeddings, then LayerNorm and dropout."""
+
+    def __init__(self, word_size, position_embeddings_size, word_type_size,
+                 hidden_size, dropout_prob):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(
+            word_size, hidden_size, padding_idx=0)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x, token_type_ids=None, position_ids=None):
+        """Embed integer ids `x`; missing position ids default to 0..seq_len-1 per row,
+        missing token-type ids default to all zeros."""
+        if position_ids is None:
+            seq_len = paddle.shape(x)[1]
+            position_ids = paddle.arange(
+                seq_len, dtype='int64').unsqueeze(0).expand_as(x)
+        if token_type_ids is None:
+            # nn.Embedding needs integer indices; paddle.zeros defaults to float32.
+            token_type_ids = paddle.zeros(paddle.shape(x), dtype='int64')
+        embs = (self.word_embeddings(x) + self.position_embeddings(position_ids)
+                + self.token_type_embeddings(token_type_ids))
+        return self.dropout(self.layernorm(embs))
+
+
+class BertSelfAttention(nn.Layer):
+    """Multi-head scaled dot-product self-attention.
+
+    forward returns `(context,)`, or `(context, attention_probs)` when
+    `output_attentions` is True.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 output_attentions=False):
+        super(BertSelfAttention, self).__init__()
+        if hidden_size % num_attention_heads != 0:
+            # The message uses `{}` placeholders, so it must be filled with
+            # str.format; the `%` operator cannot fill them.
+            raise ValueError(
+                "The hidden_size must be a multiple of the number of attention "
+                "heads, but got {} % {} != 0".format(hidden_size,
+                                                     num_attention_heads))
+
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = int(hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+        self.output_attentions = output_attentions
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Attend over `x`.
+
+        `attention_mask` is added to the scaled scores before the softmax;
+        `head_mask`, when given, multiplies the attention probabilities.
+        """
+        query, key, value = self.query(x), self.key(x), self.value(x)
+        batch, seq_len = paddle.shape(query)[:-1]
+        split = [batch, seq_len, self.num_attention_heads, self.attention_head_size]
+        # q, v: [batch, heads, seq, head_size]; k is laid out pre-transposed as
+        # [batch, heads, head_size, seq] so q @ k gives the score matrix directly.
+        query = query.reshape(split).transpose(perm=(0, 2, 1, 3))
+        key = key.reshape(split).transpose(perm=(0, 2, 3, 1))
+        value = value.reshape(split).transpose(perm=(0, 2, 1, 3))
+
+        scores = paddle.matmul(query, key) / math.sqrt(self.attention_head_size)
+        probs = F.softmax(scores + attention_mask, axis=-1)
+        probs = self.dropout(probs)
+        if head_mask is not None:
+            probs = probs * head_mask
+
+        # merge the heads back: [batch, heads, seq, head_size] -> [batch, seq, all_head]
+        context = paddle.matmul(probs, value).transpose(perm=(0, 2, 1, 3))
+        ctx_batch, ctx_len = paddle.shape(context)[:-2]
+        context = context.reshape([ctx_batch, ctx_len, self.all_head_size])
+
+        if self.output_attentions:
+            return (context, probs)
+        return (context, )
+
+
+class BertAttention(nn.Layer):
+    """Self-attention followed by a linear projection, dropout and a residual LayerNorm."""
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 output_attentions=False):
+        super(BertAttention, self).__init__()
+        self.bert_selfattention = BertSelfAttention(
+            hidden_size, num_attention_heads, attention_probs_dropout_prob,
+            output_attentions)
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Return `(features,)` plus the attention output when the sub-layer emits one."""
+        context, *extras = self.bert_selfattention(x, attention_mask, head_mask)
+        projected = self.dropout(self.fc(context))
+        features = self.layernorm(projected + x)
+        # forward the attention probabilities only if the sub-layer produced them
+        return (features, extras[0]) if len(extras) == 1 else (features, )
+
+
+class BertFeedForward(nn.Layer):
+    """Position-wise feed-forward block with dropout and a residual LayerNorm.
+
+    `act_fn` names an activation function (e.g. `gelu`) or layer class (e.g. `ReLU`).
+    """
+
+    def __init__(self, hidden_size, intermediate_size, num_attention_heads,
+                 attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU',
+                 output_attentions=False):
+        super(BertFeedForward, self).__init__()
+        self.fc1 = nn.Linear(hidden_size, intermediate_size)
+        # NOTE(review): eval() executes whatever string the config supplies.
+        activation = eval(act_fn)
+        # A layer class (ReLU/Swish/GELU) has to be instantiated: calling the class
+        # on a tensor would build a layer instead of applying the activation.
+        self.act_fn = activation() if isinstance(activation, type) else activation
+        self.fc2 = nn.Linear(intermediate_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x):
+        features = self.fc2(self.act_fn(self.fc1(x)))
+        return self.layernorm(self.dropout(features) + x)
+
+
+class BertLayer(nn.Layer):
+    """One transformer encoder layer: BertAttention followed by BertFeedForward."""
+
+    def __init__(self, hidden_size, intermediate_size, num_attention_heads,
+                 attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU',
+                 output_attentions=False):
+        super(BertLayer, self).__init__()
+        # BertAttention takes fc_dropout_prob before output_attentions, so both
+        # must be passed: leaving the dropout rate out would put the boolean flag
+        # in the dropout slot and the attention maps would never be requested.
+        self.attention = BertAttention(
+            hidden_size, num_attention_heads, attention_probs_dropout_prob,
+            fc_dropout_prob, output_attentions)
+        self.feed_forward = BertFeedForward(
+            hidden_size, intermediate_size, num_attention_heads,
+            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+            output_attentions)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Return `(features,)`, or `(features, attention)` if attention is emitted."""
+        attention_feats = self.attention(x, attention_mask, head_mask)
+        features = self.feed_forward(attention_feats[0])
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        return (features, )
+
+
+class BertEncoder(nn.Layer):
+    """Stack of `num_hidden_layers` BertLayer blocks applied in sequence.
+
+    forward returns `(last_features,)` plus, when enabled, the tuple of per-layer
+    features (`output_hidden_feats`) and of attention maps (`output_attentions`).
+    """
+
+    def __init__(self, num_hidden_layers, hidden_size, intermediate_size,
+                 num_attention_heads, attention_probs_dropout_prob,
+                 fc_dropout_prob, act_fn='ReLU', output_attentions=False,
+                 output_hidden_feats=False):
+        super(BertEncoder, self).__init__()
+        self.output_attentions = output_attentions
+        self.output_hidden_feats = output_hidden_feats
+        self.layers = nn.LayerList([
+            BertLayer(hidden_size, intermediate_size, num_attention_heads,
+                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+                      output_attentions) for _ in range(num_hidden_layers)
+        ])
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Run every layer; `head_mask`, if given, is indexed once per layer."""
+        all_features = (x, )  # starts with the encoder input
+        all_attentions = ()
+        for i, layer in enumerate(self.layers):
+            mask = head_mask[i] if head_mask is not None else None
+            layer_out = layer(x, attention_mask, mask)
+            x = layer_out[0]
+            if self.output_hidden_feats:
+                # record the layer OUTPUT, so the tuple is (input, out_1, ..., out_n)
+                all_features = all_features + (x, )
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_out[1], )
+
+        outputs = (x, )
+        if self.output_hidden_feats:
+            outputs += (all_features, )
+        if self.output_attentions:
+            outputs += (all_attentions, )
+        return outputs
+
+
+class BertPooler(nn.Layer):
+    """Pool a sequence by passing its first token through Linear + Tanh."""
+
+    def __init__(self, hidden_size):
+        super(BertPooler, self).__init__()
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.act = nn.Tanh()
+
+    def forward(self, x):
+        # index 0 along axis 1 selects the first token of every sequence
+        return self.act(self.fc(x[:, 0]))
+
+
+class METROEncoder(nn.Layer):
+    """BERT-style encoder over a sequence of feature vectors.
+
+    Projects the input to `hidden_size`, adds a learned position embedding, runs a
+    BertEncoder and maps to `output_feature_dim`, plus a linear residual of the input.
+    """
+
+    def __init__(self, vocab_size, num_hidden_layers, features_dims,
+                 position_embeddings_size, hidden_size, intermediate_size,
+                 output_feature_dim, num_attention_heads,
+                 attention_probs_dropout_prob, fc_dropout_prob, act_fn='ReLU',
+                 output_attentions=False, output_hidden_feats=False,
+                 use_img_layernorm=False):
+        super(METROEncoder, self).__init__()
+        self.img_dims = features_dims
+        self.num_hidden_layers = num_hidden_layers
+        self.use_img_layernorm = use_img_layernorm
+        self.output_attentions = output_attentions
+        # forward() reads this flag, so it has to be stored on the instance
+        self.output_hidden_feats = output_hidden_feats
+        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
+                                        hidden_size, fc_dropout_prob)
+        self.encoder = BertEncoder(
+            num_hidden_layers, hidden_size, intermediate_size,
+            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
+            act_fn, output_attentions, output_hidden_feats)
+        self.pooler = BertPooler(hidden_size)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.img_embedding = nn.Linear(
+            features_dims, hidden_size, bias_attr=True)
+        if use_img_layernorm:
+            # forward() uses self.layernorm when the flag is set; epsilon as elsewhere here
+            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
+        self.residual = nn.Linear(features_dims, output_feature_dim)
+
+        self.apply(self.init_weights)
+
+    def init_weights(self, module):
+        """Initialize the weights: normal(mean 0, std 0.02) for Linear/Embedding
+        weights, zeros for Linear biases, and bias 0 / weight 1 for LayerNorm."""
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.set_value(
+                paddle.normal(mean=0.0, std=0.02, shape=module.weight.shape))
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+            module.weight.set_value(
+                paddle.full(shape=module.weight.shape, fill_value=1.0))
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+
+    def forward(self, x):
+        """Encode `x` (indexed as [batch, seq_len, features_dims]).
+
+        Returns the prediction alone, or `(prediction, hidden_feats, attentions)`
+        when both `output_attentions` and `output_hidden_feats` are set.
+        """
+        batchsize, seq_len = paddle.shape(x)[:2]
+        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
+        position_ids = paddle.arange(
+            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
+        position_embs = self.position_embeddings(position_ids)
+
+        # Every position is attended to: the all-ones mask becomes an additive
+        # bias of 0 (masked positions would get -10000).
+        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
+        attention_mask = (1.0 - attention_mask) * -10000.0
+        head_mask = [None] * self.num_hidden_layers
+
+        # We empirically observe that adding an additional learnable position embedding leads to more stable training
+        embeddings = position_embs + self.img_embedding(x)
+        if self.use_img_layernorm:
+            embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        encoder_outputs = self.encoder(
+            embeddings, attention_mask, head_mask=head_mask)
+
+        pred_score = self.cls_head(encoder_outputs[0]) + self.residual(x)
+        if self.output_attentions and self.output_hidden_feats:
+            return pred_score, encoder_outputs[1], encoder_outputs[-1]
+        return pred_score
+
+
+def gelu(x):
+    """Exact (erf-based) GELU activation, see https://arxiv.org/abs/1606.08415."""
+    # erf(x / sqrt(2)) maps to the standard normal CDF via 0.5 * (1 + erf)
+    erf_term = paddle.erf(x / math.sqrt(2.0))
+    return x * 0.5 * (1.0 + erf_term)
+
+
+@register
+class TransEncoder(nn.Layer):
+    """Chain of METROEncoder stages: stage i maps `input_feat_dim[i]` features to
+    `input_feat_dim[i + 1]` (3 for the last stage) with hidden size `hidden_feat_dim[i]`.
+    """
+
+    def __init__(self,
+                 vocab_size=30522,
+                 num_hidden_layers=4,
+                 num_attention_heads=4,
+                 position_embeddings_size=512,
+                 intermediate_size=3072,
+                 input_feat_dim=[2048, 512, 128],
+                 hidden_feat_dim=[1024, 256, 128],
+                 attention_probs_dropout_prob=0.1,
+                 fc_dropout_prob=0.1,
+                 act_fn='gelu',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(TransEncoder, self).__init__()
+        output_feat_dim = input_feat_dim[1:] + [3]
+        stages = []
+        for i, (in_dim, out_dim) in enumerate(
+                zip(input_feat_dim, output_feat_dim)):
+            hidden_size = hidden_feat_dim[i]
+            # the hidden size has to split evenly across the attention heads
+            assert hidden_size % num_attention_heads == 0
+            stages.append(
+                METROEncoder(vocab_size, num_hidden_layers, in_dim,
+                             position_embeddings_size, hidden_size,
+                             intermediate_size, out_dim, num_attention_heads,
+                             attention_probs_dropout_prob, fc_dropout_prob,
+                             act_fn, output_attentions, output_hidden_feats))
+        self.trans_encoder = paddle.nn.Sequential(*stages)
+
+    def forward(self, x):
+        return self.trans_encoder(x)
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py b/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
new file mode 100644
index 0000000..a0783e1
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn.initializer import TruncatedNormal, Constant, Assign
+
+# Common initializations
+ones_ = Constant(value=1.)
+zeros_ = Constant(value=0.)
+trunc_normal_ = TruncatedNormal(std=.02)
+
+
+# Common Layers
+def drop_path(x, drop_prob=0., training=False):
+    """
+        Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+        The input is returned unchanged when not training or when drop_prob is 0 or None.
+        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+    """
+    # None is DropPath's default drop_prob and must not reach `1 - drop_prob` below.
+    if drop_prob is None or drop_prob == 0. or not training:
+        return x
+    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
+    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0
+    random_tensor = paddle.floor(keep_prob + paddle.rand(shape, dtype=x.dtype))
+    return x.divide(keep_prob) * random_tensor
+
+
+class DropPath(nn.Layer):  # layer wrapper around drop_path()
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob  # forwarded to drop_path() on every call
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)  # drop_path() is a no-op unless training
+
+
+class Identity(nn.Layer):  # no-op layer: forward returns its input unchanged
+    def __init__(self):
+        super(Identity, self).__init__()
+
+    def forward(self, input):
+        return input
+
+
+# common funcs
+
+
+def to_2tuple(x):
+    """Return lists/tuples as-is; duplicate any other value into a pair `(x, x)`."""
+    is_sequence = isinstance(x, (list, tuple))
+    return x if is_sequence else (x, x)
+
+
+def add_parameter(layer, datas, name=None):
+    """Create a parameter on `layer` from `datas`, registering it when `name` is truthy."""
+    param = layer.create_parameter(shape=datas.shape, default_initializer=Assign(datas))
+    if name:
+        layer.add_parameter(name, param)
+    return param
+
+
+def window_partition(x, window_size):
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+        (num_h, num_w): number of windows along the padded height and width
+    """
+    B, H, W, C = paddle.shape(x)
+    pad_h = (window_size - H % window_size) % window_size  # 0 when H already divides evenly
+    pad_w = (window_size - W % window_size) % window_size
+    x = F.pad(x.transpose([0, 3, 1, 2]),
+              paddle.to_tensor(
+                  [0, int(pad_w), 0, int(pad_h)],
+                  dtype='int32')).transpose([0, 2, 3, 1])
+    Hp, Wp = H + pad_h, W + pad_w
+    # number of windows along each padded axis
+    num_h, num_w = Hp // window_size, Wp // window_size
+    # split both spatial axes into (count, window_size), then fold the counts into the batch axis
+    x = x.reshape([B, num_h, window_size, num_w, window_size, C])
+    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [-1, window_size, window_size, C])
+    return windows, (Hp, Wp), (num_h, num_w)
+
+
+def window_unpartition(x, pad_hw, num_hw, hw):
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        num_hw (Tuple): number of windows along height and width (num_h, num_w).
+        hw (Tuple): original height and width (H, W) before padding.
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    Hp, Wp = pad_hw
+    num_h, num_w = num_hw
+    H, W = hw
+    B, window_size, _, C = paddle.shape(x)
+    B = B // (num_h * num_w)  # leading axis holds B * num_h * num_w windows
+    x = x.reshape([B, num_h, num_w, window_size, window_size, C])
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])
+    return x[:, :H, :W, :]  # crop the padding away
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py b/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
new file mode 100644
index 0000000..a21eefc
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
@@ -0,0 +1,652 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+from paddle.nn.initializer import Constant
+
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+
+from .transformer_utils import zeros_, DropPath, Identity
+
+
+class Mlp(nn.Layer):
+    """Two-layer perceptron: fc1 -> activation -> dropout -> fc2 -> dropout.
+
+    `hidden_features` and `out_features` fall back to `in_features` when falsy.
+    """
+
+    def __init__(self, in_features, hidden_features=None, out_features=None,
+                 act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        # the same Dropout layer is applied after the activation and after fc2
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class Attention(nn.Layer):
+    """Multi-head self-attention with optional q/v biases and relative position bias."""
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 window_size=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)
+        # the fused projection has no bias of its own; q/v biases are separate parameters
+        if qkv_bias:
+            self.q_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+            self.v_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        if window_size:
+            self.window_size = window_size
+            self.num_relative_distance = (2 * window_size[0] - 1) * (
+                2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = self.create_parameter(
+                shape=(self.num_relative_distance, num_heads),
+                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1 + 3, nH
+            # the 3 extra entries are: cls to token & token 2 cls & cls to cls
+
+            # get pair-wise relative position index for each token inside the window
+            coords_h = paddle.arange(window_size[0])
+            coords_w = paddle.arange(window_size[1])
+            coords = paddle.stack(paddle.meshgrid(
+                [coords_h, coords_w]))  # 2, Wh, Ww
+            coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+            coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
+            coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
+            relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
+            )
+
+            # broadcast difference of every token pair: 2, Wh*Ww, Wh*Ww
+            relative_coords = relative_coords.transpose(
+                (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+            relative_coords[:, :, 0] += window_size[
+                0] - 1  # shift to start from 0
+            relative_coords[:, :, 1] += window_size[1] - 1
+            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+            relative_position_index = \
+                paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
+            relative_position_index[1:, 1:] = relative_coords.sum(
+                -1)  # Wh*Ww, Wh*Ww
+            relative_position_index[0, 0:] = self.num_relative_distance - 3  # row 0
+            relative_position_index[0:, 0] = self.num_relative_distance - 2  # column 0
+            relative_position_index[0, 0] = self.num_relative_distance - 1  # entry (0, 0)
+
+            self.register_buffer("relative_position_index",
+                                 relative_position_index)
+            # the bias table keeps the zeros_ initializer given above
+        else:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        """Attend over x, indexed as [B, N, C]; `rel_pos_bias` is added to the scores if given."""
+        x_shape = paddle.shape(x)
+        N, C = x_shape[1], x_shape[2]
+        qkv_bias = None
+        if self.q_bias is not None:
+            qkv_bias = paddle.concat(
+                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))  # q bias, zero k bias, v bias
+        qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
+        # split into q, k, v and heads: 3, B, num_heads, N, C // num_heads
+        qkv = qkv.reshape((-1, N, 3, self.num_heads,
+                           C // self.num_heads)).transpose((2, 0, 3, 1, 4))
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
+
+        if self.relative_position_bias_table is not None:
+            relative_position_bias = self.relative_position_bias_table[
+                self.relative_position_index.reshape([-1])].reshape([
+                    self.window_size[0] * self.window_size[1] + 1,
+                    self.window_size[0] * self.window_size[1] + 1, -1
+                ])  # Wh*Ww+1, Wh*Ww+1, nH
+            relative_position_bias = relative_position_bias.transpose(
+                (2, 0, 1))  # nH, Wh*Ww+1, Wh*Ww+1
+            attn = attn + relative_position_bias.unsqueeze(0)
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = nn.functional.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Block(nn.Layer):
+    """Pre-norm transformer block: attention and MLP, each on a residual branch.
+
+    When `init_values` is given, each branch is scaled by a learnable per-channel
+    vector (`gamma_1`, `gamma_2`) initialized to that value.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 window_size=None,
+                 init_values=None,
+                 act_layer=nn.GELU,
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-5):
+        super().__init__()
+        # NOTE: norm1 is always nn.LayerNorm with epsilon=1e-6; only norm2 honours
+        # `norm_layer` and `epsilon`.
+        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, window_size=window_size)
+        # stochastic depth on both residual branches (Identity when the rate is not > 0)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop)
+        if init_values is None:
+            self.gamma_1, self.gamma_2 = None, None
+        else:
+            self.gamma_1 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+
+    def forward(self, x, rel_pos_bias=None):
+        """Apply both residual branches to `x`; `rel_pos_bias` goes to the attention."""
+        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+        if self.gamma_1 is not None:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+        mlp_out = self.mlp(self.norm2(x))
+        if self.gamma_1 is not None:
+            mlp_out = self.gamma_2 * mlp_out
+        return x + self.drop_path(mlp_out)
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding: a Conv2D with kernel size = stride = patch_size.
+    """
+
+    def __init__(self,
+                 img_size=[224, 224],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        # whole patches that fit along img_size[0] / img_size[1]
+        grid_0 = img_size[0] // patch_size
+        grid_1 = img_size[1] // patch_size
+        self.num_patches_w = grid_0
+        self.num_patches_h = grid_1
+        self.patch_shape = (grid_0, grid_1)
+        self.num_patches = grid_0 * grid_1
+
+        self.proj = nn.Conv2D(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    @property
+    def num_patches_in_h(self):
+        return self.img_size[1] // self.patch_size
+
+    @property
+    def num_patches_in_w(self):
+        return self.img_size[0] // self.patch_size
+
+    def forward(self, x, mask=None):
+        B, C, H, W = x.shape  # x must be 4-D; `mask` is not used
+        return self.proj(x)
+
+
+class RelativePositionBias(nn.Layer):
+    """Learnable relative position bias table for a token window plus one cls token."""
+
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        # +3 extra entries: cls to token & token 2 cls & cls to cls
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = self.create_parameter(
+            shape=(self.num_relative_distance, num_heads),
+            default_initializer=zeros_)
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(window_size[0])
+        coords_w = paddle.arange(window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = coords.flatten(1)  # 2, Wh*Ww
+        relative_coords = (coords_flatten[:, :, None] -
+                           coords_flatten[:, None, :])  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.transpose(
+            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        num_tokens = window_size[0] * window_size[1] + 1
+        relative_position_index = paddle.zeros(
+            shape=(num_tokens, ) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def forward(self):
+        num_tokens = self.window_size[0] * self.window_size[1] + 1
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.reshape([-1])].reshape(
+                [num_tokens, num_tokens, -1])  # Wh*Ww+1, Wh*Ww+1, nH
+        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww+1, Wh*Ww+1
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, token=False):
+    """Sinusoid position table as float32, shape (1, n_position [+1 if token], d_hid)."""
+
+    def get_position_angle_vec(position):
+        return [
+            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+            for hid_j in range(d_hid)
+        ]
+
+    sinusoid_table = np.array(
+        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    if token:
+        # Append one all-zero row; NumPy's keyword is `axis` (`dim` raised TypeError).
+        sinusoid_table = np.concatenate([sinusoid_table, np.zeros([1, d_hid])], axis=0)
+
+    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
+
+
+@register
+@serializable
+class VisionTransformer(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=[672, 1092],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 init_values=None,
+                 use_rel_pos_bias=False,
+                 use_shared_rel_pos_bias=False,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 out_indices=[3, 5, 7, 11],
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=True,
+                 with_fpn=True,
+                 num_fpn_levels=4,
+                 use_checkpoint=False,
+                 **args):
+        """Build the patch embedding, position embedding, blocks and optional FPN heads."""
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        self.use_sincos_pos_emb = use_sincos_pos_emb
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
+        # Seed the global RNG when recompute (use_checkpoint) is requested.
+        if use_checkpoint:
+            paddle.seed(0)
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+        # Patch-grid extent as reported by the patch embedding.
+        self.pos_w = self.patch_embed.num_patches_in_w
+        self.pos_h = self.patch_embed.num_patches_in_h
+        # Class token: zero-initialised, shape (1, 1, embed_dim).
+        self.cls_token = self.create_parameter(
+            shape=(1, 1, embed_dim),
+            default_initializer=paddle.nn.initializer.Constant(value=0.))
+        # Position embedding: learned (trunc-normal init), frozen 2D sin-cos, or none.
+        if use_abs_pos_emb:
+            self.pos_embed = self.create_parameter(
+                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+                default_initializer=paddle.nn.initializer.TruncatedNormal(
+                    std=.02))
+        elif use_sincos_pos_emb:
+            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
+            self.pos_embed = pos_embed
+            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
+            self.pos_embed.set_value(pos_embed.numpy())
+            self.pos_embed.stop_gradient = True
+
+        else:
+            self.pos_embed = None
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+        # Per-block drop_path values, linearly spaced from 0 to drop_path_rate.
+        dpr = np.linspace(0, drop_path_rate, depth)
+
+        self.blocks = nn.LayerList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                init_values=init_values,
+                window_size=self.patch_embed.patch_shape
+                if use_rel_pos_bias else None,
+                epsilon=epsilon) for i in range(depth)
+        ])
+        # NOTE(review): weights are loaded here, before init_fpn() below creates the fpn layers.
+        self.pretrained = pretrained
+        self.init_weight()
+
+        assert len(out_indices) <= 4, ''
+        self.out_indices = out_indices
+        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
+        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
+        ]
+
+        self.norm = Identity()
+        # NOTE(review): final_norm is stored, but init_fpn is called without out_with_norm.
+        if self.with_fpn:
+            assert num_fpn_levels <= 4, ''
+            self.init_fpn(
+                embed_dim=embed_dim,
+                patch_size=patch_size, )
+
+    def init_weight(self):
+        """Load ``self.pretrained`` (URL or local path) into this layer, if set.
+
+        A checkpoint ``pos_embed`` whose shape differs from the model's is
+        resized first, assuming a square source grid plus one leading slot.
+        """
+        pretrained = self.pretrained
+        if not pretrained:
+            return
+
+        if 'http' in pretrained:  # URL
+            path = paddle.utils.download.get_weights_path_from_url(pretrained)
+        else:  # model in local path
+            path = pretrained
+        load_state_dict = paddle.load(path)
+        pos_embed_name = "pos_embed"
+
+        # self.pos_embed is None when both position-embedding options are off;
+        # reading `.shape` on it used to raise AttributeError.
+        if pos_embed_name in load_state_dict and self.pos_embed is not None:
+            load_pos_embed = paddle.to_tensor(
+                load_state_dict[pos_embed_name], dtype="float32")
+            if self.pos_embed.shape != load_pos_embed.shape:
+                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
+                load_state_dict[pos_embed_name] = self.resize_pos_embed(
+                    load_pos_embed, (pos_size, pos_size),
+                    (self.pos_h, self.pos_w))
+                print("Load pos_embed and resize it from {} to {} .".format(
+                    load_pos_embed.shape, self.pos_embed.shape))
+
+        self.set_state_dict(load_state_dict)
+        print("Load load_state_dict....")
+
+    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
+        """Create the fpn1..fpn4 rescaling heads for patch_size 16 or 8 and set self.norm."""
+        if patch_size == 16:
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+                nn.BatchNorm2D(embed_dim),
+                nn.GELU(),
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn3 = Identity()
+            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
+        elif patch_size == 8:
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = Identity()
+
+            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
+
+            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
+        # NOTE(review): any other patch_size leaves fpn1..fpn4 unset.
+        if not out_with_norm:
+            self.norm = Identity()
+        else:
+            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
+
+    def interpolate_pos_encoding(self, x, w, h):
+        """Return self.pos_embed, bicubically resized when the input grid differs.
+
+        Args:
+            x (Tensor): token sequence; dim 1 is read as 1 + number of patches
+                and the last dim as the embedding width.
+            w, h (int): input extents; each is floor-divided by the patch
+                size to obtain the target grid.
+        Returns:
+            Tensor: the stored embedding itself when the grid is unchanged,
+            otherwise its first slot followed by the resized patch slots.
+
+        NOTE(review): forward() calls this as (x, h, w), so ``w`` receives the
+        image height there -- confirm the intended axis order.
+        """
+        num_input = x.shape[1] - 1
+        num_stored = self.pos_embed.shape[1] - 1
+        w0 = w // self.patch_embed.patch_size
+        h0 = h // self.patch_embed.patch_size
+        grid_w = self.patch_embed.num_patches_w
+        grid_h = self.patch_embed.num_patches_h
+        if num_input == num_stored and w0 == grid_w and h0 == grid_h:
+            return self.pos_embed
+
+        cls_part = self.pos_embed[:, 0]
+        patch_part = self.pos_embed[:, 1:]
+        dim = x.shape[-1]
+        # Lay the patch slots out as (1, dim, grid_w, grid_h) for interpolation.
+        patch_grid = patch_part.reshape([1, grid_w, grid_h, dim]).transpose(
+            (0, 3, 1, 2))
+        resized = nn.functional.interpolate(
+            patch_grid, (w0, h0), mode='bicubic')
+
+        assert int(w0) == resized.shape[-2] and int(h0) == resized.shape[-1]
+        tokens = resized.transpose((0, 2, 3, 1)).reshape([1, -1, dim])
+        return paddle.concat((cls_part.unsqueeze(0), tokens), axis=1)
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Bicubically resample the patch part of a position embedding.
+        Args:
+            pos_embed (Tensor): embedding whose dim 1 holds one leading slot
+                followed by old_hw[0] * old_hw[1] patch slots
+            old_hw (list[int]): the height and width of the given pos_embed
+            new_hw (list[int]): the height and width to resample to
+        Returns:
+            Tensor: the leading slot concatenated with the resampled patch slots
+        """
+        cls_slot, patch_slots = pos_embed[:, :1, :], pos_embed[:, 1:, :]
+
+        # (1, L, C) -> (1, C, old_h, old_w) so interpolate sees a 2D map.
+        grid = patch_slots.transpose([0, 2, 1]).reshape(
+            [1, -1, old_hw[0], old_hw[1]])
+        grid = F.interpolate(
+            grid, new_hw, mode='bicubic', align_corners=False)
+        # Back to (1, new_h * new_w, C) before re-attaching the leading slot.
+        patch_slots = grid.flatten(2).transpose([0, 2, 1])
+        return paddle.concat([cls_slot, patch_slots], axis=1)
+
+    def build_2d_sincos_position_embedding(
+            self,
+            embed_dim=768,
+            temperature=10000., ):
+        """Build a (1, 1 + h*w, embed_dim) sin-cos table with a leading all-zero slot.
+
+        h and w come from self.patch_embed.patch_shape; each axis contributes
+        sin and cos features over embed_dim // 4 frequencies.
+        """
+        h, w = self.patch_embed.patch_shape
+        xs = paddle.arange(w, dtype=paddle.float32)
+        ys = paddle.arange(h, dtype=paddle.float32)
+        xs, ys = paddle.meshgrid(xs, ys)
+        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        num_freq = embed_dim // 4
+        # Frequencies 1 / temperature**(k / num_freq) for k = 0 .. num_freq - 1.
+        exponent = paddle.arange(num_freq, dtype=paddle.float32) / num_freq
+        inv_freq = 1. / (temperature**exponent)
+        # Outer product of the flattened coordinates with the frequency vector.
+        angle_w = xs.flatten()[..., None] @ inv_freq[None]
+        angle_h = ys.flatten()[..., None] @ inv_freq[None]
+        features = [
+            paddle.sin(angle_w), paddle.cos(angle_w), paddle.sin(angle_h),
+            paddle.cos(angle_h)
+        ]
+        patch_table = paddle.concat(features, axis=1)[None, :, :]
+        cls_slot = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
+        return paddle.concat([cls_slot, patch_table], axis=1)
+
+    def forward(self, x):
+        """Encode an image batch and return the selected feature maps.
+
+        Args:
+            x (Tensor | dict): image tensor, or a dict holding it under 'image'.
+        Returns:
+            list[Tensor]: maps taken after the blocks listed in self.out_indices;
+            with self.with_fpn set, each is first passed through an fpn head.
+        """
+        if isinstance(x, dict):
+            x = x['image']
+        _, _, h, w = x.shape
+        x = self.patch_embed(x)
+        B, D, Hp, Wp = x.shape  # b * c * h * w
+        # Prepend the class token to the flattened patch sequence.
+        cls_shape = (B, self.cls_token.shape[-2], self.cls_token.shape[-1])
+        tokens = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
+        x = paddle.concat([self.cls_token.expand(cls_shape), tokens], axis=1)
+
+        if self.pos_embed is not None:
+            x = x + self.interpolate_pos_encoding(x, h, w)
+        x = self.pos_drop(x)
+
+        rel_pos_bias = None
+        if self.rel_pos_bias is not None:
+            rel_pos_bias = self.rel_pos_bias()
+
+        feats = []
+        for idx, blk in enumerate(self.blocks):
+            if self.use_checkpoint and self.training:
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, rel_pos_bias, preserve_rng_state=True)
+            else:
+                x = blk(x, rel_pos_bias)
+            if idx not in self.out_indices:
+                continue
+            # Drop the class token and fold the sequence back to (B, D, Hp, Wp).
+            patch_tokens = self.norm(x[:, 1:, :])
+            feats.append(
+                paddle.reshape(
+                    paddle.transpose(patch_tokens, perm=[0, 2, 1]),
+                    shape=[B, D, Hp, Wp]))
+
+        if not self.with_fpn:
+            return feats
+        fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][-self.num_fpn_levels:]
+        assert len(fpns) == len(feats) or len(feats) == 1, ''
+        one_to_one = len(feats) == len(fpns)
+        return [m(feats[i] if one_to_one else feats[-1]) for i, m in enumerate(fpns)]
+
+    @property
+    def num_layers(self):
+        return len(self.blocks)  # number of entries in the self.blocks LayerList
+
+    @property
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}  # parameter names; presumably exempted from weight decay by the caller -- confirm
+
+    @property
+    def out_shape(self):
+        """One ShapeSpec per output level, pairing out_channels with out_strides."""
+        specs = []
+        for channels, stride in zip(self.out_channels, self.out_strides):
+            specs.append(ShapeSpec(channels=channels, stride=stride))
+        return specs
diff --git a/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py b/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
new file mode 100644
index 0000000..8d00da7
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
@@ -0,0 +1,749 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import math
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant, TruncatedNormal
+
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+
+from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
+                                window_unpartition)
+from ..initializer import linear_init_
+
+__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
+
+
+class Mlp(nn.Layer):
+    """Feed-forward block: fc1 -> activation -> dropout -> fc2 -> dropout."""
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer='nn.GELU',
+                 drop=0.,
+                 lr_factor=1.0):
+        super().__init__()
+        hidden_dim = hidden_features or in_features
+        out_dim = out_features or in_features
+
+        def scaled_attr():
+            # A fresh ParamAttr per parameter, each with learning_rate=lr_factor.
+            return ParamAttr(learning_rate=lr_factor)
+
+        self.fc1 = nn.Linear(
+            in_features, hidden_dim, weight_attr=scaled_attr(), bias_attr=scaled_attr())
+        # NOTE(review): act_layer is a config string resolved via eval(); keep it trusted.
+        self.act = eval(act_layer)()
+        self.fc2 = nn.Linear(
+            hidden_dim, out_dim, weight_attr=scaled_attr(), bias_attr=scaled_attr())
+        self.drop = nn.Dropout(drop)
+        self._init_weights()
+
+    def _init_weights(self):
+        linear_init_(self.fc1)
+        linear_init_(self.fc2)
+
+    def forward(self, x):
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 use_rel_pos=False,
+                 rel_pos_zero_init=True,
+                 window_size=None,
+                 input_size=None,
+                 qk_scale=None,
+                 lr_factor=1.0):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = qk_scale or self.head_dim**-0.5
+        self.use_rel_pos = use_rel_pos
+        self.input_size = input_size
+        self.rel_pos_zero_init = rel_pos_zero_init
+        self.window_size = window_size
+        self.lr_factor = lr_factor
+        # Fused q/k/v projection; it carries its own bias only when attn_bias is set.
+        self.qkv = nn.Linear(
+            dim,
+            dim * 3,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor)
+            if attn_bias else False)
+        if qkv_bias:  # separate learnable q and v bias vectors (none for k)
+            self.q_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+            self.v_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        self.proj = nn.Linear(
+            dim,
+            dim,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+        self.attn_drop = nn.Dropout(attn_drop)
+        if window_size is None:
+            self.window_size = self.input_size[0]
+        # NOTE(review): proj_drop is accepted but no dropout layer is created for it here.
+        self._init_weights()
+
+    def _init_weights(self):
+        """Init qkv/proj; create rel_pos_h / rel_pos_w when use_rel_pos is set."""
+        linear_init_(self.qkv)
+        linear_init_(self.proj)
+        if not self.use_rel_pos:
+            return
+        table_shape = [2 * self.window_size - 1, self.head_dim]
+        for name in ('rel_pos_h', 'rel_pos_w'):
+            table = self.create_parameter(
+                table_shape,
+                attr=ParamAttr(learning_rate=self.lr_factor),
+                default_initializer=Constant(value=0.))
+            if not self.rel_pos_zero_init:
+                # Build the initializer, then call it on the parameter; the old
+                # TruncatedNormal(param, std=0.02) created an object and applied nothing.
+                TruncatedNormal(std=0.02)(table)
+            setattr(self, name, table)
+
+    def get_rel_pos(self, seq_size, rel_pos):
+        """Expand a (rows, head_dim) offset table to (seq_size, seq_size, head_dim).
+
+        The table is linearly resampled first when it does not have
+        2 * seq_size - 1 rows; entry [i, j] is then the row for offset i - j.
+        """
+        num_offsets = int(2 * seq_size - 1)
+        table = rel_pos
+        if table.shape[0] != num_offsets:
+            # F.interpolate runs in NCW layout: (L, C) -> (1, C, L) and back.
+            as_ncw = table.reshape([1, table.shape[0], -1])
+            as_ncw = as_ncw.transpose([0, 2, 1])
+            as_ncw = F.interpolate(
+                as_ncw, size=(num_offsets, ), mode="linear", data_format='NCW')
+            table = as_ncw.reshape([-1, num_offsets]).transpose([1, 0])
+
+        positions = paddle.arange(seq_size, dtype='float32')
+        # Shift i - j by seq_size - 1 so every offset is a non-negative row index.
+        offsets = positions.unsqueeze(-1) - positions.unsqueeze(0)
+        offsets += (seq_size - 1)
+        row_index = offsets.astype('int64').flatten()
+
+        gathered = paddle.index_select(table, row_index)
+        return gathered.reshape([seq_size, seq_size, self.head_dim])
+
+    def add_decomposed_rel_pos(self, attn, q, h, w):
+        """
+        Add decomposed relative positional terms (:paper:`mvitv2`) to attn.
+        Args:
+            attn (Tensor): attention map, reshaped here to (B, h, w, h, w).
+            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+            h, w: grid extents used to look up rel_pos_h / rel_pos_w.
+        Returns:
+            attn (Tensor): attention map with the terms added, shape (B, h * w, h * w).
+        """
+        height_table = self.get_rel_pos(h, self.rel_pos_h)
+        width_table = self.get_rel_pos(w, self.rel_pos_w)
+        batch, _, channels = q.shape
+        q_grid = q.reshape([batch, h, w, channels])
+        # Contract each query with the per-axis tables, then broadcast: the
+        # h-term gets a trailing singleton axis, the w-term one before its last axis.
+        bias_h = paddle.einsum("bhwc,hkc->bhwk", q_grid, height_table)
+        bias_w = paddle.einsum("bhwc,wkc->bhwk", q_grid, width_table)
+        attn = attn.reshape([batch, h, w, h, w])
+        attn = attn + bias_h.unsqueeze(-1) + bias_w.unsqueeze(-2)
+        return attn.reshape([batch, h * w, h * w])
+
+    def forward(self, x):
+        """Multi-head self-attention over a (B, H, W, C) input; the output is reshaped to (B, H, W, C)."""
+        B, H, W, C = paddle.shape(x)
+        if self.q_bias is not None:
+            # Only q and v carry a learnable bias here; k's slot is fixed at zero.
+            qkv_bias = paddle.concat(
+                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
+            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
+        else:
+            qkv = self.qkv(x)
+        # Split heads for BOTH branches; the q_bias path used to skip this and
+        # then indexed the batch axis instead of q/k/v.
+        qkv = qkv.reshape(
+            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
+                [2, 0, 3, 1, 4]).reshape(
+                    [3, B * self.num_heads, H * W, self.head_dim])
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale
+        if self.use_rel_pos:
+            attn = self.add_decomposed_rel_pos(attn, q, H, W)
+
+        attn = self.attn_drop(F.softmax(attn, axis=-1))
+        x = attn.matmul(v).reshape(
+            [B, self.num_heads, H * W, self.head_dim]).transpose(
+                [0, 2, 1, 3]).reshape([B, H, W, C])
+        return self.proj(x)
+
+
+class Block(nn.Layer):
+    """Pre-norm residual block: optionally windowed attention, then an MLP."""
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 qk_scale=None,
+                 init_values=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 use_rel_pos=True,
+                 rel_pos_zero_init=True,
+                 window_size=None,
+                 input_size=None,
+                 act_layer='nn.GELU',
+                 norm_layer='nn.LayerNorm',
+                 lr_factor=1.0,
+                 epsilon=1e-5):
+        super().__init__()
+        self.window_size = window_size
+        # NOTE(review): norm_layer is a config string resolved via eval(); keep it trusted.
+        norm_cls = eval(norm_layer)
+
+        def norm_attr():
+            # Fresh attr per parameter: learning_rate=lr_factor, zero L2 decay.
+            return ParamAttr(
+                learning_rate=lr_factor, regularizer=L2Decay(0.0))
+
+        self.norm1 = norm_cls(
+            dim, weight_attr=norm_attr(), bias_attr=norm_attr(), epsilon=epsilon)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_bias=attn_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            window_size=window_size,
+            input_size=input_size,
+            lr_factor=lr_factor)
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = Identity()
+        self.norm2 = norm_cls(
+            dim, weight_attr=norm_attr(), bias_attr=norm_attr(), epsilon=epsilon)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop,
+                       lr_factor=lr_factor)
+        # Optional learnable per-channel scales for the two residual branches.
+        if init_values is not None:
+            self.gamma_1 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x):
+        """Apply both residual branches to x and return the result."""
+        use_windows = self.window_size is not None
+        branch = self.norm1(x)
+        if use_windows:
+            branch, pad_hw, num_hw = window_partition(branch, self.window_size)
+        branch = self.attn(branch)
+        if self.gamma_1 is not None:
+            branch = self.gamma_1 * branch
+        if use_windows:
+            branch = window_unpartition(branch, pad_hw, num_hw,
+                                        (x.shape[1], x.shape[2]))
+        x = x + self.drop_path(branch)
+        branch = self.mlp(self.norm2(x))
+        if self.gamma_2 is not None:
+            branch = self.gamma_2 * branch
+        return x + self.drop_path(branch)
+
+
+class PatchEmbed(nn.Layer):
+    """Image to patch embedding: a single Conv2D with kernel = stride = patch_size."""
+
+    def __init__(self,
+                 img_size=(224, 224),
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 lr_factor=0.01):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        conv_kwargs = dict(
+            kernel_size=patch_size,
+            stride=patch_size,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+        self.proj = nn.Conv2D(in_chans, embed_dim, **conv_kwargs)
+
+    # NOTE(review): the "h" count reads img_size[1] and the "w" count reads
+    # img_size[0]; confirm which axis order img_size is meant to carry.
+    @property
+    def num_patches_in_h(self):
+        return self.img_size[1] // self.patch_size
+
+    @property
+    def num_patches_in_w(self):
+        return self.img_size[0] // self.patch_size
+
+    def forward(self, x):
+        """Return the patch convolution applied to x."""
+        return self.proj(x)
+
+
+@register
+@serializable
+class VisionTransformer2D(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=(1024, 1024),
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 qk_scale=None,
+                 init_values=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 act_layer='nn.GELU',
+                 norm_layer='nn.LayerNorm',
+                 lr_decay_rate=1.0,
+                 global_attn_indexes=(2, 5, 8, 11),
+                 use_abs_pos=False,
+                 use_rel_pos=False,
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=False,
+                 rel_pos_zero_init=True,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 window_size=None,
+                 out_indices=(11, ),
+                 with_fpn=False,
+                 use_checkpoint=False,
+                 *args,
+                 **kwargs):
+        """Build the patch embedding, optional position embedding, blocks and optional FPN heads."""
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.depth = depth
+        self.global_attn_indexes = global_attn_indexes
+        self.epsilon = epsilon
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        # Patch-grid extent: img_size[0] feeds patch_h and img_size[1] feeds patch_w.
+        self.patch_h = img_size[0] // patch_size
+        self.patch_w = img_size[1] // patch_size
+        self.num_patches = self.patch_h * self.patch_w
+        self.use_abs_pos = use_abs_pos
+        self.use_abs_pos_emb = use_abs_pos_emb
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+        # Per-block drop_path values, linearly spaced from 0 to drop_path_rate.
+        dpr = np.linspace(0, drop_path_rate, depth)
+        if use_checkpoint:
+            paddle.seed(0)
+        # Position embedding: learned (trunc-normal init), frozen sin-cos, or none.
+        if use_abs_pos_emb:
+            self.pos_w = self.patch_embed.num_patches_in_w
+            self.pos_h = self.patch_embed.num_patches_in_h
+            self.pos_embed = self.create_parameter(
+                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+                default_initializer=paddle.nn.initializer.TruncatedNormal(
+                    std=.02))
+        elif use_sincos_pos_emb:
+            pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
+                                                              self.patch_w)
+            self.pos_embed = pos_embed
+            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
+            self.pos_embed.set_value(pos_embed.numpy())
+            self.pos_embed.stop_gradient = True
+        else:
+            self.pos_embed = None
+        # Blocks listed in global_attn_indexes get window_size=None; the rest get window_size.
+        self.blocks = nn.LayerList([
+            Block(
+                embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                attn_bias=attn_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=None
+                if i in self.global_attn_indexes else window_size,
+                input_size=[self.patch_h, self.patch_w],
+                act_layer=act_layer,
+                lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
+                norm_layer=norm_layer,
+                init_values=init_values,
+                epsilon=epsilon) for i in range(depth)
+        ])
+
+        assert len(out_indices) <= 4, 'out_indices out of bound'
+        self.out_indices = out_indices
+        self.pretrained = pretrained
+        self.init_weight()
+        # NOTE(review): init_weight() above runs before init_fpn() below creates the fpn layers.
+        self.out_channels = [embed_dim for _ in range(len(out_indices))]
+        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
+        ]
+        self.norm = Identity()
+        if self.with_fpn:
+            self.init_fpn(
+                embed_dim=embed_dim,
+                patch_size=patch_size,
+                out_with_norm=final_norm)
+
+    def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
+        return lr_decay_rate**(self.depth - layer_id)  # exponent shrinks as layer_id approaches depth
+
+    def init_weight(self):
+        """Load ``self.pretrained`` (URL or local path) into this layer, if set.
+
+        With use_abs_pos_emb, a checkpoint ``pos_embed`` of a different shape
+        is resized first, assuming a square source grid plus one leading slot.
+        """
+        pretrained = self.pretrained
+        if not pretrained:
+            return
+
+        if 'http' in pretrained:
+            path = paddle.utils.download.get_weights_path_from_url(pretrained)
+        else:
+            path = pretrained
+
+        load_state_dict = paddle.load(path)
+        pos_embed_name = "pos_embed"
+        if pos_embed_name in load_state_dict and self.use_abs_pos_emb:
+            load_pos_embed = paddle.to_tensor(
+                load_state_dict[pos_embed_name], dtype="float32")
+            if self.pos_embed.shape != load_pos_embed.shape:
+                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
+                # Write the resized tensor straight into the dict being loaded; the
+                # former detour built a full self.state_dict() just to hold one key.
+                load_state_dict[pos_embed_name] = self.resize_pos_embed(
+                    load_pos_embed, (pos_size, pos_size),
+                    (self.pos_h, self.pos_w))
+                print("Load pos_embed and resize it from {} to {} .".format(
+                    load_pos_embed.shape, self.pos_embed.shape))
+
+        self.set_state_dict(load_state_dict)
+        print("Load load_state_dict....")
+
+    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
+        """Create the rescaling heads `fpn1`..`fpn4` and the output norm.
+        Args:
+            embed_dim (int): channel count kept by every head.
+            patch_size (int): patch size of the backbone; 16 or 8.
+            out_with_norm (bool): if True `self.norm` becomes
+                `nn.LayerNorm(embed_dim)`, otherwise `Identity`.
+        Raises:
+            ValueError: for any other `patch_size`, instead of silently
+                leaving the heads undefined.
+        """
+        def up2x():
+            # kernel 2 / stride 2 transposed conv: doubles H and W
+            return nn.Conv2DTranspose(
+                embed_dim, embed_dim, kernel_size=2, stride=2)
+
+        if patch_size == 16:
+            self.fpn1 = nn.Sequential(
+                up2x(), nn.BatchNorm2D(embed_dim), nn.GELU(), up2x())
+            self.fpn2 = nn.Sequential(up2x())
+            self.fpn3 = Identity()
+            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
+        elif patch_size == 8:
+            self.fpn1 = nn.Sequential(up2x())
+            self.fpn2 = Identity()
+            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2))
+            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4))
+        else:
+            raise ValueError(
+                "patch_size must be 8 or 16, got {}".format(patch_size))
+        self.norm = nn.LayerNorm(
+            embed_dim, epsilon=self.epsilon) if out_with_norm else Identity()
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Interpolate the grid part of a position embedding to a new size.
+        Args:
+            pos_embed (Tensor): embedding whose axis 1 holds one leading
+                token followed by old_hw[0] * old_hw[1] grid tokens
+            old_hw (list[int]): current grid height and width
+            new_hw (list[int]): target grid height and width
+        Returns:
+            Tensor: the leading token followed by the resized grid tokens
+        """
+        old_h, old_w = old_hw[0], old_hw[1]
+        # the first token is not spatial, so it bypasses the interpolation
+        lead_token, grid_tokens = pos_embed[:, :1, :], pos_embed[:, 1:, :]
+        grid_map = grid_tokens.transpose([0, 2, 1]).reshape(
+            [1, -1, old_h, old_w])
+        grid_map = F.interpolate(
+            grid_map, new_hw, mode='bicubic', align_corners=False)
+        grid_tokens = grid_map.flatten(2).transpose([0, 2, 1])
+
+        return paddle.concat([lead_token, grid_tokens], axis=1)
+
+    def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):  # returns [1, h, w, embed_dim]
+        grid_y, grid_x = paddle.meshgrid(
+            paddle.arange(
+                h, dtype=paddle.float32),
+            paddle.arange(
+                w, dtype=paddle.float32))
+        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        pos_dim = self.embed_dim // 4  # a quarter each for sin(y), cos(y), sin(x), cos(x)
+        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        omega = (1. / (temperature**omega)).unsqueeze(0)  # [1, pos_dim] frequencies
+
+        out_x = grid_x.reshape([-1, 1]).matmul(omega)  # [h * w, pos_dim]
+        out_y = grid_y.reshape([-1, 1]).matmul(omega)  # [h * w, pos_dim]
+
+        pos_emb = paddle.concat(
+            [
+                paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
+                paddle.cos(out_x)
+            ],
+            axis=1)
+
+        return pos_emb.reshape([1, h, w, self.embed_dim])
+
+    def forward(self, inputs):  # inputs: mapping with an 'image' entry; returns a list of feature maps
+        x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])  # axis 1 moved last (presumably NCHW -> NHWC)
+        B, Hp, Wp, _ = paddle.shape(x)
+
+        if self.use_abs_pos:
+            x = x + self.get_2d_sincos_position_embedding(Hp, Wp)
+
+        if self.use_abs_pos_emb:  # NOTE(review): resize_pos_embed returns a 3-D tensor with a leading token; confirm it adds to the 4-D x
+            x = x + self.resize_pos_embed(self.pos_embed,
+                                          (self.pos_h, self.pos_w), (Hp, Wp))
+
+        feats = []
+        for idx, blk in enumerate(self.blocks):
+            if self.use_checkpoint and self.training:  # recompute path is taken only in training mode
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, **{"preserve_rng_state": True})
+            else:
+                x = blk(x)
+            if idx in self.out_indices:
+                feats.append(self.norm(x.transpose([0, 3, 1, 2])))  # last axis moved back to position 1 before norm
+
+        if self.with_fpn:
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
+            for i in range(len(feats)):  # the i-th collected feature goes through the i-th head
+                feats[i] = fpns[i](feats[i])
+        return feats
+
+    @property
+    def num_layers(self):
+        return len(self.blocks)  # number of entries in self.blocks
+
+    @property
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}  # presumably parameter names exempt from weight decay; consumer not visible here
+
+    @property
+    def out_shape(self):
+        # one ShapeSpec per output level, pairing channels with strides
+        specs = []
+        for n_ch, step in zip(self.out_channels, self.out_strides):
+            specs.append(ShapeSpec(channels=n_ch, stride=step))
+        return specs
+
+
+class LayerNorm(nn.Layer):
+    """
+    Channel-wise LayerNorm for inputs of shape (batch_size, channels, height,
+    width): mean/variance are taken over axis 1, then a per-channel scale and
+    shift are applied. Used by SimpleFeaturePyramid; in ViT, we use the
+    nn.LayerNorm. The scale starts at 1 and the shift at 0 (identity affine);
+    create_parameter's default initializer would otherwise randomize both.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6):
+        super().__init__()
+        self.weight = self.create_parameter(
+            [normalized_shape],
+            default_initializer=nn.initializer.Constant(1.0))
+        self.bias = self.create_parameter([normalized_shape], is_bias=True)
+        self.eps = eps
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        u = x.mean(1, keepdim=True)  # per-position mean over channels
+        s = (x - u).pow(2).mean(1, keepdim=True)  # biased variance over channels
+        x = (x - u) / paddle.sqrt(s + self.eps)
+        return self.weight[:, None, None] * x + self.bias[:, None, None]
+
+
+@register
+@serializable
+class SimpleFeaturePyramid(nn.Layer):
+    """Build a multi-level feature pyramid from one backbone feature map."""
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 spatial_scales,
+                 num_levels=4,
+                 use_bias=False):
+        """
+        Args:
+            in_channels (list[int]): input channels of each level which can be
+                derived from the output shape of backbone by from_config
+            out_channels (int): output channel of each level.
+            spatial_scales (list[float]): list of scaling factors to upsample or downsample
+                the input features for creating pyramid features which can be derived from
+                the output shape of backbone by from_config
+            num_levels (int): number of levels of output features (4 or 5).
+            use_bias (bool): whether use bias or not.
+        """
+        super(SimpleFeaturePyramid, self).__init__()
+        self.in_channels = in_channels[0]
+        self.out_channels = out_channels
+        self.num_levels = num_levels
+        self.stages = []
+        dim = self.in_channels
+        if num_levels == 4:
+            scale_factors = [2.0, 1.0, 0.5]
+        elif num_levels == 5:
+            scale_factors = [4.0, 2.0, 1.0, 0.5]
+        else:
+            raise NotImplementedError(
+                f"num_levels={num_levels} is not supported yet.")
+
+        # one stage per scale: rescale, then 1x1 and 3x3 convs, each followed by LayerNorm
+        for idx, scale in enumerate(scale_factors):
+            out_dim = dim
+            if scale == 4.0:
+                layers = [
+                    nn.Conv2DTranspose(
+                        dim, dim // 2, kernel_size=2, stride=2),
+                    LayerNorm(dim // 2),  # channel-wise; nn.LayerNorm would normalize the last (width) axis of NCHW
+                    nn.GELU(),
+                    nn.Conv2DTranspose(
+                        dim // 2, dim // 4, kernel_size=2, stride=2),
+                ]
+                out_dim = dim // 4
+            elif scale == 2.0:
+                layers = [
+                    nn.Conv2DTranspose(
+                        dim, dim // 2, kernel_size=2, stride=2)
+                ]
+                out_dim = dim // 2
+            elif scale == 1.0:
+                layers = []
+            elif scale == 0.5:
+                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]
+
+            layers.extend([
+                nn.Conv2D(
+                    out_dim, out_channels, kernel_size=1, bias_attr=use_bias),
+                LayerNorm(out_channels),
+                nn.Conv2D(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    padding=1,
+                    bias_attr=use_bias),
+                LayerNorm(out_channels),
+            ])
+            layers = nn.Sequential(*layers)
+
+            stage = -int(math.log2(spatial_scales[0] * scale_factors[idx]))
+            self.add_sublayer(f"simfp_{stage}", layers)  # registers the stage; self.stages is a plain list
+            self.stages.append(layers)
+
+        # top block: extra level obtained by stride-2 subsampling of the last stage output.
+        self.top_block = nn.Sequential(
+            nn.MaxPool2D(
+                kernel_size=1, stride=2, padding=0))
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            'in_channels': [i.channels for i in input_shape],
+            'spatial_scales': [1.0 / i.stride for i in input_shape],
+        }
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(channels=self.out_channels)
+            for _ in range(self.num_levels)
+        ]
+
+    def forward(self, feats):
+        """
+        Args:
+            feats (list[Tensor]): only feats[0], of shape (N,C,H,W), is used.
+        """
+        features = feats[0]
+        results = []
+
+        for stage in self.stages:
+            results.append(stage(features))
+
+        top_block_in_feature = results[-1]
+        results.append(self.top_block(top_block_in_feature))
+        assert self.num_levels == len(results)
+
+        return results
diff --git a/rtdetr_paddle/ppdet/modeling/bbox_utils.py b/rtdetr_paddle/ppdet/modeling/bbox_utils.py
new file mode 100644
index 0000000..576cbbf
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/bbox_utils.py
@@ -0,0 +1,607 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import numpy as np
+
+
+def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
+    """Encode target boxes as weighted (dx, dy, dw, dh) offsets from the
+    source boxes; both inputs are read as [:, 0..3] = x1, y1, x2, y2."""
+    # size and centre of the reference (source) boxes
+    ref_w = src_boxes[:, 2] - src_boxes[:, 0]
+    ref_h = src_boxes[:, 3] - src_boxes[:, 1]
+    ref_cx = src_boxes[:, 0] + 0.5 * ref_w
+    ref_cy = src_boxes[:, 1] + 0.5 * ref_h
+    # same quantities for the boxes being encoded
+    gt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
+    gt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
+    gt_cx = tgt_boxes[:, 0] + 0.5 * gt_w
+    gt_cy = tgt_boxes[:, 1] + 0.5 * gt_h
+
+    wx, wy, ww, wh = weights
+    shift_x = wx * (gt_cx - ref_cx) / ref_w
+    shift_y = wy * (gt_cy - ref_cy) / ref_h
+    log_w = ww * paddle.log(gt_w / ref_w)
+    log_h = wh * paddle.log(gt_h / ref_h)
+
+    return paddle.stack((shift_x, shift_y, log_w, log_h), axis=1)
+
+
+def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
+    """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.
+    Note: the result has shape [n, deltas.shape[1] // 4, 4], i.e. [n,1,4] for one box per row.
+        If you want to add a reshape, please add after the calling code instead of here.
+    """
+    clip_scale = math.log(1000.0 / 16)  # upper bound applied to dw / dh before exp()
+
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    ctr_x = boxes[:, 0] + 0.5 * widths
+    ctr_y = boxes[:, 1] + 0.5 * heights
+
+    wx, wy, ww, wh = weights
+    dx = deltas[:, 0::4] / wx  # every 4th column, so several boxes per row are supported
+    dy = deltas[:, 1::4] / wy
+    dw = deltas[:, 2::4] / ww
+    dh = deltas[:, 3::4] / wh
+    # Prevent sending too large values into paddle.exp()
+    dw = paddle.clip(dw, max=clip_scale)
+    dh = paddle.clip(dh, max=clip_scale)
+
+    pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)
+    pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)
+    pred_w = paddle.exp(dw) * widths.unsqueeze(1)
+    pred_h = paddle.exp(dh) * heights.unsqueeze(1)
+
+    pred_boxes = []
+    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
+    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
+    pred_boxes = paddle.stack(pred_boxes, axis=-1)  # x1, y1, x2, y2 on a new last axis
+
+    if max_shape is not None:  # max_shape is indexed as (height, width)
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
+    return pred_boxes
+
+
+def bbox2delta_v2(src_boxes,
+                  tgt_boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0]):
+    """Encode bboxes to deltas.
+    Modified from bbox2delta(): the raw deltas are normalized as (delta - delta_mean) / delta_std instead of being multiplied by weights.
+    """
+    src_w = src_boxes[:, 2] - src_boxes[:, 0]
+    src_h = src_boxes[:, 3] - src_boxes[:, 1]
+    src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
+    src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
+
+    tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
+    tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
+    tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
+    tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
+
+    dx = (tgt_ctr_x - src_ctr_x) / src_w  # centre shift relative to the source size
+    dy = (tgt_ctr_y - src_ctr_y) / src_h
+    dw = paddle.log(tgt_w / src_w)  # log of the size ratio
+    dh = paddle.log(tgt_h / src_h)
+
+    deltas = paddle.stack((dx, dy, dw, dh), axis=1)
+    deltas = (
+        deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std)
+    return deltas
+
+
+def delta2bbox_v2(deltas,
+                  boxes,
+                  delta_mean=[0.0, 0.0, 0.0, 0.0],
+                  delta_std=[1.0, 1.0, 1.0, 1.0],
+                  max_shape=None,
+                  ctr_clip=32.0):
+    """Decode deltas to bboxes.
+    Modified from delta2bbox(): deltas are de-normalized as delta * delta_std + delta_mean instead of being divided by weights.
+    Used in YOLOFHead.
+    Note: the result has shape [n, deltas.shape[1] // 4, 4], i.e. [n,1,4] for one box per row.
+        If you want to add a reshape, please add after the calling code instead of here.
+    """
+    clip_scale = math.log(1000.0 / 16)  # bound applied to dw / dh before exp()
+
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    ctr_x = boxes[:, 0] + 0.5 * widths
+    ctr_y = boxes[:, 1] + 0.5 * heights
+
+    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
+    dx = deltas[:, 0::4]
+    dy = deltas[:, 1::4]
+    dw = deltas[:, 2::4]
+    dh = deltas[:, 3::4]
+
+    # Scale the centre offsets by the box size, then clip offsets and log-sizes
+    dx = dx * widths.unsqueeze(1)
+    dy = dy * heights.unsqueeze(1)
+    if ctr_clip is not None:
+        dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)
+        dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)
+        dw = paddle.clip(dw, max=clip_scale)  # upper bound only in this branch
+        dh = paddle.clip(dh, max=clip_scale)
+    else:
+        dw = dw.clip(min=-clip_scale, max=clip_scale)  # two-sided bound when centres are not clipped
+        dh = dh.clip(min=-clip_scale, max=clip_scale)
+
+    pred_ctr_x = dx + ctr_x.unsqueeze(1)
+    pred_ctr_y = dy + ctr_y.unsqueeze(1)
+    pred_w = paddle.exp(dw) * widths.unsqueeze(1)
+    pred_h = paddle.exp(dh) * heights.unsqueeze(1)
+
+    pred_boxes = []
+    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
+    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
+    pred_boxes = paddle.stack(pred_boxes, axis=-1)  # x1, y1, x2, y2 on a new last axis
+
+    if max_shape is not None:  # max_shape is indexed as (height, width)
+        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
+            min=0, max=max_shape[1])
+        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
+            min=0, max=max_shape[0])
+    return pred_boxes
+
+
+def expand_bbox(bboxes, scale):
+    """Scale [x1, y1, x2, y2] rows about their centres into a float32 array."""
+    x1, y1, x2, y2 = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
+    center_x, center_y = (x2 + x1) * .5, (y2 + y1) * .5
+    half_w = (x2 - x1) * .5
+    half_h = (y2 - y1) * .5
+    # in-place scaling keeps the dtype and shape of the half extents
+    half_w *= scale
+    half_h *= scale
+
+    expanded = np.zeros(bboxes.shape, dtype=np.float32)
+    expanded[:, 0] = center_x - half_w
+    expanded[:, 1] = center_y - half_h
+    expanded[:, 2] = center_x + half_w
+    expanded[:, 3] = center_y + half_h
+    return expanded
+
+
+def clip_bbox(boxes, im_shape):
+    """Clamp x to [0, im_shape[1]] and y to [0, im_shape[0]]; returns [n, 4]."""
+    limit_y, limit_x = im_shape[0], im_shape[1]
+    clipped = [
+        boxes[:, col].clip(0, limit_y if col % 2 else limit_x) for col in range(4)
+    ]
+    return paddle.stack(clipped, axis=1)
+
+
+def nonempty_bbox(boxes, min_size=0, return_mask=False):
+    """Find boxes whose width and height both exceed min_size (mask or indices)."""
+    widths, heights = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
+    big_enough = paddle.logical_and(heights > min_size, widths > min_size)
+    if not return_mask:
+        # flat indices of the boxes that passed
+        return paddle.nonzero(big_enough).flatten()
+    return big_enough
+
+
+def bbox_area(boxes):
+    return (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])  # height * width per row
+
+
+def bbox_overlaps(boxes1, boxes2):
+    """
+    Calculate pairwise IoU between boxes1 and boxes2 (all zeros if either set is empty)
+
+    Args:
+        boxes1 (Tensor): boxes with shape [M, 4]
+        boxes2 (Tensor): boxes with shape [N, 4]
+
+    Return:
+        overlaps (Tensor): IoU of every pair, shape [M, N]; 0 where a pair does not intersect
+    """
+    M = boxes1.shape[0]
+    N = boxes2.shape[0]
+    if M * N == 0:
+        return paddle.zeros([M, N], dtype='float32')
+    area1 = bbox_area(boxes1)
+    area2 = bbox_area(boxes2)
+
+    xy_max = paddle.minimum(
+        paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])  # bottom-right of each intersection
+    xy_min = paddle.maximum(
+        paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])  # top-left of each intersection
+    width_height = xy_max - xy_min
+    width_height = width_height.clip(min=0)
+    inter = width_height.prod(axis=2)
+
+    overlaps = paddle.where(inter > 0, inter /
+                            (paddle.unsqueeze(area1, 1) + area2 - inter),
+                            paddle.zeros_like(inter))
+    return overlaps
+
+
+def batch_bbox_overlaps(bboxes1,
+                        bboxes2,
+                        mode='iou',
+                        is_aligned=False,
+                        eps=1e-6):
+    """Calculate overlap between two set of bboxes.
+    If ``is_aligned`` is ``False``, then calculate the overlaps between each
+    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
+    pair of bboxes1 and bboxes2.
+    Args:
+        bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
+        bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
+            The column indexing below assumes 2-D inputs (no batch dims).
+        mode (str): "iou" (intersection over union), "iof" (intersection over
+            foreground) or "giou" (generalized IoU).
+        is_aligned (bool, optional): If True, then m and n must be equal.
+            Default False.
+        eps (float, optional): Lower bound applied to the denominators for
+            numerical stability. Default 1e-6.
+    Returns:
+        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,);
+            for "giou" the returned value is ``1 - giou``.
+    """
+    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
+    # Either the boxes are empty or the length of the boxes' last dimension is 4
+    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
+    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
+
+    # Batch dim must be the same
+    # Batch dim: (B1, B2, ... Bn)
+    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
+    batch_shape = bboxes1.shape[:-2]
+
+    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
+    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        if is_aligned:  # Tensor.shape is a list, so build the shape as a list (list + tuple raises)
+            return paddle.full(list(batch_shape) + [rows], 1)
+        else:
+            return paddle.full(list(batch_shape) + [rows, cols], 1)
+
+    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
+    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
+
+    if is_aligned:
+        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
+        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
+
+        wh = (rb - lt).clip(min=0)  # [rows, 2]
+        overlap = wh[:, 0] * wh[:, 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1 + area2 - overlap
+        else:
+            union = area1
+        if mode == 'giou':
+            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
+            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
+    else:
+        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
+                            bboxes2[:, :2])  # [rows, cols, 2]
+        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
+                            bboxes2[:, 2:])  # [rows, cols, 2]
+
+        wh = (rb - lt).clip(min=0)  # [rows, cols, 2]
+        overlap = wh[:, :, 0] * wh[:, :, 1]
+
+        if mode in ['iou', 'giou']:
+            union = area1.reshape([rows,1]) \
+                    + area2.reshape([1,cols]) - overlap
+        else:
+            union = area1[:, None]
+        if mode == 'giou':
+            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
+                                         bboxes2[:, :2])
+            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
+                                         bboxes2[:, 2:])
+
+    eps = paddle.to_tensor([eps])
+    union = paddle.maximum(union, eps)
+    ious = overlap / union
+    if mode in ['iou', 'iof']:
+        return ious
+    # calculate gious; `...` indexing works for both the aligned [rows, 2] and pairwise [rows, cols, 2] case
+    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+    enclose_area = paddle.maximum(enclose_area, eps)
+    gious = ious - (enclose_area - union) / enclose_area
+    return 1 - gious
+
+
+def xywh2xyxy(box):
+    """Convert a centre-format (x, y, w, h) box to [x1, y1, x2, y2] corners."""
+    cx, cy, width, height = box
+    half_w = width * 0.5
+    half_h = height * 0.5
+    # corners sit half an extent either side of the centre
+    return [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
+
+
+def make_grid(h, w, dtype):
+    rows, cols = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])  # each [h, w]
+    return paddle.stack((cols, rows), 2).cast(dtype=dtype)  # [h, w, 2] holding (x, y)
+
+
+def decode_yolo(box, anchor, downsample_ratio):
+    """decode yolo box
+
+    Args:
+        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
+        anchor (list): anchor with the shape [na, 2]
+        downsample_ratio (int): downsample ratio; together with the grid size
+            it normalizes the anchor-scaled w and h
+
+    Return:
+        box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]; x and y are divided by the grid width / height
+    """
+    x, y, w, h = box
+    na, grid_h, grid_w = x.shape[1:4]
+    grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
+    x1 = (x + grid[:, :, :, :, 0:1]) / grid_w  # add the cell column index, then normalize
+    y1 = (y + grid[:, :, :, :, 1:2]) / grid_h  # add the cell row index, then normalize
+
+    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
+    anchor = anchor.reshape((1, na, 1, 1, 2))
+    w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
+    h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
+
+    return [x1, y1, w1, h1]
+
+
+def batch_iou_similarity(box1, box2, eps=1e-9):
+    """Pairwise IoU between two batched box sets.
+
+    Args:
+        box1 (Tensor): boxes shaped [N, M1, 4], read as x1, y1, x2, y2
+        box2 (Tensor): boxes shaped [N, M2, 4], read as x1, y1, x2, y2
+
+    Return:
+        iou (Tensor): [N, M1, M2]; eps is added to the union before dividing
+    """
+    lhs = box1.unsqueeze(2)  # [N, M1, 1, 4]
+    rhs = box2.unsqueeze(1)  # [N, 1, M2, 4]
+    lhs_min, lhs_max = lhs[:, :, :, 0:2], lhs[:, :, :, 2:4]
+    rhs_min, rhs_max = rhs[:, :, :, 0:2], rhs[:, :, :, 2:4]
+    def extent_area(lo, hi):
+        # negative extents are clamped to 0 before multiplying w * h
+        return (hi - lo).clip(0).prod(-1)
+    inter = extent_area(
+        paddle.maximum(lhs_min, rhs_min), paddle.minimum(lhs_max, rhs_max))
+    union = extent_area(lhs_min, lhs_max) + extent_area(rhs_min, rhs_max) - inter + eps
+    return inter / union
+
+
+def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
+    """calculate the iou of box1 and box2
+
+    Args:
+        box1 (list): [x1, y1, x2, y2] corners, all have the shape [b, na, h, w, 1]
+        box2 (list): [x1, y1, x2, y2] corners, all have the shape [b, na, h, w, 1]
+        giou (bool): whether use giou or not, default False (checked before diou / ciou)
+        diou (bool): whether use diou or not, default False (checked before ciou)
+        ciou (bool): whether use ciou or not, default False
+        eps (float): epsilon to avoid divide by zero
+
+    Return:
+        iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]
+    """
+    px1, py1, px2, py2 = box1
+    gx1, gy1, gx2, gy2 = box2
+    x1 = paddle.maximum(px1, gx1)
+    y1 = paddle.maximum(py1, gy1)
+    x2 = paddle.minimum(px2, gx2)
+    y2 = paddle.minimum(py2, gy2)
+
+    overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))
+
+    area1 = (px2 - px1) * (py2 - py1)
+    area1 = area1.clip(0)
+
+    area2 = (gx2 - gx1) * (gy2 - gy1)
+    area2 = area2.clip(0)
+
+    union = area1 + area2 - overlap + eps
+    iou = overlap / union
+
+    if giou or ciou or diou:
+        # width and height of the smallest box enclosing both inputs
+        cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
+        ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
+        if giou:
+            c_area = cw * ch + eps
+            return iou - (c_area - union) / c_area
+        else:
+            # squared diagonal of the enclosing box
+            c2 = cw**2 + ch**2 + eps
+            # squared distance between the two box centres
+            rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
+            if diou:
+                return iou - rho2 / c2
+            else:
+                w1, h1 = px2 - px1, py2 - py1 + eps
+                w2, h2 = gx2 - gx1, gy2 - gy1 + eps
+                delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
+                v = (4 / math.pi**2) * paddle.pow(delta, 2)  # aspect-ratio mismatch term
+                alpha = v / (1 + eps - iou + v)
+                alpha.stop_gradient = True  # alpha is used as a constant weight in backprop
+                return iou - (rho2 / c2 + v * alpha)
+    else:
+        return iou
+
+
+def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
+    """
+    Pairwise IoU of two box sets, computed with numpy.
+
+    Args:
+        box1 (ndarray): [N, 4]
+        box2 (ndarray): [M, 4], usually N != M
+        x1y1x2y2 (bool): True for corner format, False for cxcywh
+        eps (float): epsilon to avoid divide by zero
+    Return:
+        iou (ndarray): iou of box1 and box2, [N, M]
+    """
+    N, M = len(box1), len(box2)  # usually N != M
+
+    def corner_columns(boxes):
+        # x1, y1, x2, y2 columns; cxcywh input is converted first
+        if x1y1x2y2:
+            return boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
+        return (boxes[:, 0] - boxes[:, 2] / 2, boxes[:, 1] - boxes[:, 3] / 2,
+                boxes[:, 0] + boxes[:, 2] / 2, boxes[:, 1] + boxes[:, 3] / 2)
+
+    b1_x1, b1_y1, b1_x2, b1_y2 = corner_columns(box1)
+    b2_x1, b2_y1, b2_x2, b2_y2 = corner_columns(box2)
+
+    def pairwise(op, col1, col2):
+        # [N, M] float32 table of op(col1[n], col2[m])
+        table = np.zeros((N, M), dtype=np.float32)
+        for m in range(len(box2)):
+            table[:, m] = op(col1, col2[m])
+        return table
+
+    # corners of the intersection rectangle for every pair
+    inter_rect_x1 = pairwise(np.maximum, b1_x1, b2_x1)
+    inter_rect_y1 = pairwise(np.maximum, b1_y1, b2_y1)
+    inter_rect_x2 = pairwise(np.minimum, b1_x2, b2_x2)
+    inter_rect_y2 = pairwise(np.minimum, b1_y2, b2_y2)
+    inter_w = np.maximum(inter_rect_x2 - inter_rect_x1, 0)
+    inter_h = np.maximum(inter_rect_y2 - inter_rect_y1, 0)
+    inter_area = inter_w * inter_h
+
+    # per-box areas tiled to [N, M]
+    b1_area = np.repeat(
+        ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1)
+    b2_area = np.repeat(
+        ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0)
+
+    return inter_area / (b1_area + b2_area - inter_area + eps)
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+    """Encode boxes as distances from points to the four box sides.
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        bbox (Tensor): Shape (n, 4), "xyxy" format
+        max_dis (float): if given, distances are clamped to [0, max_dis - eps].
+        eps (float): margin that keeps clamped distances below max_dis
+    Returns:
+        Tensor: (left, top, right, bottom) distances stacked on the last axis.
+    """
+    px, py = points[:, 0], points[:, 1]
+    # order matters: left, top, right, bottom
+    sides = [
+        px - bbox[:, 0], py - bbox[:, 1],
+        bbox[:, 2] - px, bbox[:, 3] - py
+    ]
+    if max_dis is not None:
+        upper = max_dis - eps
+        sides = [side.clip(min=0, max=upper) for side in sides]
+    return paddle.stack(sides, -1)
+
+
+def distance2bbox(points, distance, max_shape=None):
+    """Turn per-point side distances back into corner-format boxes.
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        distance (Tensor): Distance from the given point to 4
+            boundaries (left, top, right, bottom).
+        max_shape (tuple): if given, x is clamped to [0, max_shape[1]]
+            and y to [0, max_shape[0]].
+    Returns:
+        Tensor: (x1, y1, x2, y2) stacked on the last axis.
+    """
+    px, py = points[:, 0], points[:, 1]
+    corners = [
+        px - distance[:, 0], py - distance[:, 1],
+        px + distance[:, 2], py + distance[:, 3]
+    ]
+    if max_shape is not None:
+        bounds = (max_shape[1], max_shape[0]) * 2  # x, y, x, y limits
+        corners = [c.clip(min=0, max=b) for c, b in zip(corners, bounds)]
+    return paddle.stack(corners, -1)
+
+
+def bbox_center(boxes):
+    """Midpoint of each box.
+    Args:
+        boxes (Tensor): shape (..., 4), last axis is xmin, ymin, xmax, ymax.
+    Returns:
+        Tensor: shape (..., 2), last axis is cx, cy.
+    """
+    # average the min and max coordinate along each image axis
+    mids = [(boxes[..., k] + boxes[..., k + 2]) / 2 for k in range(2)]
+    return paddle.stack(mids, axis=-1)
+
+
+def batch_distance2bbox(points, distance, max_shapes=None):
+    """Decode distance prediction to bounding box for batch.
+    Args:
+        points (Tensor): [B, ..., 2], "xy" format
+        distance (Tensor): [B, ..., 4], "ltrb" format
+        max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image; when given, boxes are clamped to [0, w] / [0, h].
+    Returns:
+        Tensor: Decoded bboxes, "x1y1x2y2" format.
+    """
+    lt, rb = paddle.split(distance, 2, -1)
+    # while tensor add parameters, parameters should be better placed on the second place
+    x1y1 = -lt + points
+    x2y2 = rb + points
+    out_bbox = paddle.concat([x1y1, x2y2], -1)
+    if max_shapes is not None:
+        max_shapes = max_shapes.flip(-1).tile([1, 2])  # (h, w) -> (w, h, w, h) per image
+        delta_dim = out_bbox.ndim - max_shapes.ndim
+        for _ in range(delta_dim):
+            max_shapes.unsqueeze_(1)  # in-place on the tiled copy; adds axes so it broadcasts against out_bbox
+        out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
+        out_bbox = paddle.where(out_bbox > 0, out_bbox,
+                                paddle.zeros_like(out_bbox))
+    return out_bbox
+
+
+def iou_similarity(box1, box2, eps=1e-10):
+    """Pairwise IoU between two box sets.
+
+    Args:
+        box1 (Tensor): boxes shaped [M1, 4], read as x1, y1, x2, y2
+        box2 (Tensor): boxes shaped [M2, 4], read as x1, y1, x2, y2
+
+    Return:
+        iou (Tensor): [M1, M2]; eps is added to the union before dividing
+    """
+    lhs = box1.unsqueeze(1)  # [M1, 1, 4]
+    rhs = box2.unsqueeze(0)  # [1, M2, 4]
+    lhs_min, lhs_max = lhs[:, :, 0:2], lhs[:, :, 2:4]
+    rhs_min, rhs_max = rhs[:, :, 0:2], rhs[:, :, 2:4]
+    def extent_area(lo, hi):
+        # negative extents are clamped to 0 before multiplying w * h
+        return (hi - lo).clip(0).prod(-1)
+    inter = extent_area(
+        paddle.maximum(lhs_min, rhs_min), paddle.minimum(lhs_max, rhs_max))
+    union = extent_area(lhs_min, lhs_max) + extent_area(rhs_min, rhs_max) - inter + eps
+    return inter / union
diff --git a/rtdetr_paddle/ppdet/modeling/cls_utils.py b/rtdetr_paddle/ppdet/modeling/cls_utils.py
new file mode 100644
index 0000000..3ae8d11
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/cls_utils.py
@@ -0,0 +1,40 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _get_class_default_kwargs(cls, *args, **kwargs):
+    """
+    Get the default arguments of a class in dict format. If args and
+    kwargs are specified, they replace the default arguments.
+    """
+    varnames = cls.__init__.__code__.co_varnames
+    argcount = cls.__init__.__code__.co_argcount
+    keys = varnames[:argcount]
+    assert keys[0] == 'self'
+    keys = keys[1:]
+
+    values = list(cls.__init__.__defaults__)
+    assert len(values) == len(keys)
+
+    if len(args) > 0:
+        for i, arg in enumerate(args):
+            values[i] = arg
+
+    default_kwargs = dict(zip(keys, values))
+
+    if len(kwargs) > 0:
+        for k, v in kwargs.items():
+            default_kwargs[k] = v
+
+    return default_kwargs
diff --git a/rtdetr_paddle/ppdet/modeling/heads/__init__.py b/rtdetr_paddle/ppdet/modeling/heads/__init__.py
new file mode 100644
index 0000000..ccd9c24
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/heads/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .detr_head import *
+
diff --git a/rtdetr_paddle/ppdet/modeling/heads/detr_head.py b/rtdetr_paddle/ppdet/modeling/heads/detr_head.py
new file mode 100644
index 0000000..fde4bb4
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/heads/detr_head.py
@@ -0,0 +1,534 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from ..initializer import linear_init_, constant_
+from ..transformers.utils import inverse_sigmoid
+
+import pycocotools.mask as mask_util
+
+__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead']
+
+
+class MLP(nn.Layer):
+    """This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/detr.py
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.LayerList(
+            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for l in self.layers:
+            linear_init_(l)
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+class MultiHeadAttentionMap(nn.Layer):
+    """This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
+
+        This is a 2D attention module, which only returns the attention softmax (no multiplication by value)
+    """
+
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
+                 bias=True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierUniform())
+        bias_attr = paddle.framework.ParamAttr(
+            initializer=paddle.nn.initializer.Constant()) if bias else False
+
+        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
+        self.k_proj = nn.Conv2D(
+            query_dim,
+            hidden_dim,
+            1,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5
+
+    def forward(self, q, k, mask=None):
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\
+                                      self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]
+        qh = q.reshape([bs, num_queries, n, c])
+        kh = k.reshape([bs, n, c, h, w])
+        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
+        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
+        kh = kh.reshape([-1, c, h * w])
+        weights = paddle.bmm(qh * self.normalize_fact, kh).reshape(
+            [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4])
+
+        if mask is not None:
+            weights += mask
+        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
+        weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape)
+        weights = self.dropout(weights)
+        return weights
+
+
+class MaskHeadFPNConv(nn.Layer):
+    """This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
+
+        Simple convolutional head, using group norm.
+        Upsampling is done using a FPN approach
+    """
+
+    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
+        super().__init__()
+
+        inter_dims = [input_dim,
+                      ] + [context_dim // (2**i) for i in range(1, 5)]
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.KaimingUniform())
+        bias_attr = paddle.framework.ParamAttr(
+            initializer=paddle.nn.initializer.Constant())
+
+        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
+                                       weight_attr, bias_attr)
+        self.conv_inter = nn.LayerList()
+        for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]):
+            self.conv_inter.append(
+                self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,
+                                  bias_attr))
+
+        self.conv_out = nn.Conv2D(
+            inter_dims[-1],
+            1,
+            3,
+            padding=1,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr)
+
+        self.adapter = nn.LayerList()
+        for i in range(len(fpn_dims)):
+            self.adapter.append(
+                nn.Conv2D(
+                    fpn_dims[i],
+                    inter_dims[i + 1],
+                    1,
+                    weight_attr=weight_attr,
+                    bias_attr=bias_attr))
+
+    def _make_layers(self,
+                     in_dims,
+                     out_dims,
+                     kernel_size,
+                     num_groups,
+                     weight_attr=None,
+                     bias_attr=None):
+        return nn.Sequential(
+            nn.Conv2D(
+                in_dims,
+                out_dims,
+                kernel_size,
+                padding=kernel_size // 2,
+                weight_attr=weight_attr,
+                bias_attr=bias_attr),
+            nn.GroupNorm(num_groups, out_dims),
+            nn.ReLU())
+
+    def forward(self, x, bbox_attention_map, fpns):
+        x = paddle.concat([
+            x.tile([bbox_attention_map.shape[1], 1, 1, 1]),
+            bbox_attention_map.flatten(0, 1)
+        ], 1)
+        x = self.conv0(x)
+        for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1],
+                                                    self.adapter, fpns):
+            feat = adapter_layer(feat).tile(
+                [bbox_attention_map.shape[1], 1, 1, 1])
+            x = inter_layer(x)
+            x = feat + F.interpolate(x, size=feat.shape[-2:])
+
+        x = self.conv_inter[-1](x)
+        x = self.conv_out(x)
+        return x
+
+
+@register
+class DETRHead(nn.Layer):
+    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_mlp_layers=3,
+                 loss='DETRLoss',
+                 fpn_dims=[1024, 512, 256],
+                 with_mask_head=False,
+                 use_focal_loss=False):
+        super(DETRHead, self).__init__()
+        # add background class
+        self.num_classes = num_classes if use_focal_loss else num_classes + 1
+        self.hidden_dim = hidden_dim
+        self.loss = loss
+        self.with_mask_head = with_mask_head
+        self.use_focal_loss = use_focal_loss
+
+        self.score_head = nn.Linear(hidden_dim, self.num_classes)
+        self.bbox_head = MLP(hidden_dim,
+                             hidden_dim,
+                             output_dim=4,
+                             num_layers=num_mlp_layers)
+        if self.with_mask_head:
+            self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim,
+                                                        nhead)
+            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
+                                             hidden_dim)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.score_head)
+
+    @classmethod
+    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
+
+        return {
+            'hidden_dim': hidden_dim,
+            'nhead': nhead,
+            'fpn_dims': [i.channels for i in input_shape[::-1]][1:]
+        }
+
+    @staticmethod
+    def get_gt_mask_from_polygons(gt_poly, pad_mask):
+        out_gt_mask = []
+        for polygons, padding in zip(gt_poly, pad_mask):
+            height, width = int(padding[:, 0].sum()), int(padding[0, :].sum())
+            masks = []
+            for obj_poly in polygons:
+                rles = mask_util.frPyObjects(obj_poly, height, width)
+                rle = mask_util.merge(rles)
+                masks.append(
+                    paddle.to_tensor(mask_util.decode(rle)).astype('float32'))
+            masks = paddle.stack(masks)
+            masks_pad = paddle.zeros(
+                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
+            masks_pad[:, :height, :width] = masks
+            out_gt_mask.append(masks_pad)
+        return out_gt_mask
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        r"""
+        Args:
+            out_transformer (Tuple): (feats: [num_levels, batch_size,
+                                                num_queries, hidden_dim],
+                            memory: [batch_size, hidden_dim, h, w],
+                            src_proj: [batch_size, h*w, hidden_dim],
+                            src_mask: [batch_size, 1, 1, h, w])
+            body_feats (List(Tensor)): list[[B, C, H, W]]
+            inputs (dict): dict(inputs)
+        """
+        feats, memory, src_proj, src_mask = out_transformer
+        outputs_logit = self.score_head(feats)
+        outputs_bbox = F.sigmoid(self.bbox_head(feats))
+        outputs_seg = None
+        if self.with_mask_head:
+            bbox_attention_map = self.bbox_attention(feats[-1], memory,
+                                                     src_mask)
+            fpn_feats = [a for a in body_feats[::-1]][1:]
+            outputs_seg = self.mask_head(src_proj, bbox_attention_map,
+                                         fpn_feats)
+            outputs_seg = outputs_seg.reshape([
+                feats.shape[1], feats.shape[2], outputs_seg.shape[-2],
+                outputs_seg.shape[-1]
+            ])
+
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+            gt_mask = self.get_gt_mask_from_polygons(
+                inputs['gt_poly'],
+                inputs['pad_mask']) if 'gt_poly' in inputs else None
+            return self.loss(
+                outputs_bbox,
+                outputs_logit,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                masks=outputs_seg,
+                gt_mask=gt_mask)
+        else:
+            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)
+
+
+@register
+class DeformableDETRHead(nn.Layer):
+    __shared__ = ['num_classes', 'hidden_dim']
+    __inject__ = ['loss']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=512,
+                 nhead=8,
+                 num_mlp_layers=3,
+                 loss='DETRLoss'):
+        super(DeformableDETRHead, self).__init__()
+        self.num_classes = num_classes
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.loss = loss
+
+        self.score_head = nn.Linear(hidden_dim, self.num_classes)
+        self.bbox_head = MLP(hidden_dim,
+                             hidden_dim,
+                             output_dim=4,
+                             num_layers=num_mlp_layers)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.score_head)
+        constant_(self.score_head.bias, -4.595)
+        constant_(self.bbox_head.layers[-1].weight)
+
+        with paddle.no_grad():
+            bias = paddle.zeros_like(self.bbox_head.layers[-1].bias)
+            bias[2:] = -2.0
+            self.bbox_head.layers[-1].bias.set_value(bias)
+
+    @classmethod
+    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
+        return {'hidden_dim': hidden_dim, 'nhead': nhead}
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        r"""
+        Args:
+            out_transformer (Tuple): (feats: [num_levels, batch_size,
+                                                num_queries, hidden_dim],
+                            memory: [batch_size,
+                                \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim],
+                            reference_points: [batch_size, num_queries, 2])
+            body_feats (List(Tensor)): list[[B, C, H, W]]
+            inputs (dict): dict(inputs)
+        """
+        feats, memory, reference_points = out_transformer
+        reference_points = inverse_sigmoid(reference_points.unsqueeze(0))
+        outputs_bbox = self.bbox_head(feats)
+
+        # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points",
+        # but the gradient is wrong in paddle.
+        outputs_bbox = paddle.concat(
+            [
+                outputs_bbox[:, :, :, :2] + reference_points,
+                outputs_bbox[:, :, :, 2:]
+            ],
+            axis=-1)
+
+        outputs_bbox = F.sigmoid(outputs_bbox)
+        outputs_logit = self.score_head(feats)
+
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+
+            return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],
+                             inputs['gt_class'])
+        else:
+            return (outputs_bbox[-1], outputs_logit[-1], None)
+
+
+@register
+class DINOHead(nn.Layer):
+    __inject__ = ['loss']
+
+    def __init__(self, loss='DINOLoss'):
+        super(DINOHead, self).__init__()
+        self.loss = loss
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
+         dn_meta) = out_transformer
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+
+            if dn_meta is not None:
+                if isinstance(dn_meta, list):
+                    dual_groups = len(dn_meta) - 1
+                    dec_out_bboxes = paddle.split(
+                        dec_out_bboxes, dual_groups + 1, axis=2)
+                    dec_out_logits = paddle.split(
+                        dec_out_logits, dual_groups + 1, axis=2)
+                    enc_topk_bboxes = paddle.split(
+                        enc_topk_bboxes, dual_groups + 1, axis=1)
+                    enc_topk_logits = paddle.split(
+                        enc_topk_logits, dual_groups + 1, axis=1)
+
+                    dec_out_bboxes_list = []
+                    dec_out_logits_list = []
+                    dn_out_bboxes_list = []
+                    dn_out_logits_list = []
+                    loss = {}
+                    for g_id in range(dual_groups + 1):
+                        if dn_meta[g_id] is not None:
+                            dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(
+                                dec_out_bboxes[g_id],
+                                dn_meta[g_id]['dn_num_split'],
+                                axis=2)
+                            dn_out_logits_gid, dec_out_logits_gid = paddle.split(
+                                dec_out_logits[g_id],
+                                dn_meta[g_id]['dn_num_split'],
+                                axis=2)
+                        else:
+                            dn_out_bboxes_gid, dn_out_logits_gid = None, None
+                            dec_out_bboxes_gid = dec_out_bboxes[g_id]
+                            dec_out_logits_gid = dec_out_logits[g_id]
+                        out_bboxes_gid = paddle.concat([
+                            enc_topk_bboxes[g_id].unsqueeze(0),
+                            dec_out_bboxes_gid
+                        ])
+                        out_logits_gid = paddle.concat([
+                            enc_topk_logits[g_id].unsqueeze(0),
+                            dec_out_logits_gid
+                        ])
+                        loss_gid = self.loss(
+                            out_bboxes_gid,
+                            out_logits_gid,
+                            inputs['gt_bbox'],
+                            inputs['gt_class'],
+                            dn_out_bboxes=dn_out_bboxes_gid,
+                            dn_out_logits=dn_out_logits_gid,
+                            dn_meta=dn_meta[g_id])
+                        # sum loss
+                        for key, value in loss_gid.items():
+                            loss.update({
+                                key: loss.get(key, paddle.zeros([1])) + value
+                            })
+
+                    # average across (dual_groups + 1)
+                    for key, value in loss.items():
+                        loss.update({key: value / (dual_groups + 1)})
+                    return loss
+                else:
+                    dn_out_bboxes, dec_out_bboxes = paddle.split(
+                        dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
+                    dn_out_logits, dec_out_logits = paddle.split(
+                        dec_out_logits, dn_meta['dn_num_split'], axis=2)
+            else:
+                dn_out_bboxes, dn_out_logits = None, None
+
+            out_bboxes = paddle.concat(
+                [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])
+            out_logits = paddle.concat(
+                [enc_topk_logits.unsqueeze(0), dec_out_logits])
+
+            return self.loss(
+                out_bboxes,
+                out_logits,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                dn_out_bboxes=dn_out_bboxes,
+                dn_out_logits=dn_out_logits,
+                dn_meta=dn_meta)
+        else:
+            return (dec_out_bboxes[-1], dec_out_logits[-1], None)
+
+
+@register
+class MaskDINOHead(nn.Layer):
+    __inject__ = ['loss']
+
+    def __init__(self, loss='DINOLoss'):
+        super(MaskDINOHead, self).__init__()
+        self.loss = loss
+
+    def forward(self, out_transformer, body_feats, inputs=None):
+        (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out,
+         dn_meta) = out_transformer
+        if self.training:
+            assert inputs is not None
+            assert 'gt_bbox' in inputs and 'gt_class' in inputs
+            assert 'gt_segm' in inputs
+
+            if dn_meta is not None:
+                dn_out_logits, dec_out_logits = paddle.split(
+                    dec_out_logits, dn_meta['dn_num_split'], axis=2)
+                dn_out_bboxes, dec_out_bboxes = paddle.split(
+                    dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
+                dn_out_masks, dec_out_masks = paddle.split(
+                    dec_out_masks, dn_meta['dn_num_split'], axis=2)
+                if init_out is not None:
+                    init_out_logits, init_out_bboxes, init_out_masks = init_out
+                    init_out_logits_dn, init_out_logits = paddle.split(
+                        init_out_logits, dn_meta['dn_num_split'], axis=1)
+                    init_out_bboxes_dn, init_out_bboxes = paddle.split(
+                        init_out_bboxes, dn_meta['dn_num_split'], axis=1)
+                    init_out_masks_dn, init_out_masks = paddle.split(
+                        init_out_masks, dn_meta['dn_num_split'], axis=1)
+
+                    dec_out_logits = paddle.concat(
+                        [init_out_logits.unsqueeze(0), dec_out_logits])
+                    dec_out_bboxes = paddle.concat(
+                        [init_out_bboxes.unsqueeze(0), dec_out_bboxes])
+                    dec_out_masks = paddle.concat(
+                        [init_out_masks.unsqueeze(0), dec_out_masks])
+
+                    dn_out_logits = paddle.concat(
+                        [init_out_logits_dn.unsqueeze(0), dn_out_logits])
+                    dn_out_bboxes = paddle.concat(
+                        [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes])
+                    dn_out_masks = paddle.concat(
+                        [init_out_masks_dn.unsqueeze(0), dn_out_masks])
+            else:
+                dn_out_bboxes, dn_out_logits = None, None
+                dn_out_masks = None
+
+            enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out
+            out_logits = paddle.concat(
+                [enc_out_logits.unsqueeze(0), dec_out_logits])
+            out_bboxes = paddle.concat(
+                [enc_out_bboxes.unsqueeze(0), dec_out_bboxes])
+            out_masks = paddle.concat(
+                [enc_out_masks.unsqueeze(0), dec_out_masks])
+
+            return self.loss(
+                out_bboxes,
+                out_logits,
+                inputs['gt_bbox'],
+                inputs['gt_class'],
+                masks=out_masks,
+                gt_mask=inputs['gt_segm'],
+                dn_out_logits=dn_out_logits,
+                dn_out_bboxes=dn_out_bboxes,
+                dn_out_masks=dn_out_masks,
+                dn_meta=dn_meta)
+        else:
+            return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1])
diff --git a/rtdetr_paddle/ppdet/modeling/initializer.py b/rtdetr_paddle/ppdet/modeling/initializer.py
new file mode 100644
index 0000000..308c51b
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/initializer.py
@@ -0,0 +1,325 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+The copyright of pytorch/pytorch is under a BSD-style license, as found in the LICENSE file.
+"""
+
+import math
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+
+__all__ = [
+    'uniform_',
+    'normal_',
+    'constant_',
+    'ones_',
+    'zeros_',
+    'xavier_uniform_',
+    'xavier_normal_',
+    'kaiming_uniform_',
+    'kaiming_normal_',
+    'linear_init_',
+    'conv_init_',
+    'reset_initialized_parameter',
+]
+
+
+def _no_grad_uniform_(tensor, a, b):
+    with paddle.no_grad():
+        tensor.set_value(
+            paddle.uniform(
+                shape=tensor.shape, dtype=tensor.dtype, min=a, max=b))
+    return tensor
+
+
+def _no_grad_normal_(tensor, mean=0., std=1.):
+    with paddle.no_grad():
+        tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape))
+    return tensor
+
+
+def _no_grad_fill_(tensor, value=0.):
+    with paddle.no_grad():
+        tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype))
+    return tensor
+
+
+def uniform_(tensor, a, b):
+    """
+    Modify tensor in-place using uniform_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        a (float|int): min value.
+        b (float|int): max value.
+    Return:
+        tensor
+    """
+    return _no_grad_uniform_(tensor, a, b)
+
+
+def normal_(tensor, mean=0., std=1.):
+    """
+    Modify tensor in-place using normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mean (float|int): mean value.
+        std (float|int): std value.
+    Return:
+        tensor
+    """
+    return _no_grad_normal_(tensor, mean, std)
+
+
+def constant_(tensor, value=0.):
+    """
+    Modify tensor in-place using constant_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        value (float|int): value to fill tensor.
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, value)
+
+
+def ones_(tensor):
+    """
+    Modify tensor in-place using ones_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, 1)
+
+
+def zeros_(tensor):
+    """
+    Modify tensor in-place using zeros_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, 0)
+
+
+def vector_(tensor, vector):
+    with paddle.no_grad():
+        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
+    return tensor
+
+
+def _calculate_fan_in_and_fan_out(tensor, reverse=False):
+    """
+    Calculate (fan_in, fan_out) for tensor
+
+    Args:
+        tensor (Tensor): paddle.Tensor
+        reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True
+
+    Return:
+        Tuple[fan_in, fan_out]
+    """
+    if tensor.ndim < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+
+    if reverse:
+        num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
+    else:
+        num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
+
+    receptive_field_size = 1
+    if tensor.ndim > 2:
+        receptive_field_size = np.prod(tensor.shape[2:])
+
+    fan_in = num_input_fmaps * receptive_field_size
+    fan_out = num_output_fmaps * receptive_field_size
+
+    return fan_in, fan_out
+
+
+def xavier_uniform_(tensor, gain=1., reverse=False):
+    """
+    Modify tensor in-place using xavier_uniform_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling hyperparameter, 1. by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def xavier_normal_(tensor, gain=1., reverse=False):
+    """
+    Modify tensor in-place using xavier_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling hyperparameter, 1. by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    return _no_grad_normal_(tensor, 0, std)
+
+
+# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
+def _calculate_correct_fan(tensor, mode, reverse=False):
+    mode = mode.lower()
+    valid_modes = ['fan_in', 'fan_out']
+    if mode not in valid_modes:
+        raise ValueError("Mode {} not supported, please use one of {}".format(
+            mode, valid_modes))
+
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
+
+    return fan_in if mode == 'fan_in' else fan_out
+
+
+def _calculate_gain(nonlinearity, param=None):
+    linear_fns = [
+        'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
+        'conv_transpose2d', 'conv_transpose3d'
+    ]
+    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
+        return 1
+    elif nonlinearity == 'tanh':
+        return 5.0 / 3
+    elif nonlinearity == 'relu':
+        return math.sqrt(2.0)
+    elif nonlinearity == 'leaky_relu':
+        if param is None:
+            negative_slope = 0.01
+        elif not isinstance(param, bool) and isinstance(
+                param, int) or isinstance(param, float):
+            # True/False are instances of int, hence check above
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(
+                param))
+        return math.sqrt(2.0 / (1 + negative_slope**2))
+    elif nonlinearity == 'selu':
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+def kaiming_uniform_(tensor,
+                     a=0,
+                     mode='fan_in',
+                     nonlinearity='leaky_relu',
+                     reverse=False):
+    """
+    Modify tensor in-place using kaiming_uniform method
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    gain = _calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def kaiming_normal_(tensor,
+                    a=0,
+                    mode='fan_in',
+                    nonlinearity='leaky_relu',
+                    reverse=False):
+    """
+    Modify tensor in-place using kaiming_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    gain = _calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    return _no_grad_normal_(tensor, 0, std)
+
+
+def linear_init_(module):
+    bound = 1 / math.sqrt(module.weight.shape[0])
+    uniform_(module.weight, -bound, bound)
+    if hasattr(module, "bias") and module.bias is not None:
+        uniform_(module.bias, -bound, bound)
+
+
+def conv_init_(module):
+    bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
+    uniform_(module.weight, -bound, bound)
+    if module.bias is not None:
+        uniform_(module.bias, -bound, bound)
+
+
+def bias_init_with_prob(prior_prob=0.01):
+    """initialize conv/fc bias value according to a given probability value."""
+    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+    return bias_init
+
+
+@paddle.no_grad()
+def reset_initialized_parameter(model, include_self=True):
+    """
+    Reset initialized parameters using the following methods for [conv, linear, embedding, bn]
+
+    Args:
+        model (paddle.Layer): paddle Layer
+        include_self (bool: True): include_self for Layer.named_sublayers method. Indicates whether to include the model itself
+    Return:
+        None
+    """
+    for _, m in model.named_sublayers(include_self=include_self):
+        if isinstance(m, nn.Conv2D):
+            k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
+                                    m._kernel_size[1])
+            k = math.sqrt(k)
+            _no_grad_uniform_(m.weight, -k, k)
+            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
+                _no_grad_uniform_(m.bias, -k, k)
+
+        elif isinstance(m, nn.Linear):
+            k = math.sqrt(1. / m.weight.shape[0])
+            _no_grad_uniform_(m.weight, -k, k)
+            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
+                _no_grad_uniform_(m.bias, -k, k)
+
+        elif isinstance(m, nn.Embedding):
+            _no_grad_normal_(m.weight, mean=0., std=1.)
+
+        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
+            _no_grad_fill_(m.weight, 1.)
+            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
+                _no_grad_fill_(m.bias, 0)
diff --git a/rtdetr_paddle/ppdet/modeling/keypoint_utils.py b/rtdetr_paddle/ppdet/modeling/keypoint_utils.py
new file mode 100644
index 0000000..377f1d7
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/keypoint_utils.py
@@ -0,0 +1,403 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+this code is based on https://github.com/open-mmlab/mmpose
+"""
+
+import cv2
+import numpy as np
+import paddle.nn.functional as F
+
+
+def get_affine_mat_kernel(h, w, s, inv=False):
+    """Build the affine matrix that resizes an ``h`` x ``w`` image so that
+    its shorter side becomes ``s`` and its longer side is rounded up to a
+    multiple of 64.
+
+    Args:
+        h (int|float): source height.
+        w (int|float): source width.
+        s (int): target length of the shorter side.
+        inv (bool): forwarded to ``get_affine_transform``.
+
+    Returns:
+        tuple: ``(trans, size_resized)``; ``trans`` is the result of
+        ``get_affine_transform`` and ``size_resized`` is ``(width, height)``
+        of the resized image.
+    """
+    short_out = s
+    if w < h:
+        # Width is the short side; height is padded up to a multiple of 64.
+        long_out = int(np.ceil((s / w * h) / 64.) * 64)
+        size_resized = (short_out, long_out)
+        src_extent = np.array([w, long_out / short_out * w])
+    else:
+        # Height is the short side; width is padded up to a multiple of 64.
+        long_out = int(np.ceil((s / h * w) / 64.) * 64)
+        size_resized = (long_out, short_out)
+        src_extent = np.array([long_out / short_out * h, h])
+
+    center = np.array([np.round(w / 2.), np.round(h / 2.)])
+    trans = get_affine_transform(center, src_extent, 0, size_resized, inv=inv)
+
+    return trans, size_resized
+
+
+def get_affine_transform(center,
+                         input_size,
+                         rot,
+                         output_size,
+                         shift=(0., 0.),
+                         inv=False):
+    """Get the affine transform matrix, given the center/scale/rot/output_size.
+
+    Three point pairs are built (the center, a point half a width "above"
+    it, and a third point perpendicular to those two) and handed to
+    ``cv2.getAffineTransform``.
+
+    Args:
+        center (np.ndarray[2, ]): Center of the bounding box (x, y).
+        input_size (np.ndarray[2, ]): Size of input feature (width, height).
+            Anything that is neither an ``np.ndarray`` nor a ``list`` is
+            treated as a scalar and used for both width and height.
+        rot (float): Rotation angle (degree).
+        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
+        shift (0-100%): Shift translation ratio wrt the width/height.
+            Default (0., 0.).
+        inv (bool): Option to inverse the affine transform direction.
+            (inv=False: src->dst or inv=True: dst->src)
+
+    Returns:
+        np.ndarray: The transform matrix.
+    """
+    assert len(center) == 2
+    assert len(output_size) == 2
+    assert len(shift) == 2
+
+    if isinstance(input_size, (np.ndarray, list)):
+        src_size = input_size
+    else:
+        src_size = np.array([input_size, input_size], dtype=np.float32)
+
+    out_w, out_h = output_size[0], output_size[1]
+    offset = src_size * np.array(shift)
+
+    # Half-width vectors pointing along -y; the source one is rotated.
+    src_dir = rotate_point([0., src_size[0] * -0.5], np.pi * rot / 180)
+    dst_dir = np.array([0., out_w * -0.5])
+
+    src = np.zeros((3, 2), dtype=np.float32)
+    src[0, :] = center + offset
+    src[1, :] = center + src_dir + offset
+    src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+    dst_center = [out_w * 0.5, out_h * 0.5]
+    dst = np.zeros((3, 2), dtype=np.float32)
+    dst[0, :] = dst_center
+    dst[1, :] = np.array(dst_center) + dst_dir
+    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+    pts_from, pts_to = (dst, src) if inv else (src, dst)
+    return cv2.getAffineTransform(np.float32(pts_from), np.float32(pts_to))
+
+
+def get_warp_matrix(theta, size_input, size_dst, size_target):
+    """This code is based on
+        https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py
+
+        Calculate the transformation matrix under the constraint of unbiased.
+    Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
+    Data Processing for Human Pose Estimation (CVPR 2020).
+
+    Args:
+        theta (float): Rotation angle in degrees.
+        size_input (np.ndarray): Size of input image [w, h].
+        size_dst (np.ndarray): Size of output image [w, h].
+        size_target (np.ndarray): Size of ROI in input plane [w, h].
+
+    Returns:
+        matrix (np.ndarray): A float32 matrix of shape (2, 3).
+    """
+    angle = np.deg2rad(theta)
+    cos_a, sin_a = np.cos(angle), np.sin(angle)
+    in_w, in_h = size_input[0], size_input[1]
+    scale_x = size_dst[0] / size_target[0]
+    scale_y = size_dst[1] / size_target[1]
+
+    # Translation column of the matrix.
+    shift_x = scale_x * (
+        -0.5 * in_w * cos_a + 0.5 * in_h * sin_a + 0.5 * size_target[0])
+    shift_y = scale_y * (
+        -0.5 * in_w * sin_a - 0.5 * in_h * cos_a + 0.5 * size_target[1])
+
+    return np.array(
+        [[cos_a * scale_x, -sin_a * scale_x, shift_x],
+         [sin_a * scale_y, cos_a * scale_y, shift_y]],
+        dtype=np.float32)
+
+
+def _get_3rd_point(a, b):
+    """Return the third point needed to define an affine transform.
+
+    An affine matrix needs three point pairs. Given 2D points a & b, the
+    third point is ``b`` plus the vector ``a - b`` rotated by 90 degrees
+    anticlockwise, i.e. ``b + (-(a - b).y, (a - b).x)``.
+
+    Args:
+        a (np.ndarray): point(x,y)
+        b (np.ndarray): point(x,y)
+
+    Returns:
+        np.ndarray: The 3rd point.
+    """
+    assert len(
+        a) == 2, 'input of _get_3rd_point should be point with length of 2'
+    assert len(
+        b) == 2, 'input of _get_3rd_point should be point with length of 2'
+    diff = a - b
+    perpendicular = np.array([-diff[1], diff[0]], dtype=np.float32)
+    return b + perpendicular
+
+
+def rotate_point(pt, angle_rad):
+    """Rotate a 2D point about the origin.
+
+    Args:
+        pt (list[float]): 2 dimensional point to be rotated
+        angle_rad (float): rotation angle by radian
+
+    Returns:
+        list[float]: Rotated point ``[x', y']`` with
+        ``x' = x*cos - y*sin`` and ``y' = x*sin + y*cos``.
+    """
+    assert len(pt) == 2
+    x, y = pt[0], pt[1]
+    sin_a = np.sin(angle_rad)
+    cos_a = np.cos(angle_rad)
+    return [x * cos_a - y * sin_a, x * sin_a + y * cos_a]
+
+
+def transpred(kpts, h, w, s):
+    """Map keypoint (x, y) coordinates through the inverse affine matrix of
+    ``get_affine_mat_kernel(h, w, s)``.
+
+    Only the first two entries of the last axis of ``kpts`` are used; they
+    are copied, so ``kpts`` itself is not modified.
+    """
+    inverse_mat, _ = get_affine_mat_kernel(h, w, s, inv=True)
+    xy = kpts[..., :2].copy()
+    return warp_affine_joints(xy, inverse_mat)
+
+
+def warp_affine_joints(joints, mat):
+    """Apply affine transformation defined by the transform matrix on the
+    joints.
+
+    Args:
+        joints (np.ndarray[..., 2]): Origin coordinate of joints.
+        mat (np.ndarray[2, 3]): The affine matrix; its transpose is
+            right-multiplied onto the homogeneous points.
+
+    Returns:
+        matrix (np.ndarray[..., 2]): Result coordinate of joints.
+    """
+    points = np.array(joints)
+    original_shape = points.shape
+    flat = points.reshape(-1, 2)
+    # Column of ones derived from the x column (x * 0 + 1).
+    ones_col = flat[:, 0:1] * 0 + 1
+    homogeneous = np.concatenate((flat, ones_col), axis=1)
+    return np.dot(homogeneous, mat.T).reshape(original_shape)
+
+
+def affine_transform(pt, t):
+    """Apply the affine matrix ``t`` to the single 2D point ``pt``.
+
+    The point is extended to ``(x, y, 1)``, multiplied by ``t`` and the
+    first two components of the product are returned.
+    """
+    homogeneous = np.array([pt[0], pt[1], 1.])
+    mapped = np.dot(t, homogeneous)
+    return mapped[:2]
+
+
+def transform_preds(coords, center, scale, output_size):
+    """Map predicted coordinates back through the inverse affine transform.
+
+    The matrix comes from ``get_affine_transform(center, scale * 200, 0,
+    output_size, inv=1)``. Only columns 0 and 1 of ``coords`` are
+    transformed; any further columns of the result stay zero.
+
+    Args:
+        coords (np.ndarray[N, >=2]): points in output (heatmap) space.
+        center: box center passed on to ``get_affine_transform``.
+        scale: box scale; multiplied by 200 before use (presumably the scale
+            is expressed in units of 200 pixels -- confirm with callers).
+        output_size: size of the output (heatmap) space.
+
+    Returns:
+        np.ndarray: array with the same shape as ``coords``.
+    """
+    inverse_mat = get_affine_transform(
+        center, scale * 200, 0, output_size, inv=1)
+    mapped = np.zeros(coords.shape)
+    for row in range(coords.shape[0]):
+        mapped[row, 0:2] = affine_transform(coords[row, 0:2], inverse_mat)
+    return mapped
+
+
+def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
+    """Compute the object keypoint similarity (OKS) between one pose and
+    a set of candidate poses.
+
+    Args:
+        g (np.ndarray[3K]): reference pose flattened as (x, y, v) triples.
+        d (np.ndarray[N, 3K]): candidate poses in the same layout.
+        a_g (float): area associated with ``g``.
+        a_d (np.ndarray[N]): area associated with each row of ``d``.
+        sigmas (np.ndarray[K]): per-keypoint constants. Any value that is not
+            an ``np.ndarray`` selects the built-in 17-entry table.
+        in_vis_thre (float): when given, a keypoint only contributes if its
+            ``v`` value exceeds this threshold in BOTH ``g`` and the
+            candidate. Default: None (all keypoints contribute).
+
+    Returns:
+        np.ndarray[N]: OKS per candidate; 0.0 when no keypoint is left
+        after the visibility filter.
+    """
+    if not isinstance(sigmas, np.ndarray):
+        sigmas = np.array([
+            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
+            .87, .87, .89, .89
+        ]) / 10.0
+    variances = (sigmas * 2)**2
+    xg = g[0::3]
+    yg = g[1::3]
+    vg = g[2::3]
+    # The reference-side visibility mask does not depend on the candidate.
+    g_visible = vg > in_vis_thre if in_vis_thre is not None else None
+
+    ious = np.zeros((d.shape[0]))
+    for n_d in range(d.shape[0]):
+        xd = d[n_d, 0::3]
+        yd = d[n_d, 1::3]
+        vd = d[n_d, 2::3]
+        dx = xd - xg
+        dy = yd - yg
+        e = (dx**2 + dy**2) / variances / (
+            (a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
+        if g_visible is not None:
+            # Must be an element-wise `&`: Python's `and` between two
+            # sequences evaluates to just one of them, which would drop the
+            # reference-side mask.
+            e = e[g_visible & (vd > in_vis_thre)]
+        ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
+    return ious
+
+
+def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
+    """Greedy NMS on poses using OKS as the overlap measure.
+
+    Poses are visited by descending score; each visited pose is kept and
+    every remaining pose whose OKS with it is greater than ``thresh`` is
+    discarded.
+
+    Args:
+        kpts_db (list): The predicted keypoints within the image; each item
+            is indexed with the keys 'score', 'keypoints' and 'area'
+        thresh (float): The OKS threshold above which a pose is suppressed
+        sigmas (np.array): Forwarded to ``oks_iou``
+            Default: None
+        in_vis_thre (float): Forwarded to ``oks_iou``
+            Default: None
+
+    Return:
+        keep (list): indexes to keep
+    """
+
+    if len(kpts_db) == 0:
+        return []
+
+    scores = np.array([item['score'] for item in kpts_db])
+    kpts = np.array([item['keypoints'].flatten() for item in kpts_db])
+    areas = np.array([item['area'] for item in kpts_db])
+
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        best, rest = order[0], order[1:]
+        keep.append(best)
+
+        overlaps = oks_iou(kpts[best], kpts[rest], areas[best], areas[rest],
+                           sigmas, in_vis_thre)
+
+        # Survivors are the remaining poses that do not overlap too much.
+        order = rest[overlaps <= thresh]
+
+    return keep
+
+
+def rescore(overlap, scores, thresh, type='gaussian'):
+    """Decay ``scores`` according to ``overlap`` (soft-NMS rescoring).
+
+    Args:
+        overlap (np.ndarray[N]): overlap of each candidate with the kept one.
+        scores (np.ndarray[N]): candidate scores.
+        thresh (float): overlap threshold ('linear') or the divisor inside
+            the exponent (any other ``type``).
+        type (str): 'linear' multiplies, IN PLACE, the scores whose overlap
+            is >= ``thresh`` by ``1 - overlap``; anything else returns a new
+            array ``scores * exp(-overlap**2 / thresh)``.
+
+    Returns:
+        np.ndarray[N]: the rescored values.
+    """
+    assert overlap.shape[0] == scores.shape[0]
+    if type != 'linear':
+        return scores * np.exp(-overlap**2 / thresh)
+
+    hit = np.where(overlap >= thresh)[0]
+    scores[hit] = scores[hit] * (1 - overlap[hit])
+    return scores
+
+
+def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
+    """Soft NMS on poses using OKS as the overlap measure.
+
+    Poses are visited by descending score. After a pose is picked, the
+    scores of all remaining poses are passed through ``rescore`` (default
+    mode) with their OKS to the picked pose, and the remainder is re-sorted.
+    At most 20 poses are returned.
+
+    Args:
+        kpts_db (list): The predicted keypoints within the image; each item
+            is indexed with the keys 'score', 'keypoints' and 'area'
+        thresh (float): Forwarded to ``rescore``
+        sigmas (np.array): Forwarded to ``oks_iou``
+            Default: None
+        in_vis_thre (float): Forwarded to ``oks_iou``
+            Default: None
+
+    Return:
+        keep (np.ndarray): indexes to keep, in picking order (an empty list
+            when ``kpts_db`` is empty)
+    """
+
+    if len(kpts_db) == 0:
+        return []
+
+    scores = np.array([item['score'] for item in kpts_db])
+    kpts = np.array([item['keypoints'].flatten() for item in kpts_db])
+    areas = np.array([item['area'] for item in kpts_db])
+
+    order = scores.argsort()[::-1]
+    scores = scores[order]
+
+    max_dets = 20
+    keep = np.zeros(max_dets, dtype=np.intp)
+    keep_cnt = 0
+    while order.size > 0 and keep_cnt < max_dets:
+        top, remaining = order[0], order[1:]
+
+        overlaps = oks_iou(kpts[top], kpts[remaining], areas[top],
+                           areas[remaining], sigmas, in_vis_thre)
+
+        # Decay the remaining scores, then restore descending order.
+        decayed = rescore(overlaps, scores[1:], thresh)
+        ranking = decayed.argsort()[::-1]
+        order = remaining[ranking]
+        scores = decayed[ranking]
+
+        keep[keep_cnt] = top
+        keep_cnt += 1
+
+    return keep[:keep_cnt]
+
+
+def resize(input,
+           size=None,
+           scale_factor=None,
+           mode='nearest',
+           align_corners=None,
+           warning=True):
+    """Thin wrapper around ``F.interpolate`` that can warn about output
+    sizes that do not line up with the input when ``align_corners`` is set.
+
+    Args:
+        input: feature map; ``input.shape[2:]`` is read as (height, width).
+        size (sequence[int]): target (height, width), or None.
+        scale_factor: forwarded to ``F.interpolate``.
+        mode (str): forwarded to ``F.interpolate``.
+        align_corners (bool): forwarded to ``F.interpolate``.
+        warning (bool): when True, ``size`` is given and ``align_corners`` is
+            truthy, emit a warning if the call upsamples along height or
+            width and neither ``output - 1`` is a multiple of ``input - 1``.
+
+    Returns:
+        The result of ``F.interpolate``.
+    """
+    if warning and size is not None and align_corners:
+        # Imported here so the warning path does not depend on a
+        # module-level `import warnings`.
+        import warnings
+
+        input_h, input_w = tuple(int(x) for x in input.shape[2:])
+        output_h, output_w = tuple(int(x) for x in size)
+        # Compare each output side with the matching INPUT side.
+        if output_h > input_h or output_w > input_w:
+            if ((output_h > 1 and output_w > 1 and input_h > 1 and
+                 input_w > 1) and (output_h - 1) % (input_h - 1) and
+                (output_w - 1) % (input_w - 1)):
+                warnings.warn(
+                    f'When align_corners={align_corners}, '
+                    'the output would more aligned if '
+                    f'input size {(input_h, input_w)} is `x+1` and '
+                    f'out size {(output_h, output_w)} is `nx+1`')
+
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
+    """Flip the flipped heatmaps back to the original form.
+
+    Note:
+        - batch_size: N
+        - num_keypoints: K
+        - heatmap height: H
+        - heatmap width: W
+
+    Args:
+        output_flipped ([N, K, H, W]): The output heatmaps obtained from the
+            flipped images. The body calls ``.clone()`` on it, so a tensor
+            type is needed rather than a plain ``np.ndarray`` (presumably a
+            ``paddle.Tensor`` -- confirm against callers). For
+            ``CombinedTarget`` it is modified in place (see below).
+        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
+            (for example, left ear -- right ear).
+        target_type (str): GaussianHeatmap or CombinedTarget (compared
+            case-insensitively).
+
+    Returns:
+        Heatmaps flipped back to the original image, with the same shape as
+        ``output_flipped``.
+    """
+    assert len(output_flipped.shape) == 4, \
+        'output_flipped should be [batch_size, num_keypoints, height, width]'
+    shape_ori = output_flipped.shape
+    channels = 1
+    if target_type.lower() == 'CombinedTarget'.lower():
+        # CombinedTarget uses 3 channels per keypoint. Channel 1 of every
+        # triple is negated through slice assignment on the argument itself
+        # (presumably the x-offset channel, whose sign flips with the
+        # image -- confirm).
+        channels = 3
+        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
+    # Group the channel axis as [keypoint, channels-per-keypoint] so that the
+    # indices in flip_pairs address whole keypoints.
+    output_flipped = output_flipped.reshape((shape_ori[0], -1, channels,
+                                             shape_ori[2], shape_ori[3]))
+    # Write into a copy so that each swap below reads unswapped data.
+    output_flipped_back = output_flipped.clone()
+
+    # Swap left-right parts
+    for left, right in flip_pairs:
+        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
+        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
+    output_flipped_back = output_flipped_back.reshape(shape_ori)
+    # Flip horizontally
+    output_flipped_back = output_flipped_back[..., ::-1]
+    return output_flipped_back
diff --git a/rtdetr_paddle/ppdet/modeling/layers.py b/rtdetr_paddle/ppdet/modeling/layers.py
new file mode 100644
index 0000000..86c6d96
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/layers.py
@@ -0,0 +1,1346 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import six
+import numpy as np
+from numbers import Integral
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle import to_tensor
+import paddle.nn.functional as F
+from paddle.nn.initializer import Normal, Constant, XavierUniform
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.bbox_utils import delta2bbox
+from . import ops
+from .initializer import xavier_uniform_, constant_
+
+from paddle.vision.ops import DeformConv2D
+
+
+def _to_list(l):
+    """Return ``l`` as a list.
+
+    Lists and tuples are converted with ``list(l)`` (a new list); any other
+    value, including strings, is wrapped as ``[l]``.
+    """
+    return list(l) if isinstance(l, (list, tuple)) else [l]
+
+
+class AlignConv(nn.Layer):
+    """Deformable convolution whose sampling offsets are computed from
+    rotated anchors (see ``get_offset``) instead of being predicted.
+
+    Args:
+        in_channels (int): input channels of the deformable convolution.
+        out_channels (int): output channels of the deformable convolution.
+        kernel_size (int): square kernel size; the padding is
+            ``(kernel_size - 1) // 2``.
+        groups (int): convolution groups.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size=3, groups=1):
+        super(AlignConv, self).__init__()
+        self.kernel_size = kernel_size
+        self.align_conv = paddle.vision.ops.DeformConv2D(
+            in_channels,
+            out_channels,
+            kernel_size=self.kernel_size,
+            padding=(self.kernel_size - 1) // 2,
+            groups=groups,
+            weight_attr=ParamAttr(initializer=Normal(0, 0.01)),
+            bias_attr=None)
+
+    @paddle.no_grad()
+    def get_offset(self, anchors, featmap_size, stride):
+        """Compute the deformable-conv offsets that move each kernel tap
+        from its regular grid position onto the matching sample point of
+        the anchor at that location.
+
+        Args:
+            anchors: [B, L, 5] xc,yc,w,h,angle (angle in radians, it is fed
+                to cos/sin directly); L must equal feat_h * feat_w for the
+                final reshape
+            featmap_size: (feat_h, feat_w)
+            stride: divisor that brings anchor coordinates to feature-map
+                scale, e.g. 8
+        Returns:
+            offset: [B, kernel_size * kernel_size * 2, feat_h, feat_w], with
+                (dy, dx) interleaved per kernel tap
+        """
+        batch = anchors.shape[0]
+        dtype = anchors.dtype
+        feat_h, feat_w = featmap_size
+        pad = (self.kernel_size - 1) // 2
+        idx = paddle.arange(-pad, pad + 1, dtype=dtype)
+
+        # relative tap positions of the kernel, flattened to kernel_size**2
+        yy, xx = paddle.meshgrid(idx, idx)
+        xx = paddle.reshape(xx, [-1])
+        yy = paddle.reshape(yy, [-1])
+
+        # get sampling locations of default conv
+        xc = paddle.arange(0, feat_w, dtype=dtype)
+        yc = paddle.arange(0, feat_h, dtype=dtype)
+        yc, xc = paddle.meshgrid(yc, xc)
+
+        xc = paddle.reshape(xc, [-1, 1])
+        yc = paddle.reshape(yc, [-1, 1])
+        x_conv = xc + xx
+        y_conv = yc + yy
+
+        # get sampling locations of anchors
+        x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1)
+        x_ctr = x_ctr / stride
+        y_ctr = y_ctr / stride
+        w_s = w / stride
+        h_s = h / stride
+        cos, sin = paddle.cos(a), paddle.sin(a)
+        # spread the taps over the anchor box, then rotate by the angle
+        dw, dh = w_s / self.kernel_size, h_s / self.kernel_size
+        x, y = dw * xx, dh * yy
+        xr = cos * x - sin * y
+        yr = sin * x + cos * y
+        x_anchor, y_anchor = xr + x_ctr, yr + y_ctr
+        # get offset field
+        offset_x = x_anchor - x_conv
+        offset_y = y_anchor - y_conv
+        offset = paddle.stack([offset_y, offset_x], axis=-1)
+        offset = offset.reshape(
+            [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2])
+        offset = offset.transpose([0, 3, 1, 2])
+
+        return offset
+
+    def forward(self, x, refine_anchors, featmap_size, stride):
+        """Run the deformable convolution on ``x`` with anchor-derived
+        offsets, followed by ReLU.
+
+        No value is pulled out of ``x`` into a host-side array here: the
+        batch size needed for the offsets is read from ``refine_anchors``
+        inside ``get_offset``.
+        """
+        offset = self.get_offset(refine_anchors, featmap_size, stride)
+        if self.training:
+            x = F.relu(self.align_conv(x, offset.detach()))
+        else:
+            x = F.relu(self.align_conv(x, offset))
+        return x
+
+
+class DeformableConvV2(nn.Layer):
+    """Deformable convolution with mask, bundled with the conv that predicts
+    its offsets and mask.
+
+    A plain ``nn.Conv2D`` (``conv_offset``) predicts ``3 * kernel_size**2``
+    channels from the input. ``forward`` splits them into
+    ``2 * kernel_size**2`` offset channels and ``kernel_size**2`` mask
+    channels, applies a sigmoid to the mask and passes both to
+    ``DeformConv2D`` (``conv_dcn``).
+
+    Args:
+        in_channels (int): input channels of both convolutions.
+        out_channels (int): output channels of ``conv_dcn``.
+        kernel_size (int): square kernel size of both convolutions.
+        stride (int): stride of both convolutions.
+        padding (int): accepted but not read by this constructor; paddings
+            are derived from ``kernel_size`` (and ``dilation`` for
+            ``conv_dcn``).
+        dilation (int): dilation of ``conv_dcn`` only.
+        groups (int): groups of ``conv_dcn`` only.
+        weight_attr: weight attribute of ``conv_dcn``.
+        bias_attr: only its truthiness is used -- truthy gives ``conv_dcn`` a
+            zero-initialized bias, falsy disables the bias.
+        lr_scale (float): learning rate of the ``conv_offset`` bias.
+        regularizer: regularizer of the ``conv_offset`` bias.
+        skip_quant (bool): when True, sets ``conv_offset.skip_quant = True``.
+        dcn_bias_regularizer: regularizer of the ``conv_dcn`` bias.
+        dcn_bias_lr_scale (float): learning rate of the ``conv_dcn`` bias.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 weight_attr=None,
+                 bias_attr=None,
+                 lr_scale=1,
+                 regularizer=None,
+                 skip_quant=False,
+                 dcn_bias_regularizer=L2Decay(0.),
+                 dcn_bias_lr_scale=2.):
+        super(DeformableConvV2, self).__init__()
+        # Channel counts used by forward() to split the conv_offset output.
+        self.offset_channel = 2 * kernel_size**2
+        self.mask_channel = kernel_size**2
+
+        # Only attach learning_rate / regularizer when they differ from the
+        # (1, None) defaults.
+        if lr_scale == 1 and regularizer is None:
+            offset_bias_attr = ParamAttr(initializer=Constant(0.))
+        else:
+            offset_bias_attr = ParamAttr(
+                initializer=Constant(0.),
+                learning_rate=lr_scale,
+                regularizer=regularizer)
+        # Weights and bias both start at zero, so the initial offsets are 0
+        # and the initial mask is sigmoid(0) = 0.5 everywhere.
+        self.conv_offset = nn.Conv2D(
+            in_channels,
+            3 * kernel_size**2,
+            kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2,
+            weight_attr=ParamAttr(initializer=Constant(0.0)),
+            bias_attr=offset_bias_attr)
+        if skip_quant:
+            self.conv_offset.skip_quant = True
+
+        if bias_attr:
+            # in FCOS-DCN head, specifically need learning_rate and regularizer
+            dcn_bias_attr = ParamAttr(
+                initializer=Constant(value=0),
+                regularizer=dcn_bias_regularizer,
+                learning_rate=dcn_bias_lr_scale)
+        else:
+            # in ResNet backbone, do not need bias
+            dcn_bias_attr = False
+        self.conv_dcn = DeformConv2D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=(kernel_size - 1) // 2 * dilation,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=dcn_bias_attr)
+
+    def forward(self, x):
+        """Predict offsets and mask from ``x`` and apply the deformable
+        convolution to ``x`` with them."""
+        offset_mask = self.conv_offset(x)
+        # First 2*k*k channels are offsets, the remaining k*k are the mask.
+        offset, mask = paddle.split(
+            offset_mask,
+            num_or_sections=[self.offset_channel, self.mask_channel],
+            axis=1)
+        mask = F.sigmoid(mask)
+        y = self.conv_dcn(x, offset, mask=mask)
+        return y
+
+
+class ConvNormLayer(nn.Layer):
+    """Convolution (plain or deformable) followed by an optional norm layer.
+
+    Args:
+        ch_in (int): input channels.
+        ch_out (int): output channels.
+        filter_size (int): square kernel size; padding is
+            ``(filter_size - 1) // 2``.
+        stride (int): convolution stride.
+        groups (int): convolution groups.
+        norm_type (str|None): one of 'bn', 'sync_bn', 'gn' or None. Both 'bn'
+            and 'sync_bn' build ``nn.BatchNorm2D`` here; None disables the
+            norm layer.
+        norm_decay (float|None): L2 decay of the norm weight and bias; None
+            means no regularizer.
+        norm_groups (int): number of groups for 'gn'.
+        use_dcn (bool): use ``DeformableConvV2`` instead of ``nn.Conv2D``.
+        bias_on (bool): give the plain convolution a zero-initialized bias.
+            Not consulted when ``use_dcn`` is True (that branch always passes
+            ``bias_attr=True``).
+        lr_scale (float): learning rate of the plain convolution's bias.
+        freeze_norm (bool): when True the norm weight and bias get learning
+            rate 0.
+        initializer: weight initializer of the convolution.
+        skip_quant (bool): sets ``skip_quant = True`` on the plain conv, or
+            is forwarded to ``DeformableConvV2``.
+        dcn_lr_scale (float): forwarded to ``DeformableConvV2`` as both
+            ``lr_scale`` and ``dcn_bias_lr_scale``.
+        dcn_regularizer: forwarded to ``DeformableConvV2`` as both
+            ``regularizer`` and ``dcn_bias_regularizer``.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride,
+                 groups=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 norm_groups=32,
+                 use_dcn=False,
+                 bias_on=False,
+                 lr_scale=1.,
+                 freeze_norm=False,
+                 initializer=Normal(
+                     mean=0., std=0.01),
+                 skip_quant=False,
+                 dcn_lr_scale=2.,
+                 dcn_regularizer=L2Decay(0.)):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn', 'gn', None]
+
+        # Bias attribute for the plain-conv branch only.
+        if bias_on:
+            bias_attr = ParamAttr(
+                initializer=Constant(value=0.), learning_rate=lr_scale)
+        else:
+            bias_attr = False
+
+        if not use_dcn:
+            self.conv = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                groups=groups,
+                weight_attr=ParamAttr(
+                    initializer=initializer, learning_rate=1.),
+                bias_attr=bias_attr)
+            if skip_quant:
+                self.conv.skip_quant = True
+        else:
+            # in FCOS-DCN head, specifically need learning_rate and regularizer
+            self.conv = DeformableConvV2(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                groups=groups,
+                weight_attr=ParamAttr(
+                    initializer=initializer, learning_rate=1.),
+                bias_attr=True,
+                lr_scale=dcn_lr_scale,
+                regularizer=dcn_regularizer,
+                dcn_bias_regularizer=dcn_regularizer,
+                dcn_bias_lr_scale=dcn_lr_scale,
+                skip_quant=skip_quant)
+
+        # A learning rate of 0 is how freeze_norm is expressed here.
+        norm_lr = 0. if freeze_norm else 1.
+        # From here on `bias_attr` is rebound to the norm layer's bias.
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay) if norm_decay is not None else None)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay) if norm_decay is not None else None)
+        if norm_type in ['bn', 'sync_bn']:
+            self.norm = nn.BatchNorm2D(
+                ch_out, weight_attr=param_attr, bias_attr=bias_attr)
+        elif norm_type == 'gn':
+            self.norm = nn.GroupNorm(
+                num_groups=norm_groups,
+                num_channels=ch_out,
+                weight_attr=param_attr,
+                bias_attr=bias_attr)
+        else:
+            self.norm = None
+
+    def forward(self, inputs):
+        """Apply the convolution, then the norm layer when one is configured."""
+        out = self.conv(inputs)
+        if self.norm is not None:
+            out = self.norm(out)
+        return out
+
+
+class LiteConv(nn.Layer):
+    """Stack of four ``ConvNormLayer`` blocks with ReLU6 activations.
+
+    Layout (kernel size, groups):
+        conv1 5x5, groups=in_channels   -> relu6_1
+        conv2 1x1, in -> out channels   -> relu6_2 (only if ``with_act``)
+        conv3 1x1                       -> relu6_3
+        conv4 5x5, groups=out_channels  -> relu6_4 (only if ``with_act``)
+
+    Args:
+        in_channels (int): input channels.
+        out_channels (int): output channels.
+        stride (int): stride passed to each of the four convolutions.
+        with_act (bool): whether relu6_2 and relu6_4 are added.
+        norm_type (str): norm type passed to every ``ConvNormLayer``.
+        name (str): accepted but not used by this constructor.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride=1,
+                 with_act=True,
+                 norm_type='sync_bn',
+                 name=None):
+        super(LiteConv, self).__init__()
+
+        def make_conv(ch_in, ch_out, filter_size, groups):
+            return ConvNormLayer(
+                ch_in,
+                ch_out,
+                filter_size=filter_size,
+                stride=stride,
+                groups=groups,
+                norm_type=norm_type,
+                initializer=XavierUniform())
+
+        # (suffix, ch_in, ch_out, kernel, groups, followed by ReLU6?)
+        stages = [
+            ('1', in_channels, in_channels, 5, in_channels, True),
+            ('2', in_channels, out_channels, 1, 1, with_act),
+            ('3', out_channels, out_channels, 1, 1, True),
+            ('4', out_channels, out_channels, 5, out_channels, with_act),
+        ]
+
+        self.lite_conv = nn.Sequential()
+        for suffix, ch_in, ch_out, ksize, groups, has_act in stages:
+            self.lite_conv.add_sublayer('conv' + suffix,
+                                        make_conv(ch_in, ch_out, ksize,
+                                                  groups))
+            if has_act:
+                self.lite_conv.add_sublayer('relu6_' + suffix, nn.ReLU6())
+
+    def forward(self, inputs):
+        """Run ``inputs`` through the sequential stack."""
+        return self.lite_conv(inputs)
+
+
+class DropBlock(nn.Layer):
+    def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'):
+        """
+        DropBlock layer, see https://arxiv.org/abs/1810.12890
+
+        Args:
+            block_size (int): side length of the square blocks to drop
+            keep_prob (float): keep probability; 1 disables the layer
+            name (str): layer name
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(DropBlock, self).__init__()
+        self.block_size = block_size
+        self.keep_prob = keep_prob
+        self.name = name
+        self.data_format = data_format
+
+    def forward(self, x):
+        """Drop square blocks of ``x`` and rescale the rest; identity when
+        not training or when ``keep_prob == 1``."""
+        if not self.training or self.keep_prob == 1:
+            return x
+
+        spatial = x.shape[2:] if self.data_format == 'NCHW' else x.shape[1:3]
+
+        # Per-position seed probability, corrected for the valid region.
+        gamma = (1. - self.keep_prob) / (self.block_size**2)
+        for s in spatial:
+            gamma *= s / (s - self.block_size + 1)
+
+        seeds = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype)
+        # Max pooling grows every seed into a block_size x block_size block.
+        dropped = F.max_pool2d(
+            seeds,
+            self.block_size,
+            stride=1,
+            padding=self.block_size // 2,
+            data_format=self.data_format)
+        keep_mask = 1. - dropped
+        # Rescale by (total elements / kept elements).
+        return x * keep_mask * (keep_mask.numel() / keep_mask.sum())
+
+
+@register
+@serializable
+class AnchorGeneratorSSD(object):
+    """Prior-box (anchor) generator for SSD-style heads.
+
+    When both ``min_sizes`` and ``max_sizes`` are empty lists, they are
+    derived from ``min_ratio``, ``max_ratio`` and ``base_size``; the lists
+    passed in by the caller are never modified.
+
+    Args:
+        steps (list): per-level step, passed to ``ops.prior_box`` as
+            ``[step, step]``.
+        aspect_ratios (list[list]): per-level aspect ratios.
+        min_ratio (int): smallest size ratio in percent of ``base_size``.
+        max_ratio (int): largest size ratio in percent of ``base_size``.
+        base_size (int): reference size for the derived sizes.
+        min_sizes (list): per-level minimum sizes.
+        max_sizes (list): per-level maximum sizes.
+        offset (float): passed to ``ops.prior_box``.
+        flip (bool): passed to ``ops.prior_box``.
+        clip (bool): passed to ``ops.prior_box``.
+        min_max_aspect_ratios_order (bool): passed to ``ops.prior_box``.
+
+    Attributes:
+        num_priors (list[int]): number of priors per location for each level.
+    """
+
+    def __init__(self,
+                 steps=[8, 16, 32, 64, 100, 300],
+                 aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
+                 min_ratio=15,
+                 max_ratio=90,
+                 base_size=300,
+                 min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0],
+                 max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0],
+                 offset=0.5,
+                 flip=True,
+                 clip=False,
+                 min_max_aspect_ratios_order=False):
+        self.steps = steps
+        self.aspect_ratios = aspect_ratios
+        self.min_ratio = min_ratio
+        self.max_ratio = max_ratio
+        self.base_size = base_size
+        self.min_sizes = min_sizes
+        self.max_sizes = max_sizes
+        self.offset = offset
+        self.flip = flip
+        self.clip = clip
+        self.min_max_aspect_ratios_order = min_max_aspect_ratios_order
+
+        if self.min_sizes == [] and self.max_sizes == []:
+            num_layer = len(aspect_ratios)
+            step = int(
+                math.floor((self.max_ratio - self.min_ratio) / (num_layer - 2)))
+            # Collect into fresh lists: appending to the lists received as
+            # arguments would alter objects owned by the caller, and a second
+            # construction from the same lists would then skip this branch.
+            derived_min = []
+            derived_max = []
+            for ratio in range(self.min_ratio, self.max_ratio + 1, step):
+                derived_min.append(self.base_size * ratio / 100.)
+                derived_max.append(self.base_size * (ratio + step) / 100.)
+            self.min_sizes = [self.base_size * .10] + derived_min
+            self.max_sizes = [self.base_size * .20] + derived_max
+
+        self.num_priors = []
+        for aspect_ratio, min_size, max_size in zip(
+                aspect_ratios, self.min_sizes, self.max_sizes):
+            if isinstance(min_size, (list, tuple)):
+                self.num_priors.append(
+                    len(_to_list(min_size)) + len(_to_list(max_size)))
+            else:
+                self.num_priors.append((len(aspect_ratio) * 2 + 1) * len(
+                    _to_list(min_size)) + len(_to_list(max_size)))
+
+    def __call__(self, inputs, image):
+        """Generate prior boxes for every feature level.
+
+        Args:
+            inputs (list): feature maps, one per level.
+            image: the input image, passed to ``ops.prior_box``.
+
+        Returns:
+            list: one ``[-1, 4]`` box tensor per level.
+        """
+        boxes = []
+        for feat, min_size, max_size, aspect_ratio, step in zip(
+                inputs, self.min_sizes, self.max_sizes, self.aspect_ratios,
+                self.steps):
+            box, _ = ops.prior_box(
+                input=feat,
+                image=image,
+                min_sizes=_to_list(min_size),
+                max_sizes=_to_list(max_size),
+                aspect_ratios=aspect_ratio,
+                flip=self.flip,
+                clip=self.clip,
+                steps=[step, step],
+                offset=self.offset,
+                min_max_aspect_ratios_order=self.min_max_aspect_ratios_order)
+            boxes.append(paddle.reshape(box, [-1, 4]))
+        return boxes
+
+
+@register
+@serializable
+class RCNNBox(object):
+    """Decode R-CNN head outputs into boxes clipped to the image.
+
+    Args:
+        prior_box_var (list[float]): passed to ``delta2bbox`` as its third
+            argument.
+        code_type (str): stored on the instance; not read in ``__call__``.
+        box_normalized (bool): stored on the instance; not read in
+            ``__call__``.
+        num_classes (int): size of the class axis the decoded boxes are
+            expanded to.
+        export_onnx (bool): when True, only the first image of the batch is
+            used to build the clipping shape.
+    """
+    __shared__ = ['num_classes', 'export_onnx']
+
+    def __init__(self,
+                 prior_box_var=[10., 10., 5., 5.],
+                 code_type="decode_center_size",
+                 box_normalized=False,
+                 num_classes=80,
+                 export_onnx=False):
+        super(RCNNBox, self).__init__()
+        self.prior_box_var = prior_box_var
+        self.code_type = code_type
+        self.box_normalized = box_normalized
+        self.num_classes = num_classes
+        self.export_onnx = export_onnx
+
+    def __call__(self, bbox_head_out, rois, im_shape, scale_factor):
+        """Decode and clip the predicted boxes.
+
+        Args:
+            bbox_head_out (sequence): ``[0]`` is the box regression output,
+                ``[1]`` the class probabilities.
+            rois (sequence): ``[0]`` is the RoIs (concatenated with
+                ``paddle.concat``), ``[1]`` the number of RoIs per image.
+            im_shape (Tensor): per-image shape; column 0 is used as height
+                and column 1 as width for clipping.
+            scale_factor: not read by this method.
+
+        Returns:
+            tuple: ``((bbox, rois_num), scores)`` where ``bbox`` holds the
+            clipped x1, y1, x2, y2 in its last axis and ``scores`` is the
+            class probabilities without their last column.
+        """
+        bbox_pred = bbox_head_out[0]
+        cls_prob = bbox_head_out[1]
+        roi = rois[0]
+        rois_num = rois[1]
+
+        if self.export_onnx:
+            # Only the first image's RoI count and shape are used here.
+            onnx_rois_num_per_im = rois_num[0]
+            origin_shape = paddle.expand(im_shape[0, :],
+                                         [onnx_rois_num_per_im, 2])
+
+        else:
+            origin_shape_list = []
+            if isinstance(roi, list):
+                batch_size = len(roi)
+            else:
+                batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1])
+
+            # bbox_pred.shape: [N, C*4]
+            # Repeat each image's shape once per RoI of that image so that it
+            # lines up row-by-row with the decoded boxes.
+            for idx in range(batch_size):
+                rois_num_per_im = rois_num[idx]
+                expand_im_shape = paddle.expand(im_shape[idx, :],
+                                                [rois_num_per_im, 2])
+                origin_shape_list.append(expand_im_shape)
+
+            origin_shape = paddle.concat(origin_shape_list)
+
+        # bbox_pred.shape: [N, C*4]
+        # C=num_classes in faster/mask rcnn(bbox_head), C=1 in cascade rcnn(cascade_head)
+        bbox = paddle.concat(roi)
+        bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
+        # Drop the last probability column (presumably the background
+        # class -- confirm against the head that produces cls_prob).
+        scores = cls_prob[:, :-1]
+
+        # bbox.shape: [N, C, 4]
+        # bbox.shape[1] must be equal to scores.shape[1]
+        total_num = bbox.shape[0]
+        bbox_dim = bbox.shape[-1]
+        bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim])
+
+        # Clip every coordinate to [0, width] / [0, height] of its image.
+        origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1)
+        origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1)
+        zeros = paddle.zeros_like(origin_h)
+        x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros)
+        y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros)
+        x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros)
+        y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros)
+        bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
+        bboxes = (bbox, rois_num)
+        return bboxes, scores
+
+
+@register
+@serializable
+class MultiClassNMS(object):
+    def __init__(self,
+                 score_threshold=.05,
+                 nms_top_k=-1,
+                 keep_top_k=100,
+                 nms_threshold=.5,
+                 normalized=True,
+                 nms_eta=1.0,
+                 return_index=False,
+                 return_rois_num=True,
+                 trt=False):
+        super(MultiClassNMS, self).__init__()
+        self.score_threshold = score_threshold
+        self.nms_top_k = nms_top_k
+        self.keep_top_k = keep_top_k
+        self.nms_threshold = nms_threshold
+        self.normalized = normalized
+        self.nms_eta = nms_eta
+        self.return_index = return_index
+        self.return_rois_num = return_rois_num
+        self.trt = trt
+
+    def __call__(self, bboxes, score, background_label=-1):
+        """
+        Apply multi-class NMS through ``ops.multiclass_nms``.
+
+        bboxes (Tensor|tuple): 1. (Tensor) Predicted bboxes with shape
+            [N, M, 4], N is the batch size and M is the number of bboxes
+            2. (tuple) (bboxes, bbox_num): bboxes of shape [M, C, 4], C is the
+            class number; bbox_num of shape [N,], bboxes per batch element
+        score (Tensor): Predicted scores with shape [N, C, M] or [M, C]
+        background_label (int): Ignore the background label; For example, RCNN
+            is num_classes and YOLO is -1. Only forwarded when > -1.
+        Returns: the result of ``ops.multiclass_nms``; on the TensorRT path
+            a (bbox, bbox_num, None) tuple with the -1 rows filtered out.
+        """
+        kwargs = self.__dict__.copy()
+        if isinstance(bboxes, tuple):
+            bboxes, bbox_num = bboxes
+            kwargs.update({'rois_num': bbox_num})
+        if background_label > -1:
+            kwargs.update({'background_label': background_label})
+        kwargs.pop('trt')
+        # TODO(wangxinxin08): nms on tensorrt needs paddle develop (major 0) or >= 2.3; tuple compare so 3.x passes
+        if self.trt and (int(paddle.version.major) == 0 or
+                         (int(paddle.version.major),
+                          int(paddle.version.minor)) >= (2, 3)):
+            # TODO(wangxinxin08): tricky switch to run nms on tensorrt
+            kwargs.update({'nms_eta': 1.1})
+            bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs)
+            bbox = bbox.reshape([1, -1, 6])
+            idx = paddle.nonzero(bbox[..., 0] != -1)  # rows whose first column is -1 are dropped
+            bbox = paddle.gather_nd(bbox, idx)
+            return bbox, bbox_num, None
+        else:
+            return ops.multiclass_nms(bboxes, score, **kwargs)
+
+
+@register
+@serializable
+class MatrixNMS(object):
+    __append_doc__ = True
+
+    def __init__(self,
+                 score_threshold=.05,
+                 post_threshold=.05,
+                 nms_top_k=-1,
+                 keep_top_k=100,
+                 use_gaussian=False,
+                 gaussian_sigma=2.,
+                 normalized=False,
+                 background_label=0):
+        super(MatrixNMS, self).__init__()
+        self.score_threshold = score_threshold
+        self.post_threshold = post_threshold
+        self.nms_top_k = nms_top_k
+        self.keep_top_k = keep_top_k
+        self.normalized = normalized
+        self.use_gaussian = use_gaussian
+        self.gaussian_sigma = gaussian_sigma
+        self.background_label = background_label
+
+    def __call__(self, bbox, score, *args):
+        return ops.matrix_nms(
+            bboxes=bbox,
+            scores=score,
+            score_threshold=self.score_threshold,
+            post_threshold=self.post_threshold,
+            nms_top_k=self.nms_top_k,
+            keep_top_k=self.keep_top_k,
+            use_gaussian=self.use_gaussian,
+            gaussian_sigma=self.gaussian_sigma,
+            background_label=self.background_label,
+            normalized=self.normalized)
+
+
+@register
+@serializable
+class YOLOBox(object):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 num_classes=80,
+                 conf_thresh=0.005,
+                 downsample_ratio=32,
+                 clip_bbox=True,
+                 scale_x_y=1.):
+        self.num_classes = num_classes
+        self.conf_thresh = conf_thresh
+        self.downsample_ratio = downsample_ratio
+        self.clip_bbox = clip_bbox
+        self.scale_x_y = scale_x_y
+
+    def __call__(self,
+                 yolo_head_out,
+                 anchors,
+                 im_shape,
+                 scale_factor,
+                 var_weight=None):
+        boxes_list = []
+        scores_list = []
+        origin_shape = im_shape / scale_factor
+        origin_shape = paddle.cast(origin_shape, 'int32')
+        for i, head_out in enumerate(yolo_head_out):
+            boxes, scores = paddle.vision.ops.yolo_box(
+                head_out,
+                origin_shape,
+                anchors[i],
+                self.num_classes,
+                self.conf_thresh,
+                self.downsample_ratio // 2**i,
+                self.clip_bbox,
+                scale_x_y=self.scale_x_y)
+            boxes_list.append(boxes)
+            scores_list.append(paddle.transpose(scores, perm=[0, 2, 1]))
+        yolo_boxes = paddle.concat(boxes_list, axis=1)
+        yolo_scores = paddle.concat(scores_list, axis=2)
+        return yolo_boxes, yolo_scores
+
+
+@register
+@serializable
+class SSDBox(object):
+    def __init__(self,
+                 is_normalized=True,
+                 prior_box_var=[0.1, 0.1, 0.2, 0.2],
+                 use_fuse_decode=False):
+        self.is_normalized = is_normalized
+        self.norm_delta = float(not self.is_normalized)
+        self.prior_box_var = prior_box_var
+        self.use_fuse_decode = use_fuse_decode
+
+    def __call__(self,
+                 preds,
+                 prior_boxes,
+                 im_shape,
+                 scale_factor,
+                 var_weight=None):
+        boxes, scores = preds
+        boxes = paddle.concat(boxes, axis=1)
+        prior_boxes = paddle.concat(prior_boxes)
+        if self.use_fuse_decode:
+            output_boxes = ops.box_coder(
+                prior_boxes,
+                self.prior_box_var,
+                boxes,
+                code_type="decode_center_size",
+                box_normalized=self.is_normalized)
+        else:
+            pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta
+            pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta
+            pb_x = prior_boxes[:, 0] + pb_w * 0.5
+            pb_y = prior_boxes[:, 1] + pb_h * 0.5
+            out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0]
+            out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1]
+            out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w
+            out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h
+            output_boxes = paddle.stack(
+                [
+                    out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2.,
+                    out_y + out_h / 2.
+                ],
+                axis=-1)
+
+        if self.is_normalized:
+            h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1)
+            w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1)
+            im_shape = paddle.stack([w, h, w, h], axis=-1)
+            output_boxes *= im_shape
+        else:
+            output_boxes[..., -2:] -= 1.0
+        output_scores = F.softmax(paddle.concat(
+            scores, axis=1)).transpose([0, 2, 1])
+
+        return output_boxes, output_scores
+
+
+@register
+class TTFBox(object):
+    __shared__ = ['down_ratio']
+
+    def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4):
+        super(TTFBox, self).__init__()
+        self.max_per_img = max_per_img
+        self.score_thresh = score_thresh
+        self.down_ratio = down_ratio
+
+    def _simple_nms(self, heat, kernel=3):
+        """
+        Use maxpool to filter the max score, get local peaks.
+        """
+        pad = (kernel - 1) // 2
+        hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)
+        keep = paddle.cast(hmax == heat, 'float32')
+        return heat * keep
+
+    def _topk(self, scores):
+        """
+        Select top k scores and decode to get xy coordinates.
+        """
+        k = self.max_per_img
+        shape_fm = paddle.shape(scores)
+        shape_fm.stop_gradient = True
+        cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3]
+        # batch size is 1
+        scores_r = paddle.reshape(scores, [cat, -1])
+        topk_scores, topk_inds = paddle.topk(scores_r, k)
+        topk_ys = topk_inds // width
+        topk_xs = topk_inds % width
+
+        topk_score_r = paddle.reshape(topk_scores, [-1])
+        topk_score, topk_ind = paddle.topk(topk_score_r, k)
+        k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64')
+        topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32')
+
+        topk_inds = paddle.reshape(topk_inds, [-1])
+        topk_ys = paddle.reshape(topk_ys, [-1, 1])
+        topk_xs = paddle.reshape(topk_xs, [-1, 1])
+        topk_inds = paddle.gather(topk_inds, topk_ind)
+        topk_ys = paddle.gather(topk_ys, topk_ind)
+        topk_xs = paddle.gather(topk_xs, topk_ind)
+
+        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
+
+    def _decode(self, hm, wh, im_shape, scale_factor):
+        heatmap = F.sigmoid(hm)
+        heat = self._simple_nms(heatmap)
+        scores, inds, clses, ys, xs = self._topk(heat)
+        ys = paddle.cast(ys, 'float32') * self.down_ratio
+        xs = paddle.cast(xs, 'float32') * self.down_ratio
+        scores = paddle.tensor.unsqueeze(scores, [1])
+        clses = paddle.tensor.unsqueeze(clses, [1])
+
+        wh_t = paddle.transpose(wh, [0, 2, 3, 1])
+        wh = paddle.reshape(wh_t, [-1, paddle.shape(wh_t)[-1]])
+        wh = paddle.gather(wh, inds)
+
+        x1 = xs - wh[:, 0:1]
+        y1 = ys - wh[:, 1:2]
+        x2 = xs + wh[:, 2:3]
+        y2 = ys + wh[:, 3:4]
+
+        bboxes = paddle.concat([x1, y1, x2, y2], axis=1)
+
+        scale_y = scale_factor[:, 0:1]
+        scale_x = scale_factor[:, 1:2]
+        scale_expand = paddle.concat(
+            [scale_x, scale_y, scale_x, scale_y], axis=1)
+        boxes_shape = paddle.shape(bboxes)
+        boxes_shape.stop_gradient = True
+        scale_expand = paddle.expand(scale_expand, shape=boxes_shape)
+        bboxes = paddle.divide(bboxes, scale_expand)
+        results = paddle.concat([clses, scores, bboxes], axis=1)
+        # hack: append result with cls=-1 and score=1. to avoid all scores
+        # are less than score_thresh which may cause error in gather.
+        fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]]))
+        fill_r = paddle.cast(fill_r, results.dtype)
+        results = paddle.concat([results, fill_r])
+        scores = results[:, 1]
+        valid_ind = paddle.nonzero(scores > self.score_thresh)
+        results = paddle.gather(results, valid_ind)
+        return results, paddle.shape(results)[0:1]
+
+    def __call__(self, hm, wh, im_shape, scale_factor):
+        results = []
+        results_num = []
+        for i in range(scale_factor.shape[0]):
+            result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ],
+                                       im_shape[i:i + 1, ],
+                                       scale_factor[i:i + 1, ])
+            results.append(result)
+            results_num.append(num)
+        results = paddle.concat(results, axis=0)
+        results_num = paddle.concat(results_num, axis=0)
+        return results, results_num
+
+
+@register
+@serializable
+class JDEBox(object):
+    __shared__ = ['num_classes']
+
+    def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32):
+        self.num_classes = num_classes
+        self.conf_thresh = conf_thresh
+        self.downsample_ratio = downsample_ratio
+
+    def generate_anchor(self, nGh, nGw, anchor_wh):
+        nA = len(anchor_wh)
+        yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)])
+        mesh = paddle.stack(
+            (xv, yv), axis=0).cast(dtype='float32')  # 2 x nGh x nGw
+        meshs = paddle.tile(mesh, [nA, 1, 1, 1])
+
+        anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat(
+            int(nGh), axis=-2).repeat(
+                int(nGw), axis=-1)
+        anchor_offset_mesh = paddle.to_tensor(
+            anchor_offset_mesh.astype(np.float32))
+        # nA x 2 x nGh x nGw
+
+        anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1)
+        anchor_mesh = paddle.transpose(anchor_mesh,
+                                       [0, 2, 3, 1])  # (nA x nGh x nGw) x 4
+        return anchor_mesh
+
+    def decode_delta(self, delta, fg_anchor_list):
+        px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \
+                        fg_anchor_list[:, 2], fg_anchor_list[:,3]
+        dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3]
+        gx = pw * dx + px
+        gy = ph * dy + py
+        gw = pw * paddle.exp(dw)
+        gh = ph * paddle.exp(dh)
+        gx1 = gx - gw * 0.5
+        gy1 = gy - gh * 0.5
+        gx2 = gx + gw * 0.5
+        gy2 = gy + gh * 0.5
+        return paddle.stack([gx1, gy1, gx2, gy2], axis=1)
+
+    def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec):
+        anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec)
+        anchor_mesh = paddle.unsqueeze(anchor_mesh, 0)
+        pred_list = self.decode_delta(
+            paddle.reshape(
+                delta_map, shape=[-1, 4]),
+            paddle.reshape(
+                anchor_mesh, shape=[-1, 4]))
+        pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4])
+        return pred_map
+
+    def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec):
+        boxes_shape = head_out.shape  # [nB, nA*6, nGh, nGw]
+        nGh, nGw = boxes_shape[-2], boxes_shape[-1]
+        nB = 1  # TODO: only support bs=1 now
+        boxes_list, scores_list = [], []
+        for idx in range(nB):
+            p = paddle.reshape(
+                head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw])
+            p = paddle.transpose(p, perm=[0, 2, 3, 1])  # [nA, nGh, nGw, 6]
+            delta_map = p[:, :, :, :4]
+            boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec)
+            # [nA * nGh * nGw, 4]
+            boxes_list.append(boxes * stride)
+
+            p_conf = paddle.transpose(
+                p[:, :, :, 4:6], perm=[3, 0, 1, 2])  # [2, nA, nGh, nGw]
+            p_conf = F.softmax(
+                p_conf, axis=0)[1, :, :, :].unsqueeze(-1)  # [nA, nGh, nGw, 1]
+            scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1])
+            scores_list.append(scores)
+
+        boxes_results = paddle.stack(boxes_list)
+        scores_results = paddle.stack(scores_list)
+        return boxes_results, scores_results
+
+    def __call__(self, yolo_head_out, anchors):
+        bbox_pred_list = []
+        for i, head_out in enumerate(yolo_head_out):
+            stride = self.downsample_ratio // 2**i
+            anc_w, anc_h = anchors[i][0::2], anchors[i][1::2]
+            anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride
+            nA = len(anc_w)
+            boxes, scores = self._postprocessing_by_level(nA, stride, head_out,
+                                                          anchor_vec)
+            bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1))
+
+        yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1)
+        boxes_idx_over_conf_thr = paddle.nonzero(
+            yolo_boxes_scores[:, :, -1] > self.conf_thresh)
+        boxes_idx_over_conf_thr.stop_gradient = True
+
+        return boxes_idx_over_conf_thr, yolo_boxes_scores
+
+
+@register
+@serializable
+class MaskMatrixNMS(object):
+    """
+    Matrix NMS for multi-class masks.
+    Args:
+        update_threshold (float): Threshold applied to the category scores after they are decayed.
+        pre_nms_top_n (int): Number of total instance to be kept per image before NMS
+        post_nms_top_n (int): Number of total instance to be kept per image after NMS.
+        kernel (str):  'linear' or 'gaussian'.
+        sigma (float): std in gaussian method.
+    Input:
+        seg_preds (Variable): shape (n, h, w), segmentation feature maps
+        seg_masks (Variable): shape (n, h, w), segmentation feature maps
+        cate_labels (Variable): shape (n), mask labels in descending order
+        cate_scores (Variable): shape (n), mask scores in descending order
+        sum_masks (Variable): a float tensor of the sum of seg_masks
+    Returns:
+        tuple: (seg_preds, cate_scores, cate_labels) of the instances kept after NMS
+    """
+
+    def __init__(self,
+                 update_threshold=0.05,
+                 pre_nms_top_n=500,
+                 post_nms_top_n=100,
+                 kernel='gaussian',
+                 sigma=2.0):
+        super(MaskMatrixNMS, self).__init__()
+        self.update_threshold = update_threshold
+        self.pre_nms_top_n = pre_nms_top_n
+        self.post_nms_top_n = post_nms_top_n
+        self.kernel = kernel
+        self.sigma = sigma
+
+    def _sort_score(self, scores, top_num):
+        if paddle.shape(scores)[0] > top_num:
+            return paddle.topk(scores, top_num)[1]
+        else:
+            return paddle.argsort(scores, descending=True)
+
+    def __call__(self,
+                 seg_preds,
+                 seg_masks,
+                 cate_labels,
+                 cate_scores,
+                 sum_masks=None):
+        # sort and keep top nms_pre
+        sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
+        seg_masks = paddle.gather(seg_masks, index=sort_inds)
+        seg_preds = paddle.gather(seg_preds, index=sort_inds)
+        sum_masks = paddle.gather(sum_masks, index=sort_inds)
+        cate_scores = paddle.gather(cate_scores, index=sort_inds)
+        cate_labels = paddle.gather(cate_labels, index=sort_inds)
+
+        seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
+        # inter.
+        inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))
+        n_samples = paddle.shape(cate_labels)
+        # union.
+        sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
+        # iou.
+        iou_matrix = (inter_matrix / (
+            sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix))
+        iou_matrix = paddle.triu(iou_matrix, diagonal=1)
+        # label_specific matrix.
+        cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])
+        label_matrix = paddle.cast(
+            (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
+            'float32')
+        label_matrix = paddle.triu(label_matrix, diagonal=1)
+
+        # IoU compensation
+        compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
+        compensate_iou = paddle.expand(
+            compensate_iou, shape=[n_samples, n_samples])
+        compensate_iou = paddle.transpose(compensate_iou, [1, 0])
+
+        # IoU decay
+        decay_iou = iou_matrix * label_matrix
+
+        # matrix nms
+        if self.kernel == 'gaussian':
+            decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
+            compensate_matrix = paddle.exp(-1 * self.sigma *
+                                           (compensate_iou**2))
+            decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
+                                           axis=0)
+        elif self.kernel == 'linear':
+            decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
+            decay_coefficient = paddle.min(decay_matrix, axis=0)
+        else:
+            raise NotImplementedError
+
+        # update the score.
+        cate_scores = cate_scores * decay_coefficient
+        y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32')
+        keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,
+                            y)
+        keep = paddle.nonzero(keep)
+        keep = paddle.squeeze(keep, axis=[1])
+        # Prevent empty and increase fake data
+        keep = paddle.concat(
+            [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')])
+
+        seg_preds = paddle.gather(seg_preds, index=keep)
+        cate_scores = paddle.gather(cate_scores, index=keep)
+        cate_labels = paddle.gather(cate_labels, index=keep)
+
+        # sort and keep top_k
+        sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
+        seg_preds = paddle.gather(seg_preds, index=sort_inds)
+        cate_scores = paddle.gather(cate_scores, index=sort_inds)
+        cate_labels = paddle.gather(cate_labels, index=sort_inds)
+        return seg_preds, cate_scores, cate_labels
+
+
+def Conv2d(in_channels,
+           out_channels,
+           kernel_size,
+           stride=1,
+           padding=0,
+           dilation=1,
+           groups=1,
+           bias=True,
+           weight_init=Normal(std=0.001),
+           bias_init=Constant(0.)):
+    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
+    if bias:
+        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
+    else:
+        bias_attr = False
+    conv = nn.Conv2D(
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        groups,
+        weight_attr=weight_attr,
+        bias_attr=bias_attr)
+    return conv
+
+
+def ConvTranspose2d(in_channels,
+                    out_channels,
+                    kernel_size,
+                    stride=1,
+                    padding=0,
+                    output_padding=0,
+                    groups=1,
+                    bias=True,
+                    dilation=1,
+                    weight_init=Normal(std=0.001),
+                    bias_init=Constant(0.)):
+    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
+    if bias:
+        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
+    else:
+        bias_attr = False
+    conv = nn.Conv2DTranspose(
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        output_padding,
+        dilation,
+        groups,
+        weight_attr=weight_attr,
+        bias_attr=bias_attr)
+    return conv
+
+
+def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True):
+    if not affine:
+        weight_attr = False
+        bias_attr = False
+    else:
+        weight_attr = None
+        bias_attr = None
+    batchnorm = nn.BatchNorm2D(
+        num_features,
+        momentum,
+        eps,
+        weight_attr=weight_attr,
+        bias_attr=bias_attr)
+    return batchnorm
+
+
+def ReLU():
+    return nn.ReLU()
+
+
+def Upsample(scale_factor=None, mode='nearest', align_corners=False):
+    return nn.Upsample(None, scale_factor, mode, align_corners)
+
+
+def MaxPool(kernel_size, stride, padding, ceil_mode=False):
+    return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode)
+
+
+class Concat(nn.Layer):
+    def __init__(self, dim=0):
+        super(Concat, self).__init__()
+        self.dim = dim
+
+    def forward(self, inputs):
+        return paddle.concat(inputs, axis=self.dim)
+
+    def extra_repr(self):
+        return 'dim={}'.format(self.dim)
+
+
+def _convert_attention_mask(attn_mask, dtype):
+    """
+    Convert the attention mask to the target dtype we expect.
+    Thin wrapper over paddle's `nn.layer.transformer._convert_attention_mask`.
+
+    Parameters:
+        attn_mask (Tensor, optional): A tensor used in multi-head attention
+            to prevent attention to some unwanted positions, usually the
+            paddings or the subsequent positions. Its shape is broadcast to
+            `[batch_size, n_head, sequence_length, sequence_length]`.
+            bool dtype: unwanted positions are `False`, the others `True`.
+            int dtype: unwanted positions are 0, the others 1.
+            float dtype: unwanted positions are `-INF`, the others 0.
+            It can be None when no position needs to be masked.
+        dtype (VarType): The target type of `attn_mask` we expect.
+    Returns:
+        Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`.
+    """
+    return nn.layer.transformer._convert_attention_mask(attn_mask, dtype)
+
+
+@register
+class MultiHeadAttention(nn.Layer):
+    """
+    Attention maps queries and a set of key-value pairs to outputs, and
+    Multi-Head Attention performs multiple parallel attentions to jointly attend
+    to information from different representation subspaces.
+
+    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_
+    for more details.
+
+    Parameters:
+        embed_dim (int): The expected feature size in the input and output.
+        num_heads (int): The number of heads in multi-head attention.
+        dropout (float, optional): The dropout probability used on attention
+            weights to drop some attention targets. 0 for no dropout. Default 0
+        kdim (int, optional): The feature size in key. If None, assumed equal to
+            `embed_dim`. Default None.
+        vdim (int, optional): The feature size in value. If None, assumed equal to
+            `embed_dim`. Default None.
+        need_weights (bool, optional): Indicate whether to return the attention
+            weights. Default False.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            # encoder input: [batch_size, sequence_length, d_model]
+            query = paddle.rand((2, 4, 128))
+            # self attention mask: [batch_size, num_heads, query_len, query_len]
+            attn_mask = paddle.rand((2, 2, 4, 4))
+            multi_head_attn = paddle.nn.MultiHeadAttention(128, 2)
+            output = multi_head_attn(query, None, None, attn_mask=attn_mask)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 dropout=0.,
+                 kdim=None,
+                 vdim=None,
+                 need_weights=False):
+        super(MultiHeadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.need_weights = need_weights
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        if self._qkv_same_embed_dim:
+            self.in_proj_weight = self.create_parameter(
+                shape=[embed_dim, 3 * embed_dim],
+                attr=None,
+                dtype=self._dtype,
+                is_bias=False)
+            self.in_proj_bias = self.create_parameter(
+                shape=[3 * embed_dim],
+                attr=None,
+                dtype=self._dtype,
+                is_bias=True)
+        else:
+            self.q_proj = nn.Linear(embed_dim, embed_dim)
+            self.k_proj = nn.Linear(self.kdim, embed_dim)
+            self.v_proj = nn.Linear(self.vdim, embed_dim)
+
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+        self._type_list = ('q_proj', 'k_proj', 'v_proj')
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+            else:
+                constant_(p)
+
+    def compute_qkv(self, tensor, index):
+        if self._qkv_same_embed_dim:
+            tensor = F.linear(
+                x=tensor,
+                weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1)
+                                           * self.embed_dim],
+                bias=self.in_proj_bias[index * self.embed_dim:(index + 1) *
+                                       self.embed_dim]
+                if self.in_proj_bias is not None else None)
+        else:
+            tensor = getattr(self, self._type_list[index])(tensor)
+        tensor = tensor.reshape(
+            [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
+        return tensor
+
+    def forward(self, query, key=None, value=None, attn_mask=None):
+        r"""
+        Applies multi-head attention to map queries and a set of key-value pairs
+        to outputs.
+
+        Parameters:
+            query (Tensor): The queries for multi-head attention. It is a
+                tensor with shape `[batch_size, query_length, embed_dim]`. The
+                data type should be float32 or float64.
+            key (Tensor, optional): The keys for multi-head attention. It is
+                a tensor with shape `[batch_size, key_length, kdim]`. The
+                data type should be float32 or float64. If None, use `query` as
+                `key`. Default None.
+            value (Tensor, optional): The values for multi-head attention. It
+                is a tensor with shape `[batch_size, value_length, vdim]`.
+                The data type should be float32 or float64. If None, use `query` as
+                `value`. Default None.
+            attn_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevent attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`.
+                When the data type is bool, the unwanted positions have `False`
+                values and the others have `True` values. When the data type is
+                int, the unwanted positions have 0 values and the others have 1
+                values. When the data type is float, the unwanted positions have
+                `-INF` values and the others have 0 values. It can be None when
+                nothing wanted or needed to be prevented attention to. Default None.
+
+        Returns:
+            Tensor|tuple: The attention output, a tensor with the same
+                shape and data type as `query`. If `need_weights` is True,
+                a tuple `(output, weights)` is returned instead, where
+                `weights` is the attention weights tensor shaped
+                `[batch_size, num_heads, query_length, key_length]`, taken
+                after dropout has been applied.
+        """
+        key = query if key is None else key
+        value = query if value is None else value
+        # compute q, k, v; compute_qkv returns [batch_size, num_heads, length, head_dim]
+        q, k, v = (self.compute_qkv(t, i)
+                   for i, t in enumerate([query, key, value]))
+
+        # scale dot product attention
+        product = paddle.matmul(x=q, y=k, transpose_y=True)
+        scaling = float(self.head_dim)**-0.5
+        product = product * scaling
+
+        if attn_mask is not None:
+            # Support bool or int mask
+            # the converted mask is added to the scaled logits
+            attn_mask = _convert_attention_mask(attn_mask, product.dtype)
+            product = product + attn_mask
+        # no axis is passed, so paddle's default softmax axis (-1, the key axis) applies
+        weights = F.softmax(product)
+        if self.dropout:
+            weights = F.dropout(
+                weights,
+                self.dropout,
+                training=self.training,
+                mode="upscale_in_train")
+        out = paddle.matmul(weights, v)
+
+        # combine heads
+        out = paddle.transpose(out, perm=[0, 2, 1, 3])
+        # a 0 in the target shape keeps that dimension of the input
+        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
+
+        # project to output
+        out = self.out_proj(out)
+
+        # a bare tensor is returned unless the weights were requested
+        outs = [out]
+        if self.need_weights:
+            outs.append(weights)
+        return out if len(outs) == 1 else tuple(outs)
+
+
+@register
+class ConvMixer(nn.Layer):
+    def __init__(
+            self,
+            dim,
+            depth,
+            kernel_size=3, ):
+        super().__init__()
+        self.dim = dim
+        self.depth = depth
+        self.kernel_size = kernel_size
+
+        self.mixer = self.conv_mixer(dim, depth, kernel_size)
+
+    def forward(self, x):
+        return self.mixer(x)
+
+    @staticmethod
+    def conv_mixer(
+            dim,
+            depth,
+            kernel_size, ):
+        Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim))
+        Residual = type('Residual', (Seq, ),
+                        {'forward': lambda self, x: self[0](x) + x})
+        return Seq(* [
+            Seq(Residual(
+                ActBn(
+                    nn.Conv2D(
+                        dim, dim, kernel_size, groups=dim, padding="same"))),
+                ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth)
+        ])
diff --git a/rtdetr_paddle/ppdet/modeling/losses/__init__.py b/rtdetr_paddle/ppdet/modeling/losses/__init__.py
new file mode 100644
index 0000000..1f633cc
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .iou_loss import *
+from .gfocal_loss import *
+from .detr_loss import *
+from .focal_loss import *
+from .smooth_l1_loss import *
diff --git a/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py
new file mode 100644
index 0000000..24f14c3
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py
@@ -0,0 +1,578 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from .iou_loss import GIoULoss
+from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits
+from ..bbox_utils import bbox_iou
+
+__all__ = ['DETRLoss', 'DINOLoss', 'MaskDINOLoss']
+
+
+@register
+class DETRLoss(nn.Layer):
+    __shared__ = ['num_classes', 'use_focal_loss']
+    __inject__ = ['matcher']
+
+    def __init__(self,
+                 num_classes=80,
+                 matcher='HungarianMatcher',
+                 loss_coeff={
+                     'class': 1,
+                     'bbox': 5,
+                     'giou': 2,
+                     'no_object': 0.1,
+                     'mask': 1,
+                     'dice': 1
+                 },
+                 aux_loss=True,
+                 use_focal_loss=False,
+                 use_vfl=False,
+                 use_uni_match=False,
+                 uni_match_ind=0):
+        r"""
+        Args:
+            num_classes (int): The number of classes.
+            matcher (HungarianMatcher): Computes an assignment between targets and predictions.
+            loss_coeff (dict): The coefficient of each loss term; it is copied, never modified.
+            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
+            use_focal_loss (bool): Use focal loss or not.
+            use_vfl (bool): With focal loss, use the IoU-aware varifocal classification loss.
+            use_uni_match (bool): Match once on aux layer `uni_match_ind` (int), reuse for all aux layers.
+        """
+        super(DETRLoss, self).__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.loss_coeff = dict(loss_coeff)  # copy: 'class' may be replaced by a tensor below
+        self.aux_loss = aux_loss
+        self.use_focal_loss = use_focal_loss
+        self.use_vfl = use_vfl
+        self.use_uni_match = use_uni_match
+        self.uni_match_ind = uni_match_ind
+
+        if not self.use_focal_loss:
+            class_weight = paddle.full([num_classes + 1], loss_coeff['class'])
+            class_weight[-1] = loss_coeff['no_object']
+            self.loss_coeff['class'] = class_weight
+        self.giou_loss = GIoULoss()
+
+    def _get_loss_class(self,
+                        logits,
+                        gt_class,
+                        match_indices,
+                        bg_index,
+                        num_gts,
+                        postfix="",
+                        iou_score=None):
+        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
+        name_class = "loss_class" + postfix
+        # Every query starts as `bg_index`; matched queries are overwritten below.
+        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
+        bs, num_query_objects = target_label.shape
+        num_gt = sum(len(a) for a in gt_class)
+        if num_gt > 0:  # `index` only exists on this path
+            index, updates = self._get_index_updates(num_query_objects,
+                                                     gt_class, match_indices)
+            target_label = paddle.scatter(  # `index` addresses the flattened [b * query] layout
+                target_label.reshape([-1, 1]), index, updates.astype('int64'))
+            target_label = target_label.reshape([bs, num_query_objects])
+        if self.use_focal_loss:
+            target_label = F.one_hot(target_label,  # one-hot, then drop the extra last channel
+                                     self.num_classes + 1)[..., :-1]
+            if iou_score is not None and self.use_vfl:
+                target_score = paddle.zeros([bs, num_query_objects])
+                if num_gt > 0:
+                    target_score = paddle.scatter(  # IoU at matched queries, 0 elsewhere
+                        target_score.reshape([-1, 1]), index, iou_score)
+                target_score = target_score.reshape(
+                    [bs, num_query_objects, 1]) * target_label
+                loss_ = self.loss_coeff['class'] * varifocal_loss_with_logits(
+                    logits, target_score, target_label,
+                    num_gts / num_query_objects)
+            else:
+                loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
+                    logits, target_label, num_gts / num_query_objects)
+        else:
+            loss_ = F.cross_entropy(  # loss_coeff['class'] is passed as the class weight here
+                logits, target_label, weight=self.loss_coeff['class'])
+        return {name_class: loss_}
+
+    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
+                       postfix=""):
+        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
+        name_bbox = "loss_bbox" + postfix
+        name_giou = "loss_giou" + postfix
+        # Returns a dict with the two keys above (L1 term and GIoU term).
+        loss = dict()
+        if sum(len(a) for a in gt_bbox) == 0:
+            loss[name_bbox] = paddle.to_tensor([0.])
+            loss[name_giou] = paddle.to_tensor([0.])
+            return loss
+        # Pair every matched prediction with its ground-truth box.
+        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
+                                                            match_indices)
+        loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
+            src_bbox, target_bbox, reduction='sum') / num_gts
+        loss[name_giou] = self.giou_loss(  # boxes are converted from cxcywh to xyxy first
+            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
+        loss[name_giou] = loss[name_giou].sum() / num_gts
+        loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
+        return loss
+
+    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
+                       postfix=""):
+        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
+        name_mask = "loss_mask" + postfix
+        name_dice = "loss_dice" + postfix
+        # Returns a dict with the two keys above (focal term and dice term).
+        loss = dict()
+        if sum(len(a) for a in gt_mask) == 0:
+            loss[name_mask] = paddle.to_tensor([0.])
+            loss[name_dice] = paddle.to_tensor([0.])
+            return loss
+        # Pair every matched predicted mask with its ground-truth mask.
+        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
+                                                              match_indices)
+        src_masks = F.interpolate(  # resize predictions to the ground-truth resolution
+            src_masks.unsqueeze(0),
+            size=target_masks.shape[-2:],
+            mode="bilinear")[0]
+        loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
+            src_masks,
+            target_masks,
+            paddle.to_tensor(  # third positional argument; presumably the normalizer - confirm
+                [num_gts], dtype='float32'))
+        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
+            src_masks, target_masks, num_gts)
+        return loss
+
+    def _dice_loss(self, inputs, targets, num_gts):  # `inputs` are logits; sigmoid is applied here
+        inputs = F.sigmoid(inputs)
+        inputs = inputs.flatten(1)  # keep axis 0, flatten all remaining axes
+        targets = targets.flatten(1)
+        numerator = 2 * (inputs * targets).sum(1)
+        denominator = inputs.sum(-1) + targets.sum(-1)
+        loss = 1 - (numerator + 1) / (denominator + 1)  # the +1 keeps the ratio defined when both sums are 0
+        return loss.sum() / num_gts
+
+    def _get_loss_aux(self,
+                      boxes,
+                      logits,
+                      gt_bbox,
+                      gt_class,
+                      bg_index,
+                      num_gts,
+                      dn_match_indices=None,
+                      postfix="",
+                      masks=None,
+                      gt_mask=None):
+        loss_class = []  # one entry per layer in `boxes`/`logits`; summed with add_n at the end
+        loss_bbox, loss_giou = [], []
+        loss_mask, loss_dice = [], []
+        if dn_match_indices is not None:  # caller-supplied assignment: the matcher is not run
+            match_indices = dn_match_indices
+        elif self.use_uni_match:  # match once on layer `uni_match_ind`, reuse for every layer
+            match_indices = self.matcher(
+                boxes[self.uni_match_ind],
+                logits[self.uni_match_ind],
+                gt_bbox,
+                gt_class,
+                masks=masks[self.uni_match_ind] if masks is not None else None,
+                gt_mask=gt_mask)
+        for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
+            aux_masks = masks[i] if masks is not None else None
+            if not self.use_uni_match and dn_match_indices is None:  # otherwise: match per layer
+                match_indices = self.matcher(
+                    aux_boxes,
+                    aux_logits,
+                    gt_bbox,
+                    gt_class,
+                    masks=aux_masks,
+                    gt_mask=gt_mask)
+            if self.use_vfl:
+                if sum(len(a) for a in gt_bbox) > 0:
+                    src_bbox, target_bbox = self._get_src_target_assign(
+                        aux_boxes.detach(), gt_bbox, match_indices)  # detached boxes feed the IoU score
+                    iou_score = bbox_iou(
+                        bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
+                        bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
+                else:
+                    iou_score = None
+            else:
+                iou_score = None
+            loss_class.append(
+                self._get_loss_class(aux_logits, gt_class, match_indices,
+                                     bg_index, num_gts, postfix, iou_score)[
+                                         'loss_class' + postfix])
+            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
+                                        num_gts, postfix)
+            loss_bbox.append(loss_['loss_bbox' + postfix])
+            loss_giou.append(loss_['loss_giou' + postfix])
+            if masks is not None and gt_mask is not None:
+                loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
+                                            num_gts, postfix)
+                loss_mask.append(loss_['loss_mask' + postfix])
+                loss_dice.append(loss_['loss_dice' + postfix])
+        loss = {  # keys carry "_aux" before the postfix
+            "loss_class_aux" + postfix: paddle.add_n(loss_class),
+            "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
+            "loss_giou_aux" + postfix: paddle.add_n(loss_giou)
+        }
+        if masks is not None and gt_mask is not None:
+            loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
+            loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
+        return loss
+
+    def _get_index_updates(self, num_query_objects, target, match_indices):
+        batch_idx = paddle.concat([  # image index i, repeated once per matched query of image i
+            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
+        ])
+        src_idx = paddle.concat([src for (src, _) in match_indices])
+        src_idx += (batch_idx * num_query_objects)  # offset into a flattened [b * query] layout
+        target_assign = paddle.concat([  # target rows picked in matched order
+            paddle.gather(
+                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
+        ])
+        return src_idx, target_assign
+
+    def _get_src_target_assign(self, src, target, match_indices):
+        src_assign = paddle.concat([  # matched rows of every `src` item, concatenated
+            paddle.gather(
+                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
+            for t, (I, _) in zip(src, match_indices)
+        ])
+        target_assign = paddle.concat([  # same for `target`; an empty match yields a [0, last_dim] tensor
+            paddle.gather(
+                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
+            for t, (_, J) in zip(target, match_indices)
+        ])
+        return src_assign, target_assign
+
+    def _get_num_gts(self, targets, dtype="float32"):  # returns a 1-element tensor, at least 1
+        num_gts = sum(len(a) for a in targets)
+        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
+        if paddle.distributed.get_world_size() > 1:
+            paddle.distributed.all_reduce(num_gts)  # presumably sums over ranks (default op) - confirm
+            num_gts /= paddle.distributed.get_world_size()
+        num_gts = paddle.clip(num_gts, min=1.)  # the loss methods divide by this value
+        return num_gts
+
+    def _get_prediction_loss(self,
+                             boxes,
+                             logits,
+                             gt_bbox,
+                             gt_class,
+                             masks=None,
+                             gt_mask=None,
+                             postfix="",
+                             dn_match_indices=None,
+                             num_gts=1):
+        if dn_match_indices is None:  # run the matcher only when no assignment is supplied
+            match_indices = self.matcher(
+                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
+        else:
+            match_indices = dn_match_indices
+        # With use_vfl, the IoU of each matched pair is handed to the class loss.
+        if self.use_vfl:
+            if sum(len(a) for a in gt_bbox) > 0:
+                src_bbox, target_bbox = self._get_src_target_assign(
+                    boxes.detach(), gt_bbox, match_indices)
+                iou_score = bbox_iou(
+                    bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
+                    bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
+            else:
+                iou_score = None
+        else:
+            iou_score = None
+        # Merge the class, box and (optional) mask loss dicts into one dict.
+        loss = dict()
+        loss.update(
+            self._get_loss_class(logits, gt_class, match_indices,
+                                 self.num_classes, num_gts, postfix, iou_score))
+        loss.update(
+            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
+                                postfix))
+        if masks is not None and gt_mask is not None:
+            loss.update(
+                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
+                                    postfix))
+        return loss
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None,
+                postfix="",
+                **kwargs):
+        r"""
+        Args:
+            boxes (Tensor): [l, b, query, 4]
+            logits (Tensor): [l, b, query, num_classes]
+            gt_bbox (List(Tensor)): list[[n, 4]]
+            gt_class (List(Tensor)): list[[n, 1]]
+            masks (Tensor, optional): [l, b, query, h, w]
+            gt_mask (List(Tensor), optional): list[[n, H, W]]
+            postfix (str): postfix of loss name
+            kwargs: optional "dn_match_indices" and "num_gts" (computed from gt_class if absent)
+        Returns: dict mapping loss names to loss tensors
+        """
+        dn_match_indices = kwargs.get("dn_match_indices", None)
+        num_gts = kwargs.get("num_gts", None)
+        if num_gts is None:
+            num_gts = self._get_num_gts(gt_class)
+        # Loss of the last layer (index -1).
+        total_loss = self._get_prediction_loss(
+            boxes[-1],
+            logits[-1],
+            gt_bbox,
+            gt_class,
+            masks=masks[-1] if masks is not None else None,
+            gt_mask=gt_mask,
+            postfix=postfix,
+            dn_match_indices=dn_match_indices,
+            num_gts=num_gts)
+        # The remaining layers ([:-1]) add the "*_aux" terms.
+        if self.aux_loss:
+            total_loss.update(
+                self._get_loss_aux(
+                    boxes[:-1],
+                    logits[:-1],
+                    gt_bbox,
+                    gt_class,
+                    self.num_classes,
+                    num_gts,
+                    dn_match_indices,
+                    postfix,
+                    masks=masks[:-1] if masks is not None else None,
+                    gt_mask=gt_mask))
+        return total_loss
+
+
+@register
+class DINOLoss(DETRLoss):
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None,
+                postfix="",
+                dn_out_bboxes=None,
+                dn_out_logits=None,
+                dn_meta=None,
+                **kwargs):
+        """Matching loss plus "_dn" denoising terms (zeros when `dn_meta` is None).
+        `masks`, `gt_mask` and `postfix` are accepted but not used by this method."""
+        num_gts = self._get_num_gts(gt_class)
+        total_loss = super(DINOLoss, self).forward(
+            boxes, logits, gt_bbox, gt_class, num_gts=num_gts)
+        if dn_meta is not None:
+            dn_positive_idx, dn_num_group = \
+                dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+            assert len(gt_class) == len(dn_positive_idx)
+            # denoising match indices
+            dn_match_indices = self.get_dn_match_indices(
+                gt_class, dn_positive_idx, dn_num_group)
+            # compute denoising training loss; num_gts is rebound, not scaled in
+            # place, because the matching loss above already divided by it
+            num_gts = num_gts * dn_num_group
+            dn_loss = super(DINOLoss, self).forward(
+                dn_out_bboxes,
+                dn_out_logits,
+                gt_bbox,
+                gt_class,
+                postfix="_dn",
+                dn_match_indices=dn_match_indices,
+                num_gts=num_gts)
+            total_loss.update(dn_loss)
+        else:
+            total_loss.update(
+                {k + '_dn': paddle.to_tensor([0.])
+                 for k in total_loss.keys()})
+
+        return total_loss
+
+    @staticmethod
+    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):  # one (src, gt) index pair per image
+        dn_match_indices = []
+        for i in range(len(labels)):
+            num_gt = len(labels[i])
+            if num_gt > 0:
+                gt_idx = paddle.arange(end=num_gt, dtype="int64")
+                gt_idx = gt_idx.tile([dn_num_group])  # 0..num_gt-1 repeated once per group
+                assert len(dn_positive_idx[i]) == len(gt_idx)
+                dn_match_indices.append((dn_positive_idx[i], gt_idx))
+            else:
+                dn_match_indices.append((paddle.zeros(  # no ground truth: a pair of empty int64 tensors
+                    [0], dtype="int64"), paddle.zeros(
+                        [0], dtype="int64")))
+        return dn_match_indices
+
+
+@register
+class MaskDINOLoss(DETRLoss):
+    __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
+    __inject__ = ['matcher']
+
+    def __init__(self,
+                 num_classes=80,
+                 matcher='HungarianMatcher',
+                 loss_coeff={
+                     'class': 4,
+                     'bbox': 5,
+                     'giou': 2,
+                     'mask': 5,
+                     'dice': 5
+                 },
+                 aux_loss=True,
+                 use_focal_loss=False,
+                 num_sample_points=12544,
+                 oversample_ratio=3.0,
+                 important_sample_ratio=0.75):
+        # NOTE(review): default loss_coeff has no 'no_object'; DETRLoss.__init__ reads it unless use_focal_loss.
+        super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,
+                                           aux_loss, use_focal_loss)
+        assert oversample_ratio >= 1
+        assert important_sample_ratio <= 1 and important_sample_ratio >= 0
+        self.num_sample_points = num_sample_points
+        self.oversample_ratio = oversample_ratio
+        self.important_sample_ratio = important_sample_ratio
+        self.num_oversample_points = int(num_sample_points * oversample_ratio)  # candidate pool size
+        self.num_important_points = int(num_sample_points *
+                                        important_sample_ratio)
+        self.num_random_points = num_sample_points - self.num_important_points  # remainder, drawn at random
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None,
+                postfix="",
+                dn_out_bboxes=None,
+                dn_out_logits=None,
+                dn_out_masks=None,
+                dn_meta=None,
+                **kwargs):
+        """Matching loss plus "_dn" denoising terms (zeros when `dn_meta` is None).
+        `postfix` is accepted but not used by this method."""
+        num_gts = self._get_num_gts(gt_class)
+        total_loss = super(MaskDINOLoss, self).forward(
+            boxes,
+            logits,
+            gt_bbox,
+            gt_class,
+            masks=masks,
+            gt_mask=gt_mask,
+            num_gts=num_gts)
+        if dn_meta is not None:
+            dn_positive_idx, dn_num_group = \
+                dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+            assert len(gt_class) == len(dn_positive_idx)
+            # denoising match indices
+            dn_match_indices = DINOLoss.get_dn_match_indices(
+                gt_class, dn_positive_idx, dn_num_group)
+            # compute denoising training loss; num_gts is rebound, not scaled in
+            # place, because the matching loss above already divided by it
+            num_gts = num_gts * dn_num_group
+            dn_loss = super(MaskDINOLoss, self).forward(
+                dn_out_bboxes,
+                dn_out_logits,
+                gt_bbox,
+                gt_class,
+                masks=dn_out_masks,
+                gt_mask=gt_mask,
+                postfix="_dn",
+                dn_match_indices=dn_match_indices,
+                num_gts=num_gts)
+            total_loss.update(dn_loss)
+        else:
+            total_loss.update(
+                {k + '_dn': paddle.to_tensor([0.])
+                 for k in total_loss.keys()})
+
+        return total_loss
+
+    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
+                       postfix=""):
+        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
+        name_mask = "loss_mask" + postfix
+        name_dice = "loss_dice" + postfix
+        # Both terms are evaluated on sampled points rather than on full masks.
+        loss = dict()
+        if sum(len(a) for a in gt_mask) == 0:
+            loss[name_mask] = paddle.to_tensor([0.])
+            loss[name_dice] = paddle.to_tensor([0.])
+            return loss
+
+        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
+                                                              match_indices)
+        # sample points
+        sample_points = self._get_point_coords_by_uncertainty(src_masks)
+        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0  # rescale coords: x -> 2x - 1
+        # Read predictions and targets at the same sampled points.
+        src_masks = F.grid_sample(
+            src_masks.unsqueeze(1), sample_points,
+            align_corners=False).squeeze([1, 2])
+
+        target_masks = F.grid_sample(
+            target_masks.unsqueeze(1), sample_points,
+            align_corners=False).squeeze([1, 2]).detach()
+        # BCE: mean over points per mask, summed over masks, divided by num_gts.
+        loss[name_mask] = self.loss_coeff[
+            'mask'] * F.binary_cross_entropy_with_logits(
+                src_masks, target_masks,
+                reduction='none').mean(1).sum() / num_gts
+        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
+            src_masks, target_masks, num_gts)
+        return loss
+
+    def _get_point_coords_by_uncertainty(self, masks):
+        # Sample points based on their uncertainty.
+        masks = masks.detach()
+        num_masks = masks.shape[0]
+        sample_points = paddle.rand(
+            [num_masks, 1, self.num_oversample_points, 2])
+        # Score each candidate by -|mask value|: values nearest 0 rank highest.
+        out_mask = F.grid_sample(
+            masks.unsqueeze(1), 2.0 * sample_points - 1.0,
+            align_corners=False).squeeze([1, 2])
+        out_mask = -paddle.abs(out_mask)
+        # Keep the num_important_points best candidates of every mask.
+        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
+        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
+        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
+        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)  # (mask index, point index) pairs for gather_nd
+
+        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
+        if self.num_random_points > 0:  # top up with fresh random points
+            sample_points = paddle.concat(
+                [
+                    sample_points,
+                    paddle.rand([num_masks, self.num_random_points, 2])
+                ],
+                axis=1)
+        return sample_points
diff --git a/rtdetr_paddle/ppdet/modeling/losses/focal_loss.py b/rtdetr_paddle/ppdet/modeling/losses/focal_loss.py
new file mode 100644
index 0000000..b9a64e1
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/focal_loss.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn.functional as F
+import paddle.nn as nn
+from ppdet.core.workspace import register
+
+__all__ = ['FocalLoss', 'Weighted_FocalLoss']
+
+@register
+class FocalLoss(nn.Layer):
+    """A wrapper around paddle.nn.functional.sigmoid_focal_loss.
+    Args:
+        use_sigmoid (bool): currently only support use_sigmoid=True
+        alpha (float): parameter alpha in Focal Loss
+        gamma (float): parameter gamma in Focal Loss
+        loss_weight (float): final loss will be multiplied by this
+    """
+    def __init__(self,
+                 use_sigmoid=True,
+                 alpha=0.25,
+                 gamma=2.0,
+                 loss_weight=1.0):
+        super(FocalLoss, self).__init__()
+        assert use_sigmoid == True, \
+            'Focal Loss only supports sigmoid at the moment'
+        self.use_sigmoid = use_sigmoid
+        self.alpha = alpha
+        self.gamma = gamma
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, reduction='none'):
+        """Return the sigmoid focal loss of `pred`, scaled by `loss_weight`.
+        Args:
+            pred (Tensor): logits of class prediction, of shape (N, num_classes)
+            target (Tensor): target class label, of shape (N, )
+            reduction (str): the way to reduce loss, one of (none, sum, mean)
+        """
+        num_classes = pred.shape[1]
+        target = F.one_hot(target, num_classes+1).cast(pred.dtype)  # one extra class index ...
+        target = target[:, :-1].detach()  # ... whose column is dropped: label == num_classes -> all-zero row
+        loss = F.sigmoid_focal_loss(
+            pred, target, alpha=self.alpha, gamma=self.gamma,
+            reduction=reduction)
+        return loss * self.loss_weight
+
+
+@register
+class Weighted_FocalLoss(FocalLoss):
+    """Sigmoid focal loss with optional per-element weights and averaging factor.
+    Args:
+        use_sigmoid (bool): currently only support use_sigmoid=True
+        alpha, gamma (float): parameters alpha and gamma in Focal Loss
+        loss_weight (float): final loss will be multiplied by this
+        reduction (str): reduction used by forward unless overridden there
+    """
+    def __init__(self,
+                 use_sigmoid=True,
+                 alpha=0.25,
+                 gamma=2.0,
+                 loss_weight=1.0,
+                 reduction="mean"):
+        super(FocalLoss, self).__init__()  # skips FocalLoss.__init__; the same attributes are set below
+        assert use_sigmoid == True, \
+            'Focal Loss only supports sigmoid at the moment'
+        self.use_sigmoid = use_sigmoid
+        self.alpha = alpha
+        self.gamma = gamma
+        self.loss_weight = loss_weight
+        self.reduction = reduction
+
+    def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
+        """forward function.
+        Args:
+            pred (Tensor): logits of class prediction, of shape (N, num_classes)
+            target (Tensor): target class label, of shape (N, )
+            weight (Tensor, optional): multiplied into the unreduced loss
+            avg_factor (optional): divisor of the summed loss when reduction is 'mean'
+            reduction_override (str, optional): one of (none, sum, mean); replaces self.reduction
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        num_classes = pred.shape[1]
+        target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
+        target = target[:, :-1].detach()
+        loss = F.sigmoid_focal_loss(
+            pred, target, alpha=self.alpha, gamma=self.gamma,
+            reduction='none')
+        if weight is not None:
+            if weight.shape != loss.shape:
+                if weight.shape[0] == loss.shape[0]:
+                    # For most cases, weight is of shape (num_priors, ),
+                    #  which means it does not have the second axis num_class
+                    weight = weight.reshape((-1, 1))
+                else:
+                    # Sometimes, weight per anchor per class is also needed. e.g.
+                    #  in FSAF. But it may be flattened of shape
+                    #  (num_priors x num_class, ), while loss is still of shape
+                    #  (num_priors, num_class).
+                    assert weight.numel() == loss.numel()
+                    weight = weight.reshape((loss.shape[0], -1))
+            assert weight.ndim == loss.ndim
+            loss = loss * weight
+
+        # if avg_factor is not specified, just reduce the loss
+        if avg_factor is None:
+            if reduction == 'mean':
+                loss = loss.mean()
+            elif reduction == 'sum':
+                loss = loss.sum()
+        else:
+            # if reduction is mean, then average the loss by avg_factor
+            if reduction == 'mean':
+                # Avoid causing ZeroDivisionError when avg_factor is 0.0,
+                # i.e., all labels of an image belong to ignore index.
+                eps = 1e-10
+                loss = loss.sum() / (avg_factor + eps)
+            # if reduction is 'none', then do nothing, otherwise raise an error
+            elif reduction != 'none':
+                raise ValueError('avg_factor can not be used with reduction="sum"')
+        return loss * self.loss_weight
diff --git a/rtdetr_paddle/ppdet/modeling/losses/gfocal_loss.py b/rtdetr_paddle/ppdet/modeling/losses/gfocal_loss.py
new file mode 100644
index 0000000..37e27f0
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/gfocal_loss.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The code is based on:
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling import ops
+
+__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
+
+
+def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
+    """
+    Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
+    Qualified and Distributed Bounding Boxes for Dense Object Detection
+    <https://arxiv.org/abs/2006.04388>`_.
+    Args:
+        pred (Tensor): Predicted joint representation of classification
+            and quality (IoU) estimation with shape (N, C), C = num classes.
+        target (tuple([Tensor])): Target category label with shape (N,)
+            and target quality label with shape (N,).
+        beta (float): Exponent of the modulating factor. Defaults to 2.0.
+        use_sigmoid (bool): If True, sigmoid is applied to `pred` and the
+            with-logits binary cross entropy is used. Defaults to True.
+    Returns:
+        Tensor: Loss tensor with shape (N,).
+    """
+    assert len(target) == 2, """target for QFL must be a tuple of two elements,
+        including category label and quality label, respectively"""
+    # label denotes the category id, score denotes the quality score
+    label, score = target
+    if use_sigmoid:
+        func = F.binary_cross_entropy_with_logits
+    else:
+        func = F.binary_cross_entropy
+
+    # negatives are supervised by 0 quality score
+    pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred
+    scale_factor = pred_sigmoid
+    zerolabel = paddle.zeros(pred.shape, dtype='float32')
+    loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta)
+
+    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
+    bg_class_ind = pred.shape[1]
+    pos = paddle.logical_and((label >= 0),
+                             (label < bg_class_ind)).nonzero().squeeze(1)
+    if pos.shape[0] == 0:  # no positives: only the negative loss is returned
+        return loss.sum(axis=1)
+    pos_label = paddle.gather(label, pos, axis=0)
+    pos_mask = np.zeros(pred.shape, dtype=np.int32)
+    pos_mask[pos.numpy(), pos_label.numpy()] = 1  # filled via NumPy indexing
+    pos_mask = paddle.to_tensor(pos_mask, dtype='bool')
+    score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32')
+    # positives are supervised by bbox quality (IoU) score
+    scale_factor_new = score - pred_sigmoid
+
+    loss_pos = func(
+        pred, score, reduction='none') * scale_factor_new.abs().pow(beta)
+    loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask
+    loss = loss.sum(axis=1)
+    return loss
+
+
+def distribution_focal_loss(pred, label):
+    """Distribution Focal Loss (DFL), introduced in `Generalized Focal Loss:
+    Learning Qualified and Distributed Bounding Boxes for Dense Object
+    Detection <https://arxiv.org/abs/2006.04388>`_.
+    Blends the cross entropies of the two integer bins around each label.
+    Args:
+        pred (Tensor): Predicted general distribution of bounding boxes
+            (before softmax) with shape (N, n+1), n is the max value of the
+            integral set `{0, ..., n}` in paper.
+        label (Tensor): Target distance label with shape (N,).
+    Returns:
+        Tensor: Loss tensor with shape (N,).
+    """
+    lower = label.cast('int64')
+    upper = lower + 1
+    w_lower = upper.cast('float32') - label
+    w_upper = label - lower.cast('float32')
+    ce_lower = F.cross_entropy(pred, lower, reduction='none')
+    ce_upper = F.cross_entropy(pred, upper, reduction='none')
+    return ce_lower * w_lower + ce_upper * w_upper
+
+
+@register
+@serializable
+class QualityFocalLoss(nn.Layer):
+    r"""Layer wrapper around `quality_focal_loss`, the QFL term of
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+    Args:
+        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
+            Defaults to True.
+        beta (float): The beta parameter for calculating the modulating factor.
+            Defaults to 2.0.
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Loss weight of current loss.
+    """
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 beta=2.0,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(QualityFocalLoss, self).__init__()
+        assert reduction in ('none', 'mean', 'sum')
+        self.use_sigmoid = use_sigmoid
+        self.beta = beta
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None):
+        """Compute the weighted and reduced QFL.
+        Args:
+            pred (Tensor): Predicted joint representation of
+                classification and quality (IoU) estimation with shape (N, C),
+                C is the number of classes.
+            target (tuple([Tensor])): Target category label with shape
+                (N,) and target quality label with shape (N,).
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+        """
+
+        qfl = quality_focal_loss(
+            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)
+        loss = self.loss_weight * qfl
+        if weight is not None:
+            loss = loss * weight
+
+        mode = self.reduction
+        if avg_factor is not None:
+            # an explicit normaliser is only accepted for 'mean' and 'none'
+            if mode == 'mean':
+                return loss.sum() / avg_factor
+            if mode != 'none':
+                raise ValueError(
+                    'avg_factor can not be used with reduction="sum"')
+            return loss
+        # no avg_factor given: fall back to the plain reduction
+        if mode == 'mean':
+            return loss.mean()
+        if mode == 'sum':
+            return loss.sum()
+        return loss
+
+
+@register
+@serializable
+class DistributionFocalLoss(nn.Layer):
+    """Layer wrapper around `distribution_focal_loss`, the DFL term of
+    `Generalized Focal Loss: Learning Qualified and Distributed Bounding
+    Boxes for Dense Object Detection <https://arxiv.org/abs/2006.04388>`_.
+    Args:
+        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
+        loss_weight (float): Loss weight of current loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(DistributionFocalLoss, self).__init__()
+        assert reduction in ('none', 'mean', 'sum')
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None):
+        """Compute the weighted and reduced DFL.
+        Args:
+            pred (Tensor): Predicted general distribution of bounding
+                boxes (before softmax) with shape (N, n+1), n is the max value
+                of the integral set `{0, ..., n}` in paper.
+            target (Tensor): Target distance label for bounding boxes
+                with shape (N,).
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+        """
+        loss = self.loss_weight * distribution_focal_loss(pred, target)
+        if weight is not None:
+            loss = loss * weight
+
+        mode = self.reduction
+        if avg_factor is not None:
+            # an explicit normaliser is only accepted for 'mean' and 'none'
+            if mode == 'mean':
+                return loss.sum() / avg_factor
+            if mode != 'none':
+                raise ValueError(
+                    'avg_factor can not be used with reduction="sum"')
+            return loss
+        # no avg_factor given: fall back to the plain reduction
+        if mode == 'mean':
+            return loss.mean()
+        if mode == 'sum':
+            return loss.sum()
+        return loss
diff --git a/rtdetr_paddle/ppdet/modeling/losses/iou_loss.py b/rtdetr_paddle/ppdet/modeling/losses/iou_loss.py
new file mode 100644
index 0000000..b5cac22
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/iou_loss.py
@@ -0,0 +1,295 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import math
+import paddle
+
+from ppdet.core.workspace import register, serializable
+from ..bbox_utils import bbox_iou
+
+__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss']
+
+
+@register
+@serializable
+class IouLoss(object):
+    """
+    iou loss, see https://arxiv.org/abs/1908.03851
+    loss = 1.0 - iou * iou
+    Args:
+        loss_weight (float): iou loss weight, default is 2.5
+        giou (bool): passed to bbox_iou as its `giou` flag, default is False
+        diou (bool): passed to bbox_iou as its `diou` flag, default is False
+        ciou (bool): passed to bbox_iou as its `ciou` flag, default is False
+        loss_square (bool): whether to square the iou term
+    """
+
+    def __init__(self,
+                 loss_weight=2.5,
+                 giou=False,
+                 diou=False,
+                 ciou=False,
+                 loss_square=True):
+        # scaling applied to the final loss value
+        self.loss_weight = loss_weight
+        # variant switches, handed to bbox_iou unchanged
+        self.giou, self.diou, self.ciou = giou, diou, ciou
+        self.loss_square = loss_square
+
+    def __call__(self, pbox, gbox):
+        overlap = bbox_iou(
+            pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)
+
+        # squared variant: 1 - iou^2, otherwise the plain 1 - iou
+        if not self.loss_square:
+            penalty = 1 - overlap
+        else:
+            penalty = 1 - overlap * overlap
+        return penalty * self.loss_weight
+
+
+@register
+@serializable
+class GIoULoss(object):
+    """
+    Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630
+    Args:
+        loss_weight (float): giou loss weight, default as 1
+        eps (float): epsilon to avoid divide by zero, default as 1e-10
+        reduction (string): Options are "none", "mean" and "sum". default as none
+    """
+
+    def __init__(self, loss_weight=1., eps=1e-10, reduction='none'):
+        self.loss_weight = loss_weight
+        self.eps = eps
+        assert reduction in ('none', 'mean', 'sum')
+        self.reduction = reduction
+
+    def bbox_overlap(self, box1, box2, eps=1e-10):
+        """calculate the iou of box1 and box2
+        Args:
+            box1 (sequence of Tensor): the (x1, y1, x2, y2) parts of box1
+            box2 (sequence of Tensor): the (x1, y1, x2, y2) parts of box2
+            eps (float): epsilon to avoid divide by zero
+        Return:
+            iou (Tensor): iou of box1 and box2
+            overlap (Tensor): overlap of box1 and box2
+            union (Tensor): union of box1 and box2, with eps already added
+        """
+        x1, y1, x2, y2 = box1
+        x1g, y1g, x2g, y2g = box2
+
+        xkis1 = paddle.maximum(x1, x1g)
+        ykis1 = paddle.maximum(y1, y1g)
+        xkis2 = paddle.minimum(x2, x2g)
+        ykis2 = paddle.minimum(y2, y2g)
+        w_inter = (xkis2 - xkis1).clip(0)
+        h_inter = (ykis2 - ykis1).clip(0)
+        overlap = w_inter * h_inter
+
+        area1 = (x2 - x1) * (y2 - y1)
+        area2 = (x2g - x1g) * (y2g - y1g)
+        union = area1 + area2 - overlap + eps
+        iou = overlap / union
+
+        return iou, overlap, union
+
+    def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None):
+        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
+        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
+        box1 = [x1, y1, x2, y2]
+        box2 = [x1g, y1g, x2g, y2g]
+        iou, overlap, union = self.bbox_overlap(box1, box2, self.eps)
+        xc1 = paddle.minimum(x1, x1g)  # corners of the box enclosing both
+        yc1 = paddle.minimum(y1, y1g)
+        xc2 = paddle.maximum(x2, x2g)
+        yc2 = paddle.maximum(y2, y2g)
+
+        area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps
+        miou = iou - ((area_c - union) / area_c)
+        if loc_reweight is not None:
+            loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1))
+            loc_thresh = 0.9
+            giou = 1 - (1 - loc_thresh
+                        ) * miou - loc_thresh * miou * loc_reweight
+        else:
+            giou = 1 - miou
+        if self.reduction == 'none':
+            loss = giou  # iou_weight is not applied in this branch
+        elif self.reduction == 'sum':
+            loss = paddle.sum(giou * iou_weight)
+        else:
+            loss = paddle.mean(giou * iou_weight)
+        return loss * self.loss_weight
+
+
+@register
+@serializable
+class DIouLoss(GIoULoss):
+    """
+    Distance-IoU Loss, see https://arxiv.org/abs/1911.08287
+    Args:
+        loss_weight (float): diou loss weight, default as 1
+        eps (float): epsilon to avoid divide by zero, default as 1e-10
+        use_complete_iou_loss (bool): whether to use complete iou loss
+    """
+
+    def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True):
+        super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps)
+        self.use_complete_iou_loss = use_complete_iou_loss
+
+    def __call__(self, pbox, gbox, iou_weight=1.):
+        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
+        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
+        cx = (x1 + x2) / 2
+        cy = (y1 + y2) / 2
+        w = x2 - x1
+        h = y2 - y1
+
+        cxg = (x1g + x2g) / 2
+        cyg = (y1g + y2g) / 2
+        wg = x2g - x1g
+        hg = y2g - y1g
+
+        x2 = paddle.maximum(x1, x2)  # force x2 >= x1 on the predicted box
+        y2 = paddle.maximum(y1, y2)  # force y2 >= y1 on the predicted box
+
+        # A and B: corners of the intersection rectangle
+        xkis1 = paddle.maximum(x1, x1g)
+        ykis1 = paddle.maximum(y1, y1g)
+        xkis2 = paddle.minimum(x2, x2g)
+        ykis2 = paddle.minimum(y2, y2g)
+
+        # A or B: corners of the smallest box enclosing both
+        xc1 = paddle.minimum(x1, x1g)
+        yc1 = paddle.minimum(y1, y1g)
+        xc2 = paddle.maximum(x2, x2g)
+        yc2 = paddle.maximum(y2, y2g)
+
+        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)
+        intsctk = intsctk * paddle.greater_than(
+            xkis2, xkis1) * paddle.greater_than(ykis2, ykis1)
+        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g
+                                                        ) - intsctk + self.eps
+        iouk = intsctk / unionk
+
+        # DIOU term: squared center distance over squared enclosing diagonal
+        dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg)
+        dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1)
+        diou_term = (dist_intersection + self.eps) / (dist_union + self.eps)
+
+        # CIOU term: aspect-ratio penalty scaled by a gradient-free alpha
+        ciou_term = 0
+        if self.use_complete_iou_loss:
+            ar_gt = wg / hg  # NOTE(review): unguarded division, no eps here
+            ar_pred = w / h
+            arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred)
+            ar_loss = 4. / np.pi / np.pi * arctan * arctan
+            alpha = ar_loss / (1 - iouk + ar_loss + self.eps)
+            alpha.stop_gradient = True
+            ciou_term = alpha * ar_loss
+
+        diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight)
+
+        return diou * self.loss_weight
+
+
+@register
+@serializable
+class SIoULoss(GIoULoss):
+    """
+    SIoU loss, see https://arxiv.org/pdf/2205.12740.pdf
+    Args:
+        loss_weight (float): siou loss weight, default as 1
+        eps (float): epsilon to avoid divide by zero, default as 1e-10
+        theta (float): exponent of the shape cost, default as 4
+        reduction (str): Options are "none", "mean" and "sum". default as none
+    """
+
+    def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'):
+        super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps)
+        self.loss_weight = loss_weight
+        self.eps = eps
+        self.theta = theta
+        self.reduction = reduction
+
+    def __call__(self, pbox, gbox):
+        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
+        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
+
+        box1 = [x1, y1, x2, y2]
+        box2 = [x1g, y1g, x2g, y2g]
+        iou = bbox_iou(box1, box2)
+
+        cx = (x1 + x2) / 2
+        cy = (y1 + y2) / 2
+        w = x2 - x1 + self.eps
+        h = y2 - y1 + self.eps
+
+        cxg = (x1g + x2g) / 2
+        cyg = (y1g + y2g) / 2
+        wg = x2g - x1g + self.eps
+        hg = y2g - y1g + self.eps
+
+        x2 = paddle.maximum(x1, x2)
+        y2 = paddle.maximum(y1, y2)
+
+        # A or B: corners of the smallest box enclosing both
+        xc1 = paddle.minimum(x1, x1g)
+        yc1 = paddle.minimum(y1, y1g)
+        xc2 = paddle.maximum(x2, x2g)
+        yc2 = paddle.maximum(y2, y2g)
+
+        cw_out = xc2 - xc1
+        ch_out = yc2 - yc1
+
+        ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg)
+        cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg)
+
+        # angle cost; eps under the root avoids 0/0 = NaN for equal centers
+        dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2 + self.eps)
+        sin_angle_alpha = ch / dist_intersection
+        sin_angle_beta = cw / dist_intersection
+        thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2
+        thred.stop_gradient = True
+        sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta,
+                                 sin_angle_alpha)
+        angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2)
+
+        # distance cost
+        gamma = 2 - angle_cost
+        # gamma stays differentiable (stop_gradient is not set on it)
+        beta_x = ((cxg - cx) / cw_out)**2
+        beta_y = ((cyg - cy) / ch_out)**2
+        dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma *
+                                                                     beta_y)
+
+        # shape cost
+        omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg)
+        omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg)
+        omega = (1 - paddle.exp(-omega_w))**self.theta + (
+            1 - paddle.exp(-omega_h))**self.theta
+        siou_loss = 1 - iou + (omega + dist_cost) / 2
+
+        if self.reduction == 'mean':
+            siou_loss = paddle.mean(siou_loss)
+        elif self.reduction == 'sum':
+            siou_loss = paddle.sum(siou_loss)
+
+        return siou_loss * self.loss_weight
diff --git a/rtdetr_paddle/ppdet/modeling/losses/smooth_l1_loss.py b/rtdetr_paddle/ppdet/modeling/losses/smooth_l1_loss.py
new file mode 100644
index 0000000..f89c28f
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/smooth_l1_loss.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+
+__all__ = ['SmoothL1Loss']
+
+@register
+class SmoothL1Loss(nn.Layer):
+    """Smooth L1 regression loss.
+    Args:
+        beta (float): controls smooth region, it becomes L1 Loss when beta=0.0
+        loss_weight (float): the final loss will be multiplied by this
+    """
+    def __init__(self, beta=1.0, loss_weight=1.0):
+        super(SmoothL1Loss, self).__init__()
+        # beta is the width of the quadratic zone around zero, so a
+        # negative value has no meaning
+        assert beta >= 0
+        self.beta = beta
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, reduction='none'):
+        """Element-wise smooth L1 between pred and target, based on fvcore.
+        Args:
+            pred (Tensor): prediction tensor
+            target (Tensor): target tensor, pred.shape must be the same as target.shape
+            reduction (str): the way to reduce loss, one of (none, sum, mean)
+        """
+        assert reduction in ('none', 'sum', 'mean')
+        gap = paddle.abs(pred - target.detach())
+        if not self.beta < 1e-5:
+            # quadratic inside the beta band, linear outside of it
+            quadratic = 0.5 * gap ** 2 / self.beta
+            linear = gap - 0.5 * self.beta
+            gap = paddle.where(gap < self.beta, quadratic, linear)
+        if reduction == 'sum':
+            gap = gap.sum()
+        elif reduction == 'mean':
+            # empty input: use 0 * sum() instead of calling mean()
+            gap = gap.mean() if gap.size > 0 else 0.0 * gap.sum()
+        return gap * self.loss_weight
diff --git a/rtdetr_paddle/ppdet/modeling/losses/varifocal_loss.py b/rtdetr_paddle/ppdet/modeling/losses/varifocal_loss.py
new file mode 100644
index 0000000..42d18a6
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/losses/varifocal_loss.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+# The code is based on:
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling import ops
+
+__all__ = ['VarifocalLoss']
+
+
+def varifocal_loss(pred,
+                   target,
+                   alpha=0.75,
+                   gamma=2.0,
+                   iou_weighted=True,
+                   use_sigmoid=True):
+    """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
+
+    Args:
+        pred (Tensor): The prediction with shape (N, C), C is the
+            number of classes
+        target (Tensor): The learning target of the iou-aware
+            classification score with shape (N, C), C is the number of classes.
+        alpha (float, optional): A balance factor for the negative part of
+            Varifocal Loss, which is different from the alpha of Focal Loss.
+            Defaults to 0.75.
+        gamma (float, optional): The gamma for calculating the modulating
+            factor. Defaults to 2.0.
+        iou_weighted (bool, optional): Whether to weight the loss of the
+            positive example with the iou target. Defaults to True.
+        use_sigmoid (bool, optional): Whether `pred` holds logits, in which
+            case sigmoid is applied internally. Defaults to True.
+    """
+    # pred and target should be of the same size
+    assert pred.shape == target.shape
+    prob = F.sigmoid(pred) if use_sigmoid else pred
+    target = target.cast(pred.dtype)
+
+    # targets above zero are positives, everything else is a negative
+    is_pos = (target > 0.0).cast('float32')
+    is_neg = (target <= 0.0).cast('float32')
+    neg_weight = alpha * (prob - target).abs().pow(gamma) * is_neg
+    if iou_weighted:
+        pos_weight = target * is_pos
+    else:
+        pos_weight = is_pos
+    focal_weight = pos_weight + neg_weight
+
+    if use_sigmoid:
+        bce = F.binary_cross_entropy_with_logits(
+            pred, target, reduction='none')
+        return bce * focal_weight
+    # probability inputs are additionally summed over axis 1
+    bce = F.binary_cross_entropy(pred, target, reduction='none')
+    return (bce * focal_weight).sum(axis=1)
+
+
+@register
+@serializable
+class VarifocalLoss(nn.Layer):
+    def __init__(self,
+                 use_sigmoid=True,
+                 alpha=0.75,
+                 gamma=2.0,
+                 iou_weighted=True,
+                 reduction='mean',
+                 loss_weight=1.0):
+        """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction holds
+                logits that still need a sigmoid. Defaults to True.
+            alpha (float, optional): A balance factor for the negative part of
+                Varifocal Loss, which is different from the alpha of Focal
+                Loss. Defaults to 0.75.
+            gamma (float, optional): The gamma for calculating the modulating
+                factor. Defaults to 2.0.
+            iou_weighted (bool, optional): Whether to weight the loss of the
+                positive examples with the iou target. Defaults to True.
+            reduction (str, optional): The method used to reduce the loss into
+                a scalar. Defaults to 'mean'. Options are "none", "mean" and
+                "sum".
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+        """
+        super(VarifocalLoss, self).__init__()
+        assert alpha >= 0.0
+        self.use_sigmoid = use_sigmoid
+        self.alpha = alpha
+        self.gamma = gamma
+        self.iou_weighted = iou_weighted
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, weight=None, avg_factor=None):
+        """Compute the weighted and reduced varifocal loss.
+
+        Args:
+            pred (Tensor): The prediction.
+            target (Tensor): The learning target of the prediction.
+            weight (Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+        Returns:
+            Tensor: The calculated loss
+        """
+        raw = varifocal_loss(
+            pred,
+            target,
+            alpha=self.alpha,
+            gamma=self.gamma,
+            iou_weighted=self.iou_weighted,
+            use_sigmoid=self.use_sigmoid)
+        loss = self.loss_weight * raw
+        if weight is not None:
+            loss = loss * weight
+
+        mode = self.reduction
+        if avg_factor is not None:
+            # an explicit normaliser is only accepted for 'mean' and 'none'
+            if mode == 'mean':
+                return loss.sum() / avg_factor
+            if mode != 'none':
+                raise ValueError(
+                    'avg_factor can not be used with reduction="sum"')
+            return loss
+        # no avg_factor given: fall back to the plain reduction
+        if mode == 'mean':
+            return loss.mean()
+        if mode == 'sum':
+            return loss.sum()
+        return loss
diff --git a/rtdetr_paddle/ppdet/modeling/ops.py b/rtdetr_paddle/ppdet/modeling/ops.py
new file mode 100644
index 0000000..d9a1192
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/ops.py
@@ -0,0 +1,1114 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+try:
+    import paddle._legacy_C_ops as C_ops
+except:
+    import paddle._C_ops as C_ops
+
+from paddle import in_dynamic_mode
+from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype
+
+# Names exported by a star-import of this module.
+# NOTE(review): `get_act_fn` is defined in this module but is not listed here,
+# so it must be imported by name.
+__all__ = [
+    'prior_box', 'generate_proposals', 'box_coder', 'multiclass_nms',
+    'distribute_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', 'silu',
+    'swish', 'identity', 'anchor_generator'
+]
+
+
+def identity(x):
+    """Return ``x`` unchanged."""
+    return x
+
+
+def mish(x):
+    """Apply the Mish activation, ``x * tanh(softplus(x))``.
+
+    ``F.mish`` is used when the functional module provides it; otherwise the
+    explicit formula is evaluated.
+    """
+    # hasattr() takes the attribute *name*. Passing the function object
+    # itself raised ``TypeError`` on every call.
+    return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x))
+
+
+def silu(x):
+    """Apply ``F.silu`` to ``x``."""
+    return F.silu(x)
+
+
+def swish(x):
+    """Return ``x * sigmoid(x)``, written out with ``F.sigmoid``."""
+    return x * F.sigmoid(x)
+
+
+# Overrides consulted first by get_act_fn when trt=True: both names resolve to
+# the explicit ``x * sigmoid(x)`` implementation.
+# NOTE(review): presumably for TensorRT export compatibility -- confirm.
+TRT_ACT_SPEC = {'swish': swish, 'silu': swish}
+
+# Names resolved to this module's wrappers rather than via getattr on F.
+ACT_SPEC = {'mish': mish, 'silu': silu}
+
+
+def get_act_fn(act=None, trt=False):
+    """Build an activation callable from a name or a config dict.
+
+    Args:
+        act (str|dict|None): Activation name, or a dict holding the name
+            under ``'name'`` with every other item forwarded to the
+            activation as a keyword argument. A falsy value selects
+            ``identity``.
+        trt (bool): When True, names present in ``TRT_ACT_SPEC`` take
+            precedence over ``ACT_SPEC`` and the ``F`` lookup.
+
+    Returns:
+        callable: ``identity`` when ``act`` is falsy, otherwise a
+        one-argument function applying the selected activation.
+
+    Raises:
+        AssertionError: If ``act`` is not a str, dict or None.
+        KeyError: If ``act`` is a dict without a ``'name'`` entry.
+        AttributeError: If the name is in neither spec table nor ``F``.
+    """
+    assert act is None or isinstance(act, (
+        str, dict)), 'name of activation should be str, dict or None'
+    if not act:
+        return identity
+
+    if isinstance(act, dict):
+        # Pop from a copy: removing 'name' from the caller's dict made any
+        # second call with the same config fail with KeyError.
+        kwargs = dict(act)
+        name = kwargs.pop('name')
+    else:
+        name = act
+        kwargs = dict()
+
+    if trt and name in TRT_ACT_SPEC:
+        fn = TRT_ACT_SPEC[name]
+    elif name in ACT_SPEC:
+        fn = ACT_SPEC[name]
+    else:
+        fn = getattr(F, name)
+
+    return lambda x: fn(x, **kwargs)
+
+
+def batch_norm(ch,
+               norm_type='bn',
+               norm_decay=0.,
+               freeze_norm=False,
+               initializer=None,
+               data_format='NCHW'):
+    """Create an ``nn.BatchNorm2D`` layer with optional frozen parameters.
+
+    Args:
+        ch (int): Number of channels passed to ``nn.BatchNorm2D``.
+        norm_type (str): ``'bn'`` or ``'sync_bn'``; both build an
+            ``nn.BatchNorm2D`` here.
+        norm_decay (float): Coefficient of the ``L2Decay`` regularizer set on
+            the weight and bias.
+        freeze_norm (bool): When True the parameters get learning rate 0,
+            ``trainable=False`` and ``stop_gradient=True``.
+        initializer: Initializer for the weight; None keeps the default.
+        data_format (str): Forwarded to ``nn.BatchNorm2D``.
+
+    Returns:
+        nn.BatchNorm2D: The configured layer.
+
+    Raises:
+        ValueError: If ``norm_type`` is not ``'bn'`` or ``'sync_bn'``.
+    """
+    # Reject unknown types up front; previously this fell through and failed
+    # later with an UnboundLocalError on ``norm_layer``.
+    if norm_type not in ['sync_bn', 'bn']:
+        raise ValueError(
+            "norm_type should be 'bn' or 'sync_bn', but got {!r}".format(
+                norm_type))
+
+    norm_lr = 0. if freeze_norm else 1.
+    trainable = not freeze_norm
+    weight_attr = ParamAttr(
+        initializer=initializer,
+        learning_rate=norm_lr,
+        regularizer=L2Decay(norm_decay),
+        trainable=trainable)
+    bias_attr = ParamAttr(
+        learning_rate=norm_lr,
+        regularizer=L2Decay(norm_decay),
+        trainable=trainable)
+
+    norm_layer = nn.BatchNorm2D(
+        ch,
+        weight_attr=weight_attr,
+        bias_attr=bias_attr,
+        data_format=data_format)
+
+    if freeze_norm:
+        for param in norm_layer.parameters():
+            param.stop_gradient = True
+
+    return norm_layer
+
+
+@paddle.jit.not_to_static
+def anchor_generator(input,
+                     anchor_sizes=None,
+                     aspect_ratios=None,
+                     variance=[0.1, 0.1, 0.2, 0.2],
+                     stride=None,
+                     offset=0.5):
+    """
+    **Anchor generator operator**
+    Generate anchors for Faster RCNN algorithm.
+    Each position of the input produce N anchors, N =
+    size(anchor_sizes) * size(aspect_ratios). The order of generated anchors
+    is firstly aspect_ratios loop then anchor_sizes loop.
+    Args:
+       input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map.
+       anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated
+          anchors, given in absolute pixels e.g. [64., 128., 256., 512.].
+          For instance, the anchor size of 64 means the area of this anchor
+          equals to 64**2. A scalar is wrapped into a one-element list.
+          None by default.
+       aspect_ratios(float32|list|tuple, optional): The height / width ratios
+           of generated anchors, e.g. [0.5, 1.0, 2.0]. A scalar is wrapped
+           into a one-element list. None by default.
+       variance(list|tuple, optional): The variances to be used in box
+           regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by
+           default.
+       stride(list|tuple): The anchors stride across width and height.
+           The data type is float32. e.g. [16.0, 16.0]. Must be a list or
+           tuple of length 2; the default None is rejected.
+       offset(float32, optional): Prior boxes center offset. 0.5 by default.
+    Returns:
+        Tuple:
+        Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
+        H is the height of input, W is the width of input,
+        num_anchors is the box count of each position.
+        Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
+
+        Variances(Variable): The expanded variances of anchors
+        with a layout of [H, W, num_anchors, 4].
+        H is the height of input, W is the width of input
+        num_anchors is the box count of each position.
+        Each variance is in (xcenter, ycenter, w, h) format.
+    Raises:
+        ValueError: If stride is not a list or tuple of length 2.
+    Examples:
+        .. code-block:: python
+            import paddle
+            from ppdet.modeling import ops
+            paddle.enable_static()
+            conv1 = paddle.static.data(
+                name='conv1', shape=[None, 48, 16, 16], dtype='float32')
+            anchor, var = ops.anchor_generator(
+                input=conv1,
+                anchor_sizes=[64, 128, 256, 512],
+                aspect_ratios=[0.5, 1.0, 2.0],
+                variance=[0.1, 0.1, 0.2, 0.2],
+                stride=[16.0, 16.0],
+                offset=0.5)
+    """
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(anchor_sizes):
+        anchor_sizes = [anchor_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(stride) and len(stride) == 2):
+        # Adjacent literals are concatenated into ONE message; a comma here
+        # used to hand ValueError two arguments and render as a tuple.
+        raise ValueError('stride should be a list or tuple '
+                         'with length 2, (stride_width, stride_height).')
+
+    anchor_sizes = list(map(float, anchor_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    stride = list(map(float, stride))
+
+    if in_dynamic_mode():
+        # Dynamic graph: attributes are passed as a flat (name, value, ...)
+        # sequence straight to the C++ op.
+        attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios,
+                 'variances', variance, 'stride', stride, 'offset', offset)
+        anchor, var = C_ops.anchor_generator(input, *attrs)
+        return anchor, var
+
+    # Static graph: LayerHelper receives the current locals, so do not
+    # introduce or rename local variables above this line.
+    helper = LayerHelper("anchor_generator", **locals())
+    dtype = helper.input_dtype()
+    attrs = {
+        'anchor_sizes': anchor_sizes,
+        'aspect_ratios': aspect_ratios,
+        'variances': variance,
+        'stride': stride,
+        'offset': offset
+    }
+
+    anchor = helper.create_variable_for_type_inference(dtype)
+    var = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type="anchor_generator",
+        inputs={"Input": input},
+        outputs={"Anchors": anchor,
+                 "Variances": var},
+        attrs=attrs, )
+    anchor.stop_gradient = True
+    var.stop_gradient = True
+    return anchor, var
+
+
+@paddle.jit.not_to_static
+def distribute_fpn_proposals(fpn_rois,
+                             min_level,
+                             max_level,
+                             refer_level,
+                             refer_scale,
+                             pixel_offset=False,
+                             rois_num=None,
+                             name=None):
+    r"""
+
+    **This op only takes LoDTensor as input.** In Feature Pyramid Networks
+    (FPN) models, it is needed to distribute all proposals into different FPN
+    level, with respect to scale of the proposals, the referring scale and the
+    referring level. Besides, to restore the order of proposals, we return an
+    array which indicates the original index of rois in current proposals.
+    To compute FPN level for each roi, the formula is given as follows:
+
+    .. math::
+
+        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
+
+        level = floor(&\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
+
+    where BBoxArea is a function to compute the area of each roi.
+
+    Args:
+
+        fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is
+            float32 or float64. The input fpn_rois.
+        min_level(int32): The lowest level of FPN layer where the proposals come
+            from.
+        max_level(int32): The highest level of FPN layer where the proposals
+            come from.
+        refer_level(int32): The referring level of FPN layer with specified scale.
+        refer_scale(int32): The referring scale of FPN layer with specified level.
+        pixel_offset(bool): Forwarded unchanged to the op as its
+            ``pixel_offset`` attribute. False by default.
+            NOTE(review): presumably controls a +1 pixel offset in the box
+            size computation -- confirm against the op definition.
+        rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image.
+            The shape is [B] and data type is int32. B is the number of images.
+            If it is not None then return a list of 1-D Tensor. Each element
+            is the output RoIs' number of each image on the corresponding level
+            and the shape is [B]. None by default. Must not be None in
+            dynamic graph mode.
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name is no need to set and
+            None by default.
+
+    Returns:
+        Tuple:
+
+        multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4]
+        and data type of float32 and float64. The length is
+        max_level-min_level+1. The proposals in each FPN level.
+
+        restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is
+        the number of total rois. The data type is int32. It is
+        used to restore the order of fpn_rois.
+
+        rois_num_per_level(List): A list of 1-D Tensor and each Tensor is
+        the RoIs' number in each image on the corresponding level. The shape
+        is [B] and data type of int32. B is the number of images.
+        In static graph mode this is None when rois_num is None.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from ppdet.modeling import ops
+            paddle.enable_static()
+            fpn_rois = paddle.static.data(
+                name='data', shape=[None, 4], dtype='float32', lod_level=1)
+            multi_rois, restore_ind, _ = ops.distribute_fpn_proposals(
+                fpn_rois=fpn_rois,
+                min_level=2,
+                max_level=5,
+                refer_level=4,
+                refer_scale=224)
+    """
+    # One output slot per pyramid level, both bounds inclusive.
+    num_lvl = max_level - min_level + 1
+
+    if in_dynamic_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level',
+                 refer_level, 'refer_scale', refer_scale, 'pixel_offset',
+                 pixel_offset)
+        # NOTE(review): num_lvl is passed twice; presumably the lengths of the
+        # two list-valued outputs -- confirm against the C++ op signature.
+        multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals(
+            fpn_rois, rois_num, num_lvl, num_lvl, *attrs)
+
+        return multi_rois, restore_ind, rois_num_per_level
+
+    else:
+        check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'],
+                                 'distribute_fpn_proposals')
+        # LayerHelper receives the current locals (arguments plus num_lvl).
+        helper = LayerHelper('distribute_fpn_proposals', **locals())
+        dtype = helper.input_dtype('fpn_rois')
+        multi_rois = [
+            helper.create_variable_for_type_inference(dtype)
+            for i in range(num_lvl)
+        ]
+
+        restore_ind = helper.create_variable_for_type_inference(dtype='int32')
+
+        inputs = {'FpnRois': fpn_rois}
+        outputs = {
+            'MultiFpnRois': multi_rois,
+            'RestoreIndex': restore_ind,
+        }
+
+        # Per-level RoI counts are only produced when per-image counts are given.
+        if rois_num is not None:
+            inputs['RoisNum'] = rois_num
+            rois_num_per_level = [
+                helper.create_variable_for_type_inference(dtype='int32')
+                for i in range(num_lvl)
+            ]
+            outputs['MultiLevelRoIsNum'] = rois_num_per_level
+        else:
+            rois_num_per_level = None
+
+        helper.append_op(
+            type='distribute_fpn_proposals',
+            inputs=inputs,
+            outputs=outputs,
+            attrs={
+                'min_level': min_level,
+                'max_level': max_level,
+                'refer_level': refer_level,
+                'refer_scale': refer_scale,
+                'pixel_offset': pixel_offset
+            })
+        return multi_rois, restore_ind, rois_num_per_level
+
+
+@paddle.jit.not_to_static
+def prior_box(input,
+              image,
+              min_sizes,
+              max_sizes=None,
+              aspect_ratios=[1.],
+              variance=[0.1, 0.1, 0.2, 0.2],
+              flip=False,
+              clip=False,
+              steps=[0.0, 0.0],
+              offset=0.5,
+              min_max_aspect_ratios_order=False,
+              name=None):
+    """
+
+    This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+    Each position of the input produce N prior boxes, N is determined by
+    the count of min_sizes, max_sizes and aspect_ratios, The size of the
+    box is in range(min_size, max_size) interval, which is generated in
+    sequence according to the aspect_ratios.
+
+    Parameters:
+       input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64.
+       image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp,
+            the data type should be float32 or float64.
+       min_sizes(list|tuple|float): the min sizes of generated prior boxes.
+       max_sizes(list|tuple|float|None): the max sizes of generated prior
+            boxes. It is only forwarded to the op when it is non-empty and
+            its first element is greater than 0. Default: None.
+       aspect_ratios(list|tuple|float): the aspect ratios of generated
+            prior boxes. Default: [1.].
+       variance(list|tuple): the variances to be encoded in prior boxes.
+            Default:[0.1, 0.1, 0.2, 0.2].
+       flip(bool): Whether to flip aspect ratios. Default:False.
+       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
+       steps(list|tuple): Prior boxes step across width and height, If
+            steps[0] equals to 0.0 or steps[1] equals to 0.0, the prior boxes step across
+            height or width of the input will be automatically calculated.
+            Default: [0., 0.]
+       offset(float): Prior boxes center offset. Default: 0.5
+       min_max_aspect_ratios_order(bool): If set True, the output prior box is
+            in order of [min, max, aspect_ratios], which is consistent with
+            Caffe. Please note, this order affects the weights order of
+            convolution layer followed by and does not affect the final
+            detection results. Default: False.
+       name(str, optional): The default value is None.  Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Tuple: A tuple with two Variable (boxes, variances)
+
+        boxes(Tensor): the output prior boxes of PriorBox.
+        4-D tensor, the layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input,
+        num_priors is the total box count of each position of input.
+
+        variances(Tensor): the expanded variances of PriorBox.
+        4-D tensor, the layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input
+        num_priors is the total box count of each position of input
+
+    Raises:
+        ValueError: If steps is not a list or tuple of length 2.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from ppdet.modeling import ops
+
+            paddle.enable_static()
+            input = paddle.static.data(name="input", shape=[None,3,6,9])
+            image = paddle.static.data(name="image", shape=[None,3,9,12])
+            box, var = ops.prior_box(
+                        input=input,
+                        image=image,
+                        min_sizes=[100.],
+                        clip=True,
+                        flip=True)
+    """
+    # LayerHelper receives the raw arguments via locals(); keep this call
+    # ahead of any local definitions or argument normalisation.
+    helper = LayerHelper("prior_box", **locals())
+    dtype = helper.input_dtype()
+    check_variable_and_dtype(
+        input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box')
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(min_sizes):
+        min_sizes = [min_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+        # Adjacent literals form ONE message; a comma here used to hand
+        # ValueError two arguments and render as a tuple.
+        raise ValueError('steps should be a list or tuple '
+                         'with length 2, (step_width, step_height).')
+
+    min_sizes = list(map(float, min_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    steps = list(map(float, steps))
+
+    cur_max_sizes = None
+    if max_sizes is not None:
+        # Wrap a scalar BEFORE calling len() or indexing; the old order
+        # raised TypeError for scalars and made the wrap unreachable.
+        if not _is_list_or_tuple_(max_sizes):
+            max_sizes = [max_sizes]
+        if len(max_sizes) > 0 and max_sizes[0] > 0:
+            cur_max_sizes = max_sizes
+
+    if in_dynamic_mode():
+        # Dynamic graph: attributes go to the C++ op as a flat
+        # (name, value, ...) sequence.
+        attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios,
+                 'variances', variance, 'flip', flip, 'clip', clip, 'step_w',
+                 steps[0], 'step_h', steps[1], 'offset', offset,
+                 'min_max_aspect_ratios_order', min_max_aspect_ratios_order)
+        if cur_max_sizes is not None:
+            attrs += ('max_sizes', cur_max_sizes)
+        box, var = C_ops.prior_box(input, image, *attrs)
+        return box, var
+    else:
+        attrs = {
+            'min_sizes': min_sizes,
+            'aspect_ratios': aspect_ratios,
+            'variances': variance,
+            'flip': flip,
+            'clip': clip,
+            'step_w': steps[0],
+            'step_h': steps[1],
+            'offset': offset,
+            'min_max_aspect_ratios_order': min_max_aspect_ratios_order
+        }
+
+        if cur_max_sizes is not None:
+            attrs['max_sizes'] = cur_max_sizes
+
+        box = helper.create_variable_for_type_inference(dtype)
+        var = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(
+            type="prior_box",
+            inputs={"Input": input,
+                    "Image": image},
+            outputs={"Boxes": box,
+                     "Variances": var},
+            attrs=attrs, )
+        box.stop_gradient = True
+        var.stop_gradient = True
+        return box, var
+
+
+@paddle.jit.not_to_static
+def multiclass_nms(bboxes,
+                   scores,
+                   score_threshold,
+                   nms_top_k,
+                   keep_top_k,
+                   nms_threshold=0.3,
+                   normalized=True,
+                   nms_eta=1.,
+                   background_label=-1,
+                   return_index=False,
+                   return_rois_num=True,
+                   rois_num=None,
+                   name=None):
+    """
+    This operator is to do multi-class non maximum suppression (NMS) on
+    boxes and scores.
+    In the NMS step, this operator greedily selects a subset of detection bounding
+    boxes that have high scores larger than score_threshold, if providing this
+    threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+    is larger than -1. Then this operator prunes away boxes that have high IOU
+    (intersection over union) overlap with already selected boxes by adaptive
+    threshold NMS based on parameters of nms_threshold and nms_eta.
+    After the NMS step, at most keep_top_k number of total bboxes are to be kept
+    per image if keep_top_k is larger than -1.
+    Args:
+        bboxes (Tensor): Two types of bboxes are supported:
+                           1. (Tensor) A 3-D Tensor with shape
+                           [N, M, 4 or 8 16 24 32] represents the
+                           predicted locations of M bounding bboxes,
+                           N is the batch size. Each bounding box has four
+                           coordinate values and the layout is
+                           [xmin, ymin, xmax, ymax], when box size equals to 4.
+                           2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]
+                           M is the number of bounding boxes, C is the
+                           class number
+        scores (Tensor): Two types of scores are supported:
+                           1. (Tensor) A 3-D Tensor with shape [N, C, M]
+                           represents the predicted confidence predictions.
+                           N is the batch size, C is the class number, M is
+                           number of bounding boxes. For each category there
+                           are total M scores which corresponding M bounding
+                           boxes. Please note, M is equal to the 2nd dimension
+                           of BBoxes.
+                           2. (LoDTensor) A 2-D LoDTensor with shape [M, C].
+                           M is the number of bbox, C is the class number.
+                           In this case, input BBoxes should be the second
+                           case with shape [M, C, 4].
+        background_label (int): The index of background label, the background
+                                label will be ignored. If set to -1, then all
+                                categories will be considered. Default: -1
+        score_threshold (float): Threshold to filter out bounding boxes with
+                                 low confidence score. If not provided,
+                                 consider all boxes.
+        nms_top_k (int): Maximum number of detections to be kept according to
+                         the confidences after the filtering detections based
+                         on score_threshold.
+        nms_threshold (float): The threshold to be used in NMS. Default: 0.3
+        nms_eta (float): The threshold to be used in NMS. Default: 1.0
+        keep_top_k (int): Number of total bboxes to be kept per image after NMS
+                          step. -1 means keeping all bboxes after NMS step.
+        normalized (bool): Whether detections are normalized. Default: True
+        return_index(bool): Whether return selected index. Default: False
+        return_rois_num(bool): Whether return the per-image detection count.
+            Default: True
+        rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image.
+            The shape is [B] and data type is int32. B is the number of images.
+            None by default.
+        name(str): Name of the multiclass nms op. Default: None.
+    Returns:
+        A tuple with three elements: (Out, NmsRoisNum, Index).
+        Out: A 2-D LoDTensor with shape [No, 6] represents the detections.
+        Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+        or A 2-D LoDTensor with shape [No, 10] represents the detections.
+        Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3,
+        x4, y4]. No is the total number of detections.
+        If all images have not detected results, all elements in LoD will be
+        0, and output tensor is empty (None).
+        NmsRoisNum: The number of detections kept for each image, or None
+        when return_rois_num is False.
+        Index: None unless return_index is True. A 2-D LoDTensor with
+        shape [No, 1] represents the selected index which type is Integer.
+        The index is the absolute value cross batches. No is the same number
+        as Out. If the index is used to gather other attribute such as age,
+        one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where
+        N is the batch size and M is the number of boxes.
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from ppdet.modeling import ops
+            boxes = paddle.static.data(name='bboxes', shape=[81, 4],
+                                      dtype='float32', lod_level=1)
+            scores = paddle.static.data(name='scores', shape=[81],
+                                      dtype='float32', lod_level=1)
+            out, rois_num, index = ops.multiclass_nms(bboxes=boxes,
+                                            scores=scores,
+                                            background_label=0,
+                                            score_threshold=0.5,
+                                            nms_top_k=400,
+                                            nms_threshold=0.3,
+                                            keep_top_k=200,
+                                            normalized=False,
+                                            return_index=True)
+    """
+    # LayerHelper receives the raw arguments via locals(); keep this call
+    # ahead of any local definitions.
+    helper = LayerHelper('multiclass_nms3', **locals())
+
+    if in_dynamic_mode():
+        attrs = ('background_label', background_label, 'score_threshold',
+                 score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold',
+                 nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta,
+                 'normalized', normalized)
+        output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores,
+                                                            rois_num, *attrs)
+        if not return_index:
+            index = None
+        # Honour return_rois_num here too, matching the static-graph branch
+        # below and matrix_nms; it used to be ignored in dynamic mode.
+        if not return_rois_num:
+            nms_rois_num = None
+        return output, nms_rois_num, index
+
+    else:
+        output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+        index = helper.create_variable_for_type_inference(dtype='int32')
+
+        inputs = {'BBoxes': bboxes, 'Scores': scores}
+        outputs = {'Out': output, 'Index': index}
+
+        if rois_num is not None:
+            inputs['RoisNum'] = rois_num
+
+        if return_rois_num:
+            nms_rois_num = helper.create_variable_for_type_inference(
+                dtype='int32')
+            outputs['NmsRoisNum'] = nms_rois_num
+
+        helper.append_op(
+            type="multiclass_nms3",
+            inputs=inputs,
+            attrs={
+                'background_label': background_label,
+                'score_threshold': score_threshold,
+                'nms_top_k': nms_top_k,
+                'nms_threshold': nms_threshold,
+                'keep_top_k': keep_top_k,
+                'nms_eta': nms_eta,
+                'normalized': normalized
+            },
+            outputs=outputs)
+        output.stop_gradient = True
+        index.stop_gradient = True
+        if not return_index:
+            index = None
+        if not return_rois_num:
+            nms_rois_num = None
+
+        return output, nms_rois_num, index
+
+
+@paddle.jit.not_to_static
+def matrix_nms(bboxes,
+               scores,
+               score_threshold,
+               post_threshold,
+               nms_top_k,
+               keep_top_k,
+               use_gaussian=False,
+               gaussian_sigma=2.,
+               background_label=0,
+               normalized=True,
+               return_index=False,
+               return_rois_num=True,
+               name=None):
+    """
+    **Matrix NMS**
+    This operator does matrix non maximum suppression (NMS).
+    First selects a subset of candidate bounding boxes that have higher scores
+    than score_threshold (if provided), then the top k candidate is selected if
+    nms_top_k is larger than -1. Score of the remaining candidate are then
+    decayed according to the Matrix NMS scheme.
+    After the NMS step, at most keep_top_k number of total bboxes are to be kept
+    per image if keep_top_k is larger than -1.
+    Args:
+        bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
+                           predicted locations of M bounding bboxes,
+                           N is the batch size. Each bounding box has four
+                           coordinate values and the layout is
+                           [xmin, ymin, xmax, ymax], when box size equals to 4.
+                           The data type is float32 or float64.
+        scores (Tensor): A 3-D Tensor with shape [N, C, M]
+                           represents the predicted confidence predictions.
+                           N is the batch size, C is the class number, M is
+                           number of bounding boxes. For each category there
+                           are total M scores which corresponding M bounding
+                           boxes. Please note, M is equal to the 2nd dimension
+                           of BBoxes. The data type is float32 or float64.
+        score_threshold (float): Threshold to filter out bounding boxes with
+                                 low confidence score.
+        post_threshold (float): Threshold to filter out bounding boxes with
+                                low confidence score AFTER decaying.
+        nms_top_k (int): Maximum number of detections to be kept according to
+                         the confidences after the filtering detections based
+                         on score_threshold.
+        keep_top_k (int): Number of total bboxes to be kept per image after NMS
+                          step. -1 means keeping all bboxes after NMS step.
+        use_gaussian (bool): Use Gaussian as the decay function. Default: False
+        gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
+        background_label (int): The index of background label, the background
+                                label will be ignored. If set to -1, then all
+                                categories will be considered. Default: 0
+        normalized (bool): Whether detections are normalized. Default: True
+        return_index(bool): Whether return selected index. Default: False
+        return_rois_num(bool): whether return rois_num. Default: True
+        name(str): Name of the matrix nms op. Default: None.
+    Returns:
+        A tuple with three elements: (Out, RoisNum, Index). RoisNum is None
+        when return_rois_num is False and Index is None when return_index
+        is False.
+        Out (Tensor): A 2-D Tensor with shape [No, 6] containing the
+             detection results.
+             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+             (After version 1.3, when no boxes detected, the lod is changed
+             from {0} to {1})
+        rois_num (Tensor): A 1-D Tensor with shape [N] containing
+            the number of detected boxes in each image.
+        Index (Tensor): A 2-D Tensor with shape [No, 1] containing the
+            selected indices, which are absolute values cross batches.
+    Examples:
+        .. code-block:: python
+            import paddle
+            from ppdet.modeling import ops
+            boxes = paddle.static.data(name='bboxes', shape=[None,81, 4],
+                                      dtype='float32', lod_level=1)
+            scores = paddle.static.data(name='scores', shape=[None,81],
+                                      dtype='float32', lod_level=1)
+            out, rois_num, _ = ops.matrix_nms(
+                bboxes=boxes, scores=scores, background_label=0,
+                score_threshold=0.5, post_threshold=0.1,
+                nms_top_k=400, keep_top_k=200, normalized=False)
+    """
+    check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'],
+                             'matrix_nms')
+    check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'],
+                             'matrix_nms')
+    check_type(score_threshold, 'score_threshold', float, 'matrix_nms')
+    check_type(post_threshold, 'post_threshold', float, 'matrix_nms')
+    # The reported argument name was misspelled as 'nums_top_k'.
+    check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms')
+    check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms')
+    check_type(normalized, 'normalized', bool, 'matrix_nms')
+    check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms')
+    check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms')
+    check_type(background_label, 'background_label', int, 'matrix_nms')
+
+    if in_dynamic_mode():
+        # Dynamic graph: attributes go to the C++ op as a flat
+        # (name, value, ...) sequence.
+        attrs = ('background_label', background_label, 'score_threshold',
+                 score_threshold, 'post_threshold', post_threshold, 'nms_top_k',
+                 nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian',
+                 use_gaussian, 'keep_top_k', keep_top_k, 'normalized',
+                 normalized)
+        out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs)
+        if not return_index:
+            index = None
+        if not return_rois_num:
+            rois_num = None
+        return out, rois_num, index
+    else:
+        # LayerHelper receives the current locals; keep this call ahead of
+        # any new local definitions in this branch.
+        helper = LayerHelper('matrix_nms', **locals())
+        output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+        index = helper.create_variable_for_type_inference(dtype='int32')
+        outputs = {'Out': output, 'Index': index}
+        if return_rois_num:
+            rois_num = helper.create_variable_for_type_inference(dtype='int32')
+            outputs['RoisNum'] = rois_num
+
+        helper.append_op(
+            type="matrix_nms",
+            inputs={'BBoxes': bboxes,
+                    'Scores': scores},
+            attrs={
+                'background_label': background_label,
+                'score_threshold': score_threshold,
+                'post_threshold': post_threshold,
+                'nms_top_k': nms_top_k,
+                'gaussian_sigma': gaussian_sigma,
+                'use_gaussian': use_gaussian,
+                'keep_top_k': keep_top_k,
+                'normalized': normalized
+            },
+            outputs=outputs)
+        output.stop_gradient = True
+
+        if not return_index:
+            index = None
+        if not return_rois_num:
+            rois_num = None
+        return output, rois_num, index
+
+
+@paddle.jit.not_to_static
+def box_coder(prior_box,
+              prior_box_var,
+              target_box,
+              code_type="encode_center_size",
+              box_normalized=True,
+              axis=0,
+              name=None):
+    r"""
+    **Box Coder Layer**
+    Encode/Decode the target bounding box with the priorbox information.
+
+    The Encoding schema described below:
+    .. math::
+        ox = (tx - px) / pw / pxv
+        oy = (ty - py) / ph / pyv
+        ow = \log(\lvert tw / pw \rvert) / pwv
+        oh = \log(\lvert th / ph \rvert) / phv
+    The Decoding schema described below:
+
+    .. math::
+
+        ox = (pw * pxv * tx + px) - tw / 2
+        oy = (ph * pyv * ty + py) - th / 2
+        ow = \exp(pwv * tw) * pw + tw / 2
+        oh = \exp(phv * th) * ph + th / 2
+    where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates,
+    width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote
+    the priorbox's (anchor) center coordinates, width and height. `pxv`,
+    `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`,
+    `ow`, `oh` denote the encoded/decoded coordinates, width and height.
+    During Box Decoding, two modes for broadcast are supported. Say target
+    box has shape [N, M, 4], and the shape of prior box can be [N, 4] or
+    [M, 4]. Then prior box will broadcast to target box along the
+    assigned axis.
+
+    Args:
+        prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape
+            [M, 4] holds M boxes and data type is float32 or float64. Each box
+            is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the
+            left top coordinate of the anchor box, if the input is image feature
+            map, they are close to the origin of the coordinate system.
+            [xmax, ymax] is the right bottom coordinate of the anchor box.
+        prior_box_var(List|Tensor): Variance of the prior boxes. Either a
+            Tensor with shape [M, 4] (one group per box, float32 or float64)
+            or a list of 4 elements shared by all boxes. Any other value,
+            including None, raises TypeError in this implementation.
+            A list is passed to the op as its `variance` attribute.
+        target_box(Tensor): This input can be a 2-D LoDTensor with shape
+            [N, 4] when code_type is 'encode_center_size'. This input also can
+            be a 3-D Tensor with shape [N, M, 4] when code_type is
+            'decode_center_size'. Each box is represented as
+            [xmin, ymin, xmax, ymax]. The data type is float32 or float64.
+        code_type(str): The code type used with the target box. It can be
+            `encode_center_size` or `decode_center_size`. `encode_center_size`
+            by default.
+        box_normalized(bool): Whether treat the priorbox as a normalized box.
+            Set true by default.
+        axis(int): Which axis in PriorBox to broadcast for box decode,
+            for example, if axis is 0 and TargetBox has shape [N, M, 4] and
+            PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4]
+            for decoding. It is only valid when code type is
+            `decode_center_size`. Set 0 by default.
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name is no need to set and
+            None by default.
+
+    Returns:
+        Tensor:
+        output_box(Tensor): When code_type is 'encode_center_size', the
+        output tensor of box_coder_op with shape [N, M, 4] representing the
+        result of N target boxes encoded with M Prior boxes and variances.
+        When code_type is 'decode_center_size', N represents the batch size
+        and M represents the number of decoded boxes.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from ppdet.modeling import ops
+            paddle.enable_static()
+            # For encode
+            prior_box_encode = paddle.static.data(name='prior_box_encode',
+                                  shape=[512, 4],
+                                  dtype='float32')
+            target_box_encode = paddle.static.data(name='target_box_encode',
+                                   shape=[81, 4],
+                                   dtype='float32')
+            output_encode = ops.box_coder(prior_box=prior_box_encode,
+                                    prior_box_var=[0.1,0.1,0.2,0.2],
+                                    target_box=target_box_encode,
+                                    code_type="encode_center_size")
+            # For decode
+            prior_box_decode = paddle.static.data(name='prior_box_decode',
+                                  shape=[512, 4],
+                                  dtype='float32')
+            target_box_decode = paddle.static.data(name='target_box_decode',
+                                   shape=[512, 81, 4],
+                                   dtype='float32')
+            output_decode = ops.box_coder(prior_box=prior_box_decode,
+                                    prior_box_var=[0.1,0.1,0.2,0.2],
+                                    target_box=target_box_decode,
+                                    code_type="decode_center_size",
+                                    box_normalized=False,
+                                    axis=1)
+    """
+    check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'],
+                             'box_coder')
+    check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'],
+                             'box_coder')
+    # Dynamic mode calls the op directly; otherwise it is appended via LayerHelper.
+    if in_dynamic_mode():
+        if isinstance(prior_box_var, Variable):
+            output_box = C_ops.box_coder(
+                prior_box, prior_box_var, target_box, "code_type", code_type,
+                "box_normalized", box_normalized, "axis", axis)
+
+        elif isinstance(prior_box_var, list):  # list -> "variance" attribute
+            output_box = C_ops.box_coder(
+                prior_box, None, target_box, "code_type", code_type,
+                "box_normalized", box_normalized, "axis", axis, "variance",
+                prior_box_var)
+        else:
+            raise TypeError(
+                "Input variance of box_coder must be Variable or list")
+        return output_box
+    else:
+        helper = LayerHelper("box_coder", **locals())
+        # NOTE: locals() above forwards every argument; keep it the first statement.
+        output_box = helper.create_variable_for_type_inference(
+            dtype=prior_box.dtype)
+
+        inputs = {"PriorBox": prior_box, "TargetBox": target_box}
+        attrs = {
+            "code_type": code_type,
+            "box_normalized": box_normalized,
+            "axis": axis
+        }
+        if isinstance(prior_box_var, Variable):  # tensor -> extra op input
+            inputs['PriorBoxVar'] = prior_box_var
+        elif isinstance(prior_box_var, list):  # list -> op attribute
+            attrs['variance'] = prior_box_var
+        else:
+            raise TypeError(
+                "Input variance of box_coder must be Variable or list")
+        helper.append_op(
+            type="box_coder",
+            inputs=inputs,
+            attrs=attrs,
+            outputs={"OutputBox": output_box})
+        return output_box
+
+
+@paddle.jit.not_to_static
+def generate_proposals(scores,
+                       bbox_deltas,
+                       im_shape,
+                       anchors,
+                       variances,
+                       pre_nms_top_n=6000,
+                       post_nms_top_n=1000,
+                       nms_thresh=0.5,
+                       min_size=0.1,
+                       eta=1.0,
+                       pixel_offset=False,
+                       return_rois_num=False,
+                       name=None):
+    """
+    **Generate proposal Faster-RCNN**
+    This operation proposes RoIs according to each box with their
+    probability to be a foreground object, where the box can be calculated
+    from anchors. bbox_deltas and scores are the output of RPN. Final
+    proposals could be used to train detection net.
+    For generating proposals, this operation performs following steps:
+    1. Transposes and resizes scores and bbox_deltas in size of
+       (H*W*A, 1) and (H*W*A, 4)
+    2. Calculate box locations as proposals candidates.
+    3. Clip boxes to image
+    4. Remove predicted boxes with small area.
+    5. Apply NMS to get final proposals as output.
+    Args:
+        scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents
+            the probability for each box to be an object.
+            N is batch size, A is number of anchors, H and W are height and
+            width of the feature map. The data type must be float32.
+        bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W]
+            represents the difference between predicted box location and
+            anchor location. The data type must be float32.
+        im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the
+            origin image size or input size. The data type can be float32 or
+            float64.
+        anchors(Tensor):   A 4-D Tensor represents the anchors with a layout
+            of [H, W, A, 4]. H and W are height and width of the feature map,
+            num_anchors is the box count of each position. Each anchor is
+            in (xmin, ymin, xmax, ymax) format and unnormalized. The data type must be float32.
+        variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of
+            [H, W, num_priors, 4]. Each variance is in
+            (xcenter, ycenter, w, h) format. The data type must be float32.
+        pre_nms_top_n(int): Number of total bboxes to be kept per
+            image before NMS. `6000` by default.
+        post_nms_top_n(int): Number of total bboxes to be kept per
+            image after NMS. `1000` by default.
+        nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default.
+        min_size(float): Remove predicted boxes with either height or
+            width < min_size. The data type must be float32. `0.1` by default.
+        eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`,
+            `adaptive_threshold = adaptive_threshold * eta` in each iteration.
+        pixel_offset(bool): Forwarded unchanged to the op as its
+            `pixel_offset` attribute. `False` by default.
+        return_rois_num(bool): When True, also return a 1-D Tensor with shape [N, ]
+            holding the number of RoIs of each image in the batch, e.g. [4, 5] means
+            the first image has 4 RoIs and the second has 5. Must be True in dygraph
+            mode (asserted); otherwise None is returned in its place. 'False' by default.
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually no need to set. None by default.
+
+    Returns:
+        tuple: A tuple with format ``(rpn_rois, rpn_roi_probs, rpn_rois_num)``.
+        - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.
+        - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.
+        - **rpn_rois_num**: int32 Tensor with the RoI count per image, or None when ``return_rois_num`` is False.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from ppdet.modeling import ops
+            paddle.enable_static()
+            scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32')
+            bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32')
+            im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32')
+            anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32')
+            variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32')
+            rois, roi_probs, _ = ops.generate_proposals(scores, bbox_deltas,
+                         im_shape, anchors, variances)
+    """
+    if in_dynamic_mode():
+        assert return_rois_num, "return_rois_num should be True in dygraph mode."
+        attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n,
+                 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta,
+                 'pixel_offset', pixel_offset)
+        rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2(
+            scores, bbox_deltas, im_shape, anchors, variances, *attrs)
+        if not return_rois_num:  # only reachable when asserts are disabled (-O)
+            rpn_rois_num = None
+        return rpn_rois, rpn_roi_probs, rpn_rois_num
+
+    else:
+        helper = LayerHelper('generate_proposals_v2', **locals())
+        # Input dtype validation is only performed on this static-graph path.
+        check_variable_and_dtype(scores, 'scores', ['float32'],
+                                 'generate_proposals_v2')
+        check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'],
+                                 'generate_proposals_v2')
+        check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'],
+                                 'generate_proposals_v2')
+        check_variable_and_dtype(anchors, 'anchors', ['float32'],
+                                 'generate_proposals_v2')
+        check_variable_and_dtype(variances, 'variances', ['float32'],
+                                 'generate_proposals_v2')
+
+        rpn_rois = helper.create_variable_for_type_inference(
+            dtype=bbox_deltas.dtype)
+        rpn_roi_probs = helper.create_variable_for_type_inference(
+            dtype=scores.dtype)
+        outputs = {
+            'RpnRois': rpn_rois,
+            'RpnRoiProbs': rpn_roi_probs,
+        }
+        if return_rois_num:  # the RpnRoisNum output is only wired up on request
+            rpn_rois_num = helper.create_variable_for_type_inference(
+                dtype='int32')
+            rpn_rois_num.stop_gradient = True
+            outputs['RpnRoisNum'] = rpn_rois_num
+
+        helper.append_op(
+            type="generate_proposals_v2",
+            inputs={
+                'Scores': scores,
+                'BboxDeltas': bbox_deltas,
+                'ImShape': im_shape,
+                'Anchors': anchors,
+                'Variances': variances
+            },
+            attrs={
+                'pre_nms_topN': pre_nms_top_n,
+                'post_nms_topN': post_nms_top_n,
+                'nms_thresh': nms_thresh,
+                'min_size': min_size,
+                'eta': eta,
+                'pixel_offset': pixel_offset
+            },
+            outputs=outputs)
+        rpn_rois.stop_gradient = True
+        rpn_roi_probs.stop_gradient = True
+        if not return_rois_num:  # name was never bound above in this case
+            rpn_rois_num = None
+
+        return rpn_rois, rpn_roi_probs, rpn_rois_num
+
+
+def sigmoid_cross_entropy_with_logits(input, label, ignore_index=-100,
+                                      normalize=False):
+    """Element-wise sigmoid cross entropy with `label == ignore_index` entries
+    zeroed; `normalize=True` divides by the number of non-ignored entries."""
+    output = F.binary_cross_entropy_with_logits(input, label, reduction='none')
+    mask_tensor = paddle.cast(label != ignore_index, 'float32')
+    output = paddle.multiply(output, mask_tensor)
+    if normalize:
+        # Clamp the valid count so an all-ignored input yields 0, not NaN (0/0).
+        output = output / paddle.clip(paddle.sum(mask_tensor), min=1.0)
+    return output
+
+
+def smooth_l1(input, label, inside_weight=None, outside_weight=None,
+              sigma=None):
+    """Per-sample smooth L1 loss; a None weight is skipped, sigma=None means 1."""
+    if inside_weight is not None:
+        input = paddle.multiply(input, inside_weight)
+        label = paddle.multiply(label, inside_weight)
+    delta = 1.0 if sigma is None else 1 / (sigma * sigma)
+    out = F.smooth_l1_loss(input, label, reduction='none', delta=delta)
+    if outside_weight is not None:
+        out = paddle.multiply(out, outside_weight)
+    return paddle.sum(paddle.reshape(out / delta, [out.shape[0], -1]), axis=1)
+
+
+def channel_shuffle(x, groups):
+    """Shuffle channels of a 4-D `x` across `groups` (reshape, transpose, reshape)."""
+    n, c, h, w = x.shape[0:4]
+    assert c % groups == 0, 'num_channels should be divisible by groups'
+    # Split the channel axis into [groups, c // groups] and swap those two axes.
+    grouped = paddle.reshape(x=x, shape=[n, groups, c // groups, h, w])
+    swapped = paddle.transpose(x=grouped, perm=[0, 2, 1, 3, 4])
+    # Merge the two axes back into a single channel axis of the original size.
+    return paddle.reshape(x=swapped, shape=[n, c, h, w])
+
+
+def get_static_shape(tensor):
+    dims = paddle.shape(tensor)  # shape obtained through the paddle.shape op
+    dims.stop_gradient = True  # mark the shape tensor as not needing gradients
+    return dims
diff --git a/rtdetr_paddle/ppdet/modeling/post_process.py b/rtdetr_paddle/ppdet/modeling/post_process.py
new file mode 100644
index 0000000..795bb5c
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/post_process.py
@@ -0,0 +1,244 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from .transformers import bbox_cxcywh_to_xyxy
+
+__all__ = [
+    'DETRPostProcess',
+]
+
+@register
+class DETRPostProcess(object):
+    """Decode DETR-style head outputs into `[label, score, bbox]` rows,
+    keeping at most `num_top_queries` predictions per image."""
+    __shared__ = ['num_classes', 'use_focal_loss', 'with_mask']
+    __inject__ = []
+
+    def __init__(self, num_classes=80, num_top_queries=100,
+                 dual_queries=False, dual_groups=0, use_focal_loss=False,
+                 with_mask=False, mask_threshold=0.5,
+                 use_avg_mask_score=False, bbox_decode_type='origin'):
+        super(DETRPostProcess, self).__init__()
+        assert bbox_decode_type in ['origin', 'pad']
+
+        self.num_classes = num_classes
+        self.num_top_queries = num_top_queries
+        self.dual_queries = dual_queries
+        self.dual_groups = dual_groups
+        self.use_focal_loss = use_focal_loss
+        self.with_mask = with_mask
+        self.mask_threshold = mask_threshold
+        self.use_avg_mask_score = use_avg_mask_score
+        self.bbox_decode_type = bbox_decode_type
+
+    def _mask_postprocess(self, mask_pred, score_pred, index):
+        """Binarize the selected masks; `index=None` means keep every query."""
+        if index is not None:
+            mask_pred = paddle.gather_nd(mask_pred, index)
+        mask_score = F.sigmoid(mask_pred)
+        mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype)
+        if self.use_avg_mask_score:
+            avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / (
+                mask_pred.sum([-2, -1]) + 1e-6)
+            score_pred *= avg_mask_score
+
+        return mask_pred[0].astype('int32'), score_pred
+
+    def __call__(self, head_out, im_shape, scale_factor, pad_shape):
+        """
+        Decode the bbox and mask.
+
+        Args:
+            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
+            im_shape (Tensor): The shape of the input image without padding.
+            scale_factor (Tensor): The scale factor of the input image.
+            pad_shape (Tensor): The shape of the input image with padding.
+        Returns:
+            bbox_pred (Tensor): The output prediction with shape [N, 6], including
+                labels, scores and bboxes. The size of bboxes are corresponding
+                to the input image, the bboxes may be used in other branch.
+            bbox_num (Tensor): The number of prediction boxes of each image,
+                with shape [bs].
+            mask_pred (Tensor|None): Binary int32 masks of the first image when
+                `with_mask` is True, otherwise None.
+        """
+        bboxes, logits, masks = head_out
+        if self.dual_queries:
+            num_main = int(logits.shape[1] // (self.dual_groups + 1))
+            logits, bboxes = logits[:, :num_main, :], bboxes[:, :num_main, :]
+
+        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
+        # calculate the original shape of the image
+        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
+        img_h, img_w = paddle.split(origin_shape, 2, axis=-1)
+        if self.bbox_decode_type == 'pad':
+            # calculate the shape of the image with padding
+            out_shape = pad_shape / im_shape * origin_shape
+            out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1)
+        elif self.bbox_decode_type == 'origin':
+            out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1)
+        else:
+            raise Exception(
+                f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.')
+        bbox_pred *= out_shape
+
+        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
+            logits)[:, :, :-1]
+
+        # Boxes emitted per image; only the no-top-k branch below changes it.
+        num_keep = self.num_top_queries
+        if not self.use_focal_loss:
+            scores, labels = scores.max(-1), scores.argmax(-1)
+            if scores.shape[1] > self.num_top_queries:
+                scores, index = paddle.topk(
+                    scores, self.num_top_queries, axis=-1)
+                batch_ind = paddle.arange(
+                    end=scores.shape[0]).unsqueeze(-1).tile(
+                        [1, self.num_top_queries])
+                index = paddle.stack([batch_ind, index], axis=-1)
+                labels = paddle.gather_nd(labels, index)
+                bbox_pred = paddle.gather_nd(bbox_pred, index)
+            else:
+                # No top-k selection: every query is kept in its original order.
+                num_keep, index = scores.shape[1], None
+        else:
+            scores, index = paddle.topk(
+                scores.flatten(1), self.num_top_queries, axis=-1)
+            labels = index % self.num_classes
+            index = index // self.num_classes
+            batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(
+                [1, self.num_top_queries])
+            index = paddle.stack([batch_ind, index], axis=-1)
+            bbox_pred = paddle.gather_nd(bbox_pred, index)
+
+        mask_pred = None
+        if self.with_mask:
+            assert masks is not None
+            masks = F.interpolate(
+                masks, scale_factor=4, mode="bilinear", align_corners=False)
+            # TODO: Support prediction with bs>1.
+            # remove padding for input image
+            h, w = im_shape.astype('int32')[0]
+            masks = masks[..., :h, :w]
+            # get pred_mask in the original resolution.
+            img_h = img_h[0].astype('int32')
+            img_w = img_w[0].astype('int32')
+            masks = F.interpolate(
+                masks, size=(img_h, img_w), mode="bilinear",
+                align_corners=False)
+            mask_pred, scores = self._mask_postprocess(masks, scores, index)
+
+        box_parts = [labels.unsqueeze(-1).astype('float32'),
+                     scores.unsqueeze(-1), bbox_pred]
+        bbox_pred = paddle.concat(box_parts, axis=-1)
+        bbox_num = paddle.to_tensor(
+            num_keep, dtype='int32').tile([bbox_pred.shape[0]])
+        bbox_pred = bbox_pred.reshape([-1, 6])
+        return bbox_pred, bbox_num, mask_pred
+
+
+
+def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False):
+    """
+    Paste the mask prediction to the original image.
+
+    Each mask is resampled with `F.grid_sample` onto an `im_h` x `im_w` grid.
+    NOTE: `assign_on_cpu=True` calls `paddle.set_device('cpu')`, which is not
+    undone here.
+    """
+    left, top, right, bottom = paddle.split(boxes, 4, axis=1)
+    num_masks = masks.shape[0]
+    # Pixel centres, rescaled so each box spans [-1, 1] along both axes.
+    ys = (paddle.arange(0, im_h) + 0.5 - top) / (bottom - top) * 2 - 1
+    xs = (paddle.arange(0, im_w) + 0.5 - left) / (right - left) * 2 - 1
+    # xs, ys have shapes (N, w), (N, h)
+
+    if assign_on_cpu:
+        paddle.set_device('cpu')
+    grid_shape = [num_masks, paddle.shape(ys)[1], paddle.shape(xs)[1]]
+    gx = xs[:, None, :].expand(grid_shape)
+    gy = ys[:, :, None].expand(grid_shape)
+    # Stack as (x, y) along a new last axis to form the sampling grid.
+    grid = paddle.stack([gx, gy], axis=3)
+    sampled = F.grid_sample(masks, grid, align_corners=False)
+    return sampled[:, 0]
+
+
+def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):
+    """Apply `nms` separately to each class id found in column 0 of `bboxs`."""
+    per_class = ((c, bboxs[bboxs[:, 0] == c, 1:]) for c in range(num_classes))
+    # Classes without any row are skipped, so `nms` never sees empty input.
+    kept = ((c, nms(d, match_threshold, match_metric))
+            for c, d in per_class if d.shape[0] > 0)
+    # Re-attach the class id as column 0 of each surviving row.
+    return [np.concatenate([np.full((r.shape[0], 1), c), r], 1) for c, r in kept]
+
+
+def nms(dets, match_threshold=0.6, match_metric='iou'):
+    """ Apply NMS to avoid detecting too many overlapping bounding boxes.
+
+        Boxes are visited from highest to lowest score; each box that is
+        still alive suppresses every lower-scored box whose overlap with it
+        reaches `match_threshold`. Widths and heights use the inclusive
+        `x2 - x1 + 1` pixel convention.
+
+        Args:
+            dets: numpy array of shape [N, 5], [score, x1, y1, x2, y2]
+            match_threshold: overlap thresh for match metric.
+            match_metric: 'iou' (intersection over union) or 'ios'
+                (intersection over the smaller of the two areas).
+        Returns:
+            The kept rows of `dets`, in their original row order.
+        Raises:
+            ValueError: if `match_metric` is neither 'iou' nor 'ios'.
+    """
+    # Validate once up front instead of deep inside the pairwise loop, so a
+    # bad metric is reported even when fewer than two boxes are compared.
+    if match_metric not in ('iou', 'ios'):
+        raise ValueError(
+            f"match_metric must be 'iou' or 'ios', got {match_metric!r}")
+    if dets.shape[0] == 0:
+        return dets[[], :]
+    scores = dets[:, 0]
+    x1 = dets[:, 1]
+    y1 = dets[:, 2]
+    x2 = dets[:, 3]
+    y2 = dets[:, 4]
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    ndets = dets.shape[0]
+    suppressed = np.zeros((ndets), dtype=np.int32)
+
+    for rank in range(ndets):
+        i = order[rank]
+        if suppressed[i] == 1:
+            continue
+        # Compare box i against all lower-scored boxes in one vectorized step.
+        rest = order[rank + 1:]
+        xx1 = np.maximum(x1[i], x1[rest])
+        yy1 = np.maximum(y1[i], y1[rest])
+        xx2 = np.minimum(x2[i], x2[rest])
+        yy2 = np.minimum(y2[i], y2[rest])
+        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
+        if match_metric == 'iou':
+            denom = areas[i] + areas[rest] - inter
+        else:
+            denom = np.minimum(areas[i], areas[rest])
+        suppressed[rest[inter / denom >= match_threshold]] = 1
+    return dets[np.where(suppressed == 0)[0], :]
diff --git a/rtdetr_paddle/ppdet/modeling/shape_spec.py b/rtdetr_paddle/ppdet/modeling/shape_spec.py
new file mode 100644
index 0000000..81601fd
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/shape_spec.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+# The code is based on:
+# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py
+
+from collections import namedtuple
+
+
+class ShapeSpec(namedtuple("_ShapeSpec", "channels height width stride")):
+    """Named tuple of (channels, height, width, stride); each defaults to None."""
+    def __new__(cls, channels=None, height=None, width=None, stride=None):
+        values = (channels, height, width, stride)
+        return super(ShapeSpec, cls).__new__(cls, *values)
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/__init__.py b/rtdetr_paddle/ppdet/modeling/transformers/__init__.py
new file mode 100644
index 0000000..47f09bf
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import *
+from .matchers import *
+from .position_encoding import *
+from .rtdetr_transformer import *
+from .dino_transformer import *
+from .hybrid_encoder import *
\ No newline at end of file
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py b/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py
new file mode 100644
index 0000000..ab05704
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py
@@ -0,0 +1,537 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .position_encoding import PositionEmbedding
+from .utils import _get_clones, get_valid_ratio
+from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
+
+__all__ = ['DeformableTransformer']
+
+
+class MSDeformableAttention(nn.Layer):
+    def __init__(self,
+                 embed_dim=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=4,
+                 lr_mult=0.1):
+        """Multi-Scale Deformable Attention Module.
+        ``lr_mult`` is the learning-rate multiplier of the sampling-offset
+        projection only; ``embed_dim`` must be divisible by ``num_heads``."""
+        super(MSDeformableAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_levels = num_levels
+        self.num_points = num_points
+        self.total_points = num_heads * num_levels * num_points  # sampling points per query
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.sampling_offsets = nn.Linear(
+            embed_dim,
+            self.total_points * 2,  # two offset components per sampling point
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+
+        self.attention_weights = nn.Linear(embed_dim, self.total_points)
+        self.value_proj = nn.Linear(embed_dim, embed_dim)
+        self.output_proj = nn.Linear(embed_dim, embed_dim)
+        try:
+            # use cuda op (optional compiled extension)
+            from deformable_detr_ops import ms_deformable_attn
+        except Exception:
+            # any import/load failure: use paddle func (Ctrl-C / SystemExit still propagate)
+            from .utils import deformable_attention_core_func as ms_deformable_attn
+        self.ms_deformable_attn_core = ms_deformable_attn
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        # sampling_offsets: constant weight (presumably zeros - confirm constant_ default), bias from a direction grid
+        constant_(self.sampling_offsets.weight)
+        thetas = paddle.arange(
+            self.num_heads,
+            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)  # one evenly spaced angle per head
+        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)  # (cos, sin) pair per head
+        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)  # larger |component| becomes 1
+        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(
+            [1, self.num_levels, self.num_points, 1])  # repeat per level and per point
+        scaling = paddle.arange(
+            1, self.num_points + 1,
+            dtype=paddle.float32).reshape([1, 1, -1, 1])  # 1..num_points along the point axis
+        grid_init *= scaling  # k-th point is offset k times as far as the first
+        self.sampling_offsets.bias.set_value(grid_init.flatten())
+        # attention_weights: weight and bias both via constant_
+        constant_(self.attention_weights.weight)
+        constant_(self.attention_weights.bias)
+        # value/output projections: xavier_uniform_ weights, constant_ biases
+        xavier_uniform_(self.value_proj.weight)
+        constant_(self.value_proj.bias)
+        xavier_uniform_(self.output_proj.weight)
+        constant_(self.output_proj.bias)
+
+    def forward(self,
+                query,
+                reference_points,
+                value,
+                value_spatial_shapes,
+                value_level_start_index,
+                value_mask=None):
+        """
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2 or 4], range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area; with 4 values the last two scale the offsets
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
+            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+        assert int(value_spatial_shapes.prod(1).sum()) == Len_v  # sum of H_l * W_l must equal value_length
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
+            value *= value_mask  # zero the projected values where the mask is 0/False
+        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])
+
+        sampling_offsets = self.sampling_offsets(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
+        attention_weights = self.attention_weights(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
+        attention_weights = F.softmax(attention_weights).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])  # softmax taken before levels/points are split
+
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
+                [1, 1, 1, self.num_levels, 1, 2])  # (H, W) flipped to (W, H) per level
+            sampling_locations = reference_points.reshape([
+                bs, Len_q, 1, self.num_levels, 1, 2
+            ]) + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2] + sampling_offsets /
+                self.num_points * reference_points[:, :, None, :, None, 2:] *
+                0.5)  # presumably (cx, cy, w, h) boxes: offsets scaled by half the box size - confirm
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".
+                format(reference_points.shape[-1]))
+
+        output = self.ms_deformable_attn_core(
+            value, value_spatial_shapes, value_level_start_index,
+            sampling_locations, attention_weights)  # CUDA op or Paddle fallback chosen in __init__
+        output = self.output_proj(output)
+
+        return output
+
+
+class DeformableTransformerEncoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.1,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 lr_mult=0.1,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(DeformableTransformerEncoderLayer, self).__init__()
+        # self attention (multi-scale deformable); lr_mult is forwarded to it
+        self.self_attn = MSDeformableAttention(d_model, n_head, n_levels,
+                                               n_points, lr_mult)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)  # weight_attr/bias_attr only reach the LayerNorms
+        # ffn: linear1 -> activation -> dropout2 -> linear2
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.activation = getattr(F, activation)  # activation is looked up by name in paddle.nn.functional
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+        xavier_uniform_(self.linear1.weight)  # runs after linear_init_, so it re-initialises the weight
+        xavier_uniform_(self.linear2.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        return tensor if pos is None else tensor + pos  # add the positional term only when one is given
+
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)  # residual connection, then post-norm
+        src = self.norm2(src)
+        return src
+
+    def forward(self,
+                src,
+                reference_points,
+                spatial_shapes,
+                level_start_index,
+                src_mask=None,
+                query_pos_embed=None):
+        # self attention: query is src (+ pos embed), value is src without pos embed
+        src2 = self.self_attn(
+            self.with_pos_embed(src, query_pos_embed), reference_points, src,
+            spatial_shapes, level_start_index, src_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        # ffn (residual + norm2 happen inside forward_ffn)
+        src = self.forward_ffn(src)
+
+        return src
+
+
+class DeformableTransformerEncoder(nn.Layer):
+    """Stack of cloned deformable encoder layers over flattened features."""
+
+    def __init__(self, encoder_layer, num_layers):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
+        """Per-level grid points (index + offset over H/W), scaled by ratios."""
+        ratios = valid_ratios.unsqueeze(1)
+        per_level = []
+        for lvl, (H, W) in enumerate(spatial_shapes):
+            rows = paddle.arange(end=H) + offset
+            cols = paddle.arange(end=W) + offset
+            grid_y, grid_x = paddle.meshgrid(rows, cols)
+            norm_y = grid_y.flatten().unsqueeze(0) / (ratios[:, :, lvl, 1] * H)
+            norm_x = grid_x.flatten().unsqueeze(0) / (ratios[:, :, lvl, 0] * W)
+            per_level.append(paddle.stack((norm_x, norm_y), axis=-1))
+        points = paddle.concat(per_level, 1).unsqueeze(2)
+        return points * ratios
+
+    def forward(self,
+                feat,
+                spatial_shapes,
+                level_start_index,
+                feat_mask=None,
+                query_pos_embed=None,
+                valid_ratios=None):
+        """Run every layer in order on ``feat`` and return the result."""
+        if valid_ratios is None:
+            num_levels = spatial_shapes.shape[0]
+            valid_ratios = paddle.ones([feat.shape[0], num_levels, 2])
+        anchors = self.get_reference_points(spatial_shapes, valid_ratios)
+        for block in self.layers:
+            feat = block(feat, anchors, spatial_shapes, level_start_index,
+                         feat_mask, query_pos_embed)
+        return feat
+
+
+class DeformableTransformerDecoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.1,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 lr_mult=0.1,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(DeformableTransformerDecoderLayer, self).__init__()
+
+        # self attention among queries (regular multi-head attention)
+        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)  # weight_attr/bias_attr only reach the LayerNorms
+
+        # cross attention from queries to memory (multi-scale deformable)
+        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
+                                                n_points, lr_mult)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # ffn: linear1 -> activation -> dropout3 -> linear2
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.activation = getattr(F, activation)  # activation is looked up by name in paddle.nn.functional
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+        xavier_uniform_(self.linear1.weight)  # runs after linear_init_, so it re-initialises the weight
+        xavier_uniform_(self.linear2.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        return tensor if pos is None else tensor + pos  # add the positional term only when one is given
+
+    def forward_ffn(self, tgt):
+        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout4(tgt2)  # residual connection, then post-norm
+        tgt = self.norm3(tgt)
+        return tgt
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                memory_mask=None,
+                query_pos_embed=None):
+        # self attention: q and k carry the query pos embed, value is plain tgt
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        tgt2 = self.self_attn(q, k, value=tgt)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # cross attention: queries (+ pos embed) sample memory around reference_points
+        tgt2 = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
+            memory_spatial_shapes, memory_level_start_index, memory_mask)
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+
+        # ffn (residual + norm3 happen inside forward_ffn)
+        tgt = self.forward_ffn(tgt)
+
+        return tgt
+
+
+class DeformableTransformerDecoder(nn.Layer):
+    """Runs cloned decoder layers in sequence over the query embeddings."""
+
+    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.return_intermediate = return_intermediate
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                memory_mask=None,
+                query_pos_embed=None):
+        """Return stacked per-layer outputs, or the last one with a new axis 0."""
+        hidden = tgt
+        collected = []
+        for block in self.layers:
+            hidden = block(hidden, reference_points, memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           memory_mask, query_pos_embed)
+            if self.return_intermediate:
+                collected.append(hidden)
+        if not self.return_intermediate:
+            return hidden.unsqueeze(0)
+        return paddle.stack(collected)
+
+
+@register
+class DeformableTransformer(nn.Layer):
+    __shared__ = ['hidden_dim']
+
+    def __init__(self,
+                 num_queries=300,
+                 position_embed_type='sine',
+                 return_intermediate_dec=True,
+                 in_feats_channel=[512, 1024, 2048],
+                 num_feature_levels=4,
+                 num_encoder_points=4,
+                 num_decoder_points=4,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.1,
+                 activation="relu",
+                 lr_mult=0.1,
+                 pe_temperature=10000,
+                 pe_offset=-0.5):
+        """Build encoder/decoder stacks, embeddings and per-level input projections."""
+        super(DeformableTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(in_feats_channel) <= num_feature_levels  # extra levels are synthesised below
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.num_feature_levels = num_feature_levels
+
+        encoder_layer = DeformableTransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            num_feature_levels, num_encoder_points, lr_mult)
+        self.encoder = DeformableTransformerEncoder(encoder_layer,
+                                                    num_encoder_layers)
+
+        # lr_mult is forwarded here too, so encoder and decoder sampling offsets share one multiplier
+        decoder_layer = DeformableTransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            num_feature_levels, num_decoder_points, lr_mult)
+        self.decoder = DeformableTransformerDecoder(
+            decoder_layer, num_decoder_layers, return_intermediate_dec)
+
+        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
+        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
+
+        self.reference_points = nn.Linear(
+            hidden_dim,
+            2,  # one 2-d reference point per query
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+
+        self.input_proj = nn.LayerList()
+        for in_channels in in_feats_channel:  # 1x1 projection for each backbone level
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2D(in_channels, hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim)))
+        in_channels = in_feats_channel[-1]
+        for _ in range(num_feature_levels - len(in_feats_channel)):  # stride-2 convs for the missing levels
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2D(
+                        in_channels,
+                        hidden_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1),
+                    nn.GroupNorm(32, hidden_dim)))
+            in_channels = hidden_dim  # later extra levels consume hidden_dim channels
+
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            temperature=pe_temperature,
+            normalize=True if position_embed_type == 'sine' else False,
+            embed_type=position_embed_type,
+            offset=pe_offset,
+            eps=1e-4)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Re-initialise embeddings, reference head and input conv layers."""
+        for table in (self.level_embed, self.tgt_embed, self.query_pos_embed):
+            normal_(table.weight)
+        xavier_uniform_(self.reference_points.weight)
+        constant_(self.reference_points.bias)
+        for proj in self.input_proj:
+            xavier_uniform_(proj[0].weight)
+            constant_(proj[0].bias)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return dict(in_feats_channel=[spec.channels for spec in input_shape])
+
+    def forward(self, src_feats, src_mask=None, *args, **kwargs):
+        srcs = []
+        for i in range(len(src_feats)):  # project each backbone level with its own input_proj
+            srcs.append(self.input_proj[i](src_feats[i]))
+        if self.num_feature_levels > len(srcs):
+            len_srcs = len(srcs)
+            for i in range(len_srcs, self.num_feature_levels):
+                if i == len_srcs:
+                    srcs.append(self.input_proj[i](src_feats[-1]))  # first extra level: from the last raw feature
+                else:
+                    srcs.append(self.input_proj[i](srcs[-1]))  # later extra levels: from the previous extra level
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        valid_ratios = []
+        for level, src in enumerate(srcs):
+            src_shape = paddle.shape(src)
+            bs = src_shape[0:1]  # slices keep bs/h/w as 1-element tensors
+            h = src_shape[2:3]
+            w = src_shape[3:4]
+            spatial_shapes.append(paddle.concat([h, w]))
+            src = src.flatten(2).transpose([0, 2, 1])  # [b, c, h, w] -> [b, h*w, c]
+            src_flatten.append(src)
+            if src_mask is not None:
+                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]  # resize the mask to this level
+            else:
+                mask = paddle.ones([bs, h, w])  # no mask given: all-ones mask
+            valid_ratios.append(get_valid_ratio(mask))
+            pos_embed = self.position_embedding(mask).flatten(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed.weight[level]  # add this level's learned embedding
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            mask = mask.flatten(1)
+            mask_flatten.append(mask)
+        src_flatten = paddle.concat(src_flatten, 1)
+        mask_flatten = None if src_mask is None else paddle.concat(mask_flatten,
+                                                                   1)
+        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
+        # [l, 2], one (h, w) row per level
+        spatial_shapes = paddle.to_tensor(
+            paddle.stack(spatial_shapes).astype('int64'))
+        # [l], start index of each level in the flattened sequence
+        level_start_index = paddle.concat([
+            paddle.zeros(
+                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
+        ])
+        # [b, l, 2]
+        valid_ratios = paddle.stack(valid_ratios, 1)
+
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
+                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)
+
+        # prepare input for decoder: tile the learned query tables over the batch
+        bs, _, c = memory.shape
+        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        reference_points = F.sigmoid(self.reference_points(query_embed))  # sigmoid keeps points in (0, 1)
+        reference_points_input = reference_points.unsqueeze(
+            2) * valid_ratios.unsqueeze(1)  # per-level points scaled by each level's valid ratio
+
+        # decoder
+        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
+                          level_start_index, mask_flatten, query_embed)
+
+        return (hs, memory, reference_points)
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py b/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
new file mode 100644
index 0000000..efeb320
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
@@ -0,0 +1,359 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention, _convert_attention_mask
+from .position_encoding import PositionEmbedding
+from .utils import _get_clones
+from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_
+
+__all__ = ['DETRTransformer']
+
+
+class TransformerEncoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(TransformerEncoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout  # fall back to the shared dropout rate
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before  # True: norm before each sub-block, False: after the residual
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.activation = getattr(F, activation)  # activation is looked up by name in paddle.nn.functional
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed  # add the positional term only when given
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        q = k = self.with_pos_embed(src, pos_embed)  # pos embed goes into q/k only, value stays plain
+        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
+
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src  # second sub-block: feed-forward network
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+class TransformerEncoder(nn.Layer):
+    """Sequential stack of cloned encoder layers with an optional final norm."""
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        """Feed ``src`` through every layer, then ``self.norm`` when set."""
+        hidden = src
+        for block in self.layers:
+            hidden = block(hidden, src_mask=src_mask, pos_embed=pos_embed)
+
+        return hidden if self.norm is None else self.norm(hidden)
+
+
+class TransformerDecoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(TransformerDecoderLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout  # fall back to the shared dropout rate
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before  # True: norm before each sub-block, False: after the residual
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.activation = getattr(F, activation)  # activation is looked up by name in paddle.nn.functional
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed  # add the positional term only when given
+
+    def forward(self,
+                tgt,
+                memory,
+                tgt_mask=None,
+                memory_mask=None,
+                pos_embed=None,
+                query_pos_embed=None):
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)  # only tgt_mask is converted; memory_mask is used as given
+
+        residual = tgt  # sub-block 1: self attention among queries
+        if self.normalize_before:
+            tgt = self.norm1(tgt)
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
+        tgt = residual + self.dropout1(tgt)
+        if not self.normalize_before:
+            tgt = self.norm1(tgt)
+
+        residual = tgt  # sub-block 2: cross attention from queries to memory
+        if self.normalize_before:
+            tgt = self.norm2(tgt)
+        q = self.with_pos_embed(tgt, query_pos_embed)
+        k = self.with_pos_embed(memory, pos_embed)  # keys get pos_embed, value is plain memory
+        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
+        tgt = residual + self.dropout2(tgt)
+        if not self.normalize_before:
+            tgt = self.norm2(tgt)
+
+        residual = tgt  # sub-block 3: feed-forward network
+        if self.normalize_before:
+            tgt = self.norm3(tgt)
+        tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+        tgt = residual + self.dropout3(tgt)
+        if not self.normalize_before:
+            tgt = self.norm3(tgt)
+        return tgt
+
+
+class TransformerDecoder(nn.Layer):
+    """Stack of cloned decoder layers with an optional output ``norm``."""
+
+    def __init__(self,
+                 decoder_layer,
+                 num_layers,
+                 norm=None,
+                 return_intermediate=False):
+        super(TransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+
+    def forward(self,
+                tgt,
+                memory,
+                tgt_mask=None,
+                memory_mask=None,
+                pos_embed=None,
+                query_pos_embed=None):
+        """Return stacked per-layer outputs when ``return_intermediate`` is
+        set, otherwise the final output with a new leading axis of size 1.
+        ``self.norm`` is applied to the returned tensors only when it is set."""
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
+
+        output = tgt
+        intermediate = []
+        for layer in self.layers:
+            output = layer(output, memory, tgt_mask=tgt_mask,
+                           memory_mask=memory_mask, pos_embed=pos_embed,
+                           query_pos_embed=query_pos_embed)
+            if self.return_intermediate:
+                # With norm=None this used to call None; keep the raw output.
+                intermediate.append(output if self.norm is None else
+                                    self.norm(output))
+        if self.norm is not None:
+            output = self.norm(output)
+        if self.return_intermediate:
+            return paddle.stack(intermediate)
+        return output.unsqueeze(0)
+
+
+@register
+class DETRTransformer(nn.Layer):
+    __shared__ = ['hidden_dim']
+
+    def __init__(self,
+                 num_queries=100,
+                 position_embed_type='sine',
+                 return_intermediate_dec=True,
+                 backbone_num_channels=2048,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 pe_temperature=10000,
+                 pe_offset=0.,
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(DETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'],\
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+
+        encoder_layer = TransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            attn_dropout, act_dropout, normalize_before)
+        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None  # final encoder norm only in pre-norm mode
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
+                                          encoder_norm)
+
+        decoder_layer = TransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            attn_dropout, act_dropout, normalize_before)
+        decoder_norm = nn.LayerNorm(hidden_dim)  # the decoder always gets an output norm
+        self.decoder = TransformerDecoder(
+            decoder_layer,
+            num_decoder_layers,
+            decoder_norm,
+            return_intermediate=return_intermediate_dec)
+
+        self.input_proj = nn.Conv2D(
+            backbone_num_channels, hidden_dim, kernel_size=1)  # 1x1 conv: backbone channels -> hidden_dim
+        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            temperature=pe_temperature,
+            normalize=True if position_embed_type == 'sine' else False,
+            embed_type=position_embed_type,
+            offset=pe_offset)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:  # only parameters with more than one dimension
+                xavier_uniform_(p)
+        conv_init_(self.input_proj)  # runs after the loop, so it re-initialises input_proj
+        normal_(self.query_pos_embed.weight)  # likewise re-initialises the query embedding
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            'backbone_num_channels': [i.channels for i in input_shape][-1],  # channels of the last input level
+        }
+
+    def _convert_attention_mask(self, mask):
+        return (mask - 1.0) * 1e9  # 1 -> 0, 0 -> -1e9: additive attention mask
+
+    def forward(self, src, src_mask=None, *args, **kwargs):
+        r"""
+        Applies a Transformer model on the inputs.
+
+        Parameters:
+            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]];
+                only the last entry, `src[-1]`, is used.
+            src_mask (Tensor, optional): Mask that is interpolated to the size
+                of the projected feature map and fed to the position
+                embedding. When None, an all-ones mask is used instead.
+                While `self.training` is true the mask is also converted to
+                an additive attention mask, `(mask - 1.0) * 1e9`; otherwise
+                no attention mask is passed. Presumably [bs, H, W] with 1 for
+                valid and 0 for padded positions - confirm. Default None.
+
+        Returns:
+            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
+            memory (Tensor): [batch_size, hidden_dim, h, w]
+            src_proj (Tensor): output of `input_proj` on `src[-1]`
+            src_mask (Tensor|None): [bs, 1, 1, h, w] in training, else None
+        """
+        # use last level feature map
+        src_proj = self.input_proj(src[-1])
+        bs, c, h, w = paddle.shape(src_proj)
+        # flatten [B, C, H, W] to [B, HxW, C]
+        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
+        if src_mask is not None:
+            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]  # resize the mask to the feature map
+        else:
+            src_mask = paddle.ones([bs, h, w])  # no mask given: all-ones mask
+        pos_embed = self.position_embedding(src_mask).flatten(1, 2)
+
+        if self.training:
+            src_mask = self._convert_attention_mask(src_mask)
+            src_mask = src_mask.reshape([bs, 1, 1, h * w])
+        else:
+            src_mask = None  # outside training no attention mask is applied
+
+        memory = self.encoder(
+            src_flatten, src_mask=src_mask, pos_embed=pos_embed)
+
+        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
+            [bs, 1, 1])
+        tgt = paddle.zeros_like(query_pos_embed)  # decoder input starts as zeros; queries enter via query_pos_embed
+        output = self.decoder(
+            tgt,
+            memory,
+            memory_mask=src_mask,
+            pos_embed=pos_embed,
+            query_pos_embed=query_pos_embed)
+
+        if self.training:
+            src_mask = src_mask.reshape([bs, 1, 1, h, w])  # restore the spatial layout for the caller
+        else:
+            src_mask = None
+
+        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
+                src_proj, src_mask)
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py b/rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
new file mode 100644
index 0000000..088b150
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
@@ -0,0 +1,527 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Modified from detrex (https://github.com/IDEA-Research/detrex)
+# Copyright 2022 The IDEA Authors. All rights reserved.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .position_encoding import PositionEmbedding
+from .deformable_transformer import (MSDeformableAttention,
+                                     DeformableTransformerEncoderLayer,
+                                     DeformableTransformerEncoder)
+from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
+                           bias_init_with_prob)
+from .utils import (_get_clones, get_valid_ratio,
+                    get_contrastive_denoising_training_group,
+                    get_sine_pos_embed, inverse_sigmoid, MLP)
+
+__all__ = ['DINOTransformer']
+
+
+class DINOTransformerDecoderLayer(nn.Layer):  # One decoder layer: query self-attention, deformable cross-attention over `memory`, then an FFN; every sub-block is residual add + LayerNorm.
+    def __init__(self,
+                 d_model=256,  # feature width of both attentions, the FFN output and the LayerNorms
+                 n_head=8,  # head count given to both attention modules
+                 dim_feedforward=1024,  # hidden width between linear1 and linear2
+                 dropout=0.,  # rate for self_attn and for dropout1..dropout4
+                 activation="relu",  # attribute name resolved on paddle.nn.functional
+                 n_levels=4,  # passed through to MSDeformableAttention
+                 n_points=4,  # passed through to MSDeformableAttention
+                 lr_mult=1.0,  # passed through to MSDeformableAttention
+                 weight_attr=None,  # used for the three LayerNorm weights only
+                 bias_attr=None):  # used for the three LayerNorm biases only
+        super(DINOTransformerDecoderLayer, self).__init__()
+
+        # self-attention sub-block (attention, dropout, post-norm)
+        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # cross-attention sub-block: multi-scale deformable attention over the encoder memory
+        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
+                                                n_points, lr_mult)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # feed-forward sub-block: linear1 -> activation -> dropout3 -> linear2
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.activation = getattr(F, activation)  # AttributeError if F has no attribute of that name
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+        self._reset_parameters()
+
+    def _reset_parameters(self):  # Initialise the two FFN linear layers.
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+        xavier_uniform_(self.linear1.weight)  # applied after linear_init_, to the weight only
+        xavier_uniform_(self.linear2.weight)
+
+    def with_pos_embed(self, tensor, pos):  # Return `tensor + pos`, or `tensor` unchanged when `pos` is None.
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt):  # FFN only; the residual add and norm3 are applied by forward().
+        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+
+    def forward(self,
+                tgt,  # query features; updated by each sub-block and returned
+                reference_points,  # forwarded unchanged to cross_attn
+                memory,  # forwarded unchanged to cross_attn
+                memory_spatial_shapes,  # forwarded unchanged to cross_attn
+                memory_level_start_index,  # forwarded unchanged to cross_attn
+                attn_mask=None,  # optional self-attention mask; True/nonzero entries become 0, the rest -inf
+                memory_mask=None,  # forwarded unchanged to cross_attn
+                query_pos_embed=None):  # optional positional term added to the queries
+        # self attention: positions are added to q and k, while the value stays the raw tgt
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        if attn_mask is not None:
+            attn_mask = paddle.where(  # convert the boolean-like mask into 0 / -inf values in tgt's dtype
+                attn_mask.astype('bool'),
+                paddle.zeros(attn_mask.shape, tgt.dtype),
+                paddle.full(attn_mask.shape, float("-inf"), tgt.dtype))
+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # cross attention: the position-augmented queries attend to memory
+        tgt2 = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
+            memory_spatial_shapes, memory_level_start_index, memory_mask)
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+
+        # ffn with residual connection and final norm
+        tgt2 = self.forward_ffn(tgt)
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+
+        return tgt
+
+
+class DINOTransformerDecoder(nn.Layer):  # Stack of decoder layers that re-estimates the reference boxes after every layer.
+    def __init__(self,
+                 hidden_dim,  # LayerNorm width; half of it is passed to get_sine_pos_embed in forward()
+                 decoder_layer,  # prototype layer handed to _get_clones
+                 num_layers,  # number of clones requested
+                 weight_attr=None,  # for the output LayerNorm
+                 bias_attr=None):  # for the output LayerNorm
+        super(DINOTransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.norm = nn.LayerNorm(  # a single norm, applied to the output of every layer in forward()
+            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)
+
+    def forward(self,
+                tgt,  # initial query features
+                ref_points_unact,  # initial reference boxes before sigmoid
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                bbox_head,  # indexable by layer: bbox_head[i](output) gives the box update of layer i
+                query_pos_head,  # maps the sine embedding to the query positional embedding
+                valid_ratios=None,
+                attn_mask=None,
+                memory_mask=None):
+        if valid_ratios is None:  # default to all ones, which leaves the reference points unscaled
+            valid_ratios = paddle.ones(
+                [memory.shape[0], memory_spatial_shapes.shape[0], 2])
+
+        output = tgt
+        intermediate = []  # normalised output of every layer
+        inter_bboxes = []  # sigmoid-activated boxes produced after every layer
+        ref_points = F.sigmoid(ref_points_unact)
+        for i, layer in enumerate(self.layers):
+            reference_points_input = ref_points.detach().unsqueeze(  # detached, so no gradient flows through the points fed to the layer
+                2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1)  # tile repeats the last axis of valid_ratios twice before the broadcasted multiply
+            query_pos_embed = get_sine_pos_embed(
+                reference_points_input[..., 0, :], self.hidden_dim // 2)  # index 0 on the second-to-last axis -- presumably the first feature level; TODO confirm
+            query_pos_embed = query_pos_head(query_pos_embed)
+
+            output = layer(output, reference_points_input, memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           attn_mask, memory_mask, query_pos_embed)
+
+            ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(  # head output added to inverse_sigmoid of the previous (detached) boxes, then squashed again
+                ref_points.detach()))
+
+            intermediate.append(self.norm(output))
+            inter_bboxes.append(ref_points)
+
+        return paddle.stack(intermediate), paddle.stack(inter_bboxes)  # both stacked along a new leading per-layer axis
+
+
+@register
+class DINOTransformer(nn.Layer):
+    __shared__ = ['num_classes', 'hidden_dim']
+
+    def __init__(self,  # Build input projections, encoder/decoder stacks, embeddings and prediction heads.
+                 num_classes=80,  # output width of every score head; row count of the denoising class embedding
+                 hidden_dim=256,  # feature width used throughout the transformer
+                 num_queries=900,  # rows of tgt_embed; also the top-k size in _get_decoder_input
+                 position_embed_type='sine',  # must be 'sine' or 'learned'
+                 in_feats_channel=[512, 1024, 2048],  # channels of each incoming feature map; read-only in this class
+                 num_levels=4,  # must be >= len(in_feats_channel); extra levels come from stride-2 convs
+                 num_encoder_points=4,
+                 num_decoder_points=4,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,  # also the number of per-layer score/bbox heads
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 lr_mult=1.0,
+                 pe_temperature=10000,
+                 pe_offset=-0.5,
+                 num_denoising=100,
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learnt_init_query=True,  # True: learned tgt_embed queries; False: queries gathered from the encoder output
+                 eps=1e-2):  # margin used when validating anchors in _get_encoder_output_anchors
+        super(DINOTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(in_feats_channel) <= num_levels  # NOTE(review): assert-based validation is skipped under `python -O`
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_decoder_layers = num_decoder_layers
+
+        weight_attr = ParamAttr(regularizer=L2Decay(0.0))  # L2Decay coefficient 0.0; shared by every layer given weight_attr below
+        bias_attr = ParamAttr(regularizer=L2Decay(0.0))  # same, for the biases
+        # backbone feature projection (creates self.input_proj)
+        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)
+
+        # Transformer module: one prototype layer per stack, handed to the encoder/decoder containers
+        encoder_layer = DeformableTransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
+            num_encoder_points, lr_mult, weight_attr, bias_attr)
+        self.encoder = DeformableTransformerEncoder(encoder_layer,
+                                                    num_encoder_layers)
+        decoder_layer = DINOTransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
+            num_decoder_points, lr_mult, weight_attr, bias_attr)
+        self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,
+                                              num_decoder_layers, weight_attr,
+                                              bias_attr)
+
+        # denoising part: class embedding and noise settings consumed in forward() during training
+        self.denoising_class_embed = nn.Embedding(
+            num_classes,
+            hidden_dim,
+            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+
+        # position embedding (constructed with hidden_dim // 2) plus one learned embedding per level
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            temperature=pe_temperature,
+            normalize=True if position_embed_type == 'sine' else False,  # normalisation is enabled only for the sine variant
+            embed_type=position_embed_type,
+            offset=pe_offset)
+        self.level_embed = nn.Embedding(num_levels, hidden_dim)
+        # decoder embedding
+        self.learnt_init_query = learnt_init_query
+        if learnt_init_query:  # tgt_embed exists only in this mode
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_head = MLP(2 * hidden_dim,  # input width is twice hidden_dim
+                                  hidden_dim,
+                                  hidden_dim,
+                                  num_layers=2)
+
+        # encoder head: Linear + LayerNorm transform, then score and 4-value box heads
+        self.enc_output = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(
+                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))
+        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+        # decoder head: one independent score head and box head per decoder layer
+        self.dec_score_head = nn.LayerList([
+            nn.Linear(hidden_dim, num_classes)
+            for _ in range(num_decoder_layers)
+        ])
+        self.dec_bbox_head = nn.LayerList([
+            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+            for _ in range(num_decoder_layers)
+        ])
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):  # Initialise the heads, embeddings and input projections created in __init__.
+        # class and bbox head init: score biases take the value from bias_init_with_prob(0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        linear_init_(self.enc_score_head)
+        constant_(self.enc_score_head.bias, bias_cls)
+        constant_(self.enc_bbox_head.layers[-1].weight)  # constant_ with its default value -- presumably 0; TODO confirm
+        constant_(self.enc_bbox_head.layers[-1].bias)
+        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):  # same scheme for every per-layer decoder head
+            linear_init_(cls_)
+            constant_(cls_.bias, bias_cls)
+            constant_(reg_.layers[-1].weight)
+            constant_(reg_.layers[-1].bias)
+
+        linear_init_(self.enc_output[0])  # enc_output[0] is the nn.Linear of the encoder head
+        xavier_uniform_(self.enc_output[0].weight)
+        normal_(self.level_embed.weight)
+        if self.learnt_init_query:  # tgt_embed only exists when learnt_init_query is True
+            xavier_uniform_(self.tgt_embed.weight)
+        xavier_uniform_(self.query_pos_head.layers[0].weight)
+        xavier_uniform_(self.query_pos_head.layers[1].weight)
+        for l in self.input_proj:
+            xavier_uniform_(l[0].weight)  # l[0] is the 'conv' entry of each projection Sequential
+            constant_(l[0].bias)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):  # Derive constructor kwargs from the upstream shapes; `cfg` is not used here.
+        return {'in_feats_channel': [i.channels for i in input_shape], }  # one `.channels` value per entry of input_shape
+
+    def _build_input_proj_layer(self,  # Create self.input_proj with exactly self.num_levels conv+GroupNorm projections.
+                                in_feats_channel,  # channel count of each incoming feature map
+                                weight_attr=None,  # for the GroupNorm weights
+                                bias_attr=None):  # for the GroupNorm biases
+        self.input_proj = nn.LayerList()
+        for in_channels in in_feats_channel:  # one 1x1 projection per incoming feature map
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels, self.hidden_dim, kernel_size=1)), (
+                            'norm', nn.GroupNorm(
+                                32,  # number of groups
+                                self.hidden_dim,
+                                weight_attr=weight_attr,
+                                bias_attr=bias_attr))))
+        in_channels = in_feats_channel[-1]  # the first extra level reads the last raw feature map
+        for _ in range(self.num_levels - len(in_feats_channel)):  # extra levels beyond the provided feature maps
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels,
+                        self.hidden_dim,
+                        kernel_size=3,
+                        stride=2,  # stride-2 3x3 conv, so each extra level is spatially downsampled
+                        padding=1)), ('norm', nn.GroupNorm(
+                            32,
+                            self.hidden_dim,
+                            weight_attr=weight_attr,
+                            bias_attr=bias_attr))))
+            in_channels = self.hidden_dim  # later extra levels consume the previous projected level
+
+    def _get_encoder_input(self, feats, pad_mask=None):  # Project the feature maps and flatten them, with masks and position embeddings, into encoder inputs.
+        # get projection features: input_proj[i] is applied to feats[i]
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        if self.num_levels > len(proj_feats):  # synthesise the missing levels
+            len_srcs = len(proj_feats)
+            for i in range(len_srcs, self.num_levels):
+                if i == len_srcs:
+                    proj_feats.append(self.input_proj[i](feats[-1]))  # first extra level: from the last raw feature map
+                else:
+                    proj_feats.append(self.input_proj[i](proj_feats[-1]))  # later ones: from the previous projected level
+
+        # get encoder inputs (per-level lists, concatenated after the loop)
+        feat_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        valid_ratios = []
+        for i, feat in enumerate(proj_feats):
+            bs, _, h, w = paddle.shape(feat)
+            spatial_shapes.append(paddle.stack([h, w]))
+            # [b,c,h,w] -> [b,h*w,c]
+            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
+            if pad_mask is not None:
+                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]  # resize the mask to this level; the leading axis is added only for interpolate
+            else:
+                mask = paddle.ones([bs, h, w])  # no padding info: all-ones mask
+            valid_ratios.append(get_valid_ratio(mask))
+            # [b, h*w, c]
+            pos_embed = self.position_embedding(mask).flatten(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed.weight[i]  # add the learned embedding of level i
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            if pad_mask is not None:  # masks are collected only when a pad_mask was supplied
+                # [b, h*w]
+                mask_flatten.append(mask.flatten(1))
+
+        # [b, l, c]
+        feat_flatten = paddle.concat(feat_flatten, 1)
+        # [b, l]; None when no pad_mask was given
+        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,
+                                                                   1)
+        # [b, l, c]
+        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
+        # [num_levels, 2]
+        spatial_shapes = paddle.to_tensor(
+            paddle.stack(spatial_shapes).astype('int64'))
+        # [l] start index of each level: 0 followed by the cumulative h*w of all but the last level
+        level_start_index = paddle.concat([
+            paddle.zeros(
+                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
+        ])
+        # [b, num_levels, 2]
+        valid_ratios = paddle.stack(valid_ratios, 1)
+        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
+                lvl_pos_embed_flatten, valid_ratios)
+
+    def forward(self, feats, pad_mask=None, gt_meta=None):  # Run encoder and decoder; return per-layer boxes/logits, encoder top-k outputs and dn_meta.
+        # input projection and embedding
+        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
+         lvl_pos_embed_flatten,
+         valid_ratios) = self._get_encoder_input(feats, pad_mask)
+
+        # encoder
+        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
+                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)
+
+        # prepare denoising training: gt_meta is only read in training mode
+        if self.training:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(gt_meta,
+                                            self.num_classes,
+                                            self.num_queries,
+                                            self.denoising_class_embed.weight,
+                                            self.num_denoising,
+                                            self.label_noise_ratio,
+                                            self.box_noise_scale)
+        else:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None  # inference: no denoising queries and no attention mask
+
+        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
+            self._get_decoder_input(
+            memory, spatial_shapes, mask_flatten, denoising_class,
+            denoising_bbox_unact)
+
+        # decoder: per-layer features and per-layer reference boxes
+        inter_feats, inter_bboxes = self.decoder(
+            target, init_ref_points_unact, memory, spatial_shapes,
+            level_start_index, self.dec_bbox_head, self.query_pos_head,
+            valid_ratios, attn_mask, mask_flatten)
+        out_bboxes = []
+        out_logits = []
+        for i in range(self.num_decoder_layers):
+            out_logits.append(self.dec_score_head[i](inter_feats[i]))
+            if i == 0:  # the first layer is offset from the initial un-activated reference points
+                out_bboxes.append(
+                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
+                              init_ref_points_unact))
+            else:  # later layers are offset from the previous layer's boxes, mapped back through inverse_sigmoid
+                out_bboxes.append(
+                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
+                              inverse_sigmoid(inter_bboxes[i - 1])))
+        out_bboxes = paddle.stack(out_bboxes)  # stacked along a new leading per-layer axis
+        out_logits = paddle.stack(out_logits)
+
+        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
+                dn_meta)
+
+    def _get_encoder_output_anchors(self,  # Build one anchor per memory position (in logit form) and the transformed, masked encoder memory.
+                                    memory,  # encoder output; positions with invalid anchors are zeroed before enc_output
+                                    spatial_shapes,  # iterated as (h, w) pairs, one per level
+                                    memory_mask=None,  # optional per-position mask; entries > 0 are kept
+                                    grid_size=0.05):  # anchor width/height at level 0; doubled for each further level
+        output_anchors = []
+        idx = 0  # running start offset of the current level inside the flattened memory
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            if memory_mask is not None:
+                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])
+                valid_H = paddle.sum(mask_[:, :, 0], 1)  # sum of the first column -- presumably the count of valid rows; TODO confirm mask polarity
+                valid_W = paddle.sum(mask_[:, 0, :], 1)  # sum of the first row -- presumably the count of valid columns
+            else:
+                valid_H, valid_W = h, w  # no mask: the whole level is treated as valid
+
+            grid_y, grid_x = paddle.meshgrid(
+                paddle.arange(end=h), paddle.arange(end=w))
+            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)
+
+            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(
+                [-1, 1, 1, 2]).astype(grid_xy.dtype)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # cell centres divided by the valid extent
+            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
+            output_anchors.append(
+                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))  # 4 values per position: x, y, w, h
+            idx += h * w
+
+        output_anchors = paddle.concat(output_anchors, 1)
+        valid_mask = ((output_anchors > self.eps) *  # keep anchors whose 4 values all lie strictly inside (eps, 1 - eps)
+                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)
+        output_anchors = paddle.log(output_anchors / (1 - output_anchors))  # logit, i.e. the inverse of sigmoid
+        if memory_mask is not None:
+            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0  # additionally require a positive memory_mask entry
+        output_anchors = paddle.where(valid_mask, output_anchors,
+                                      paddle.to_tensor(float("inf")))  # invalid anchors are replaced by +inf
+
+        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))  # zero the memory at invalid positions
+        output_memory = self.enc_output(memory)
+        return output_memory, output_anchors
+
+    def _get_decoder_input(self,  # Pick the top-scoring encoder positions and assemble the decoder queries and reference boxes.
+                           memory,
+                           spatial_shapes,
+                           memory_mask=None,
+                           denoising_class=None,  # optional denoising query features, prepended to the queries
+                           denoising_bbox_unact=None):  # optional denoising boxes (pre-sigmoid), prepended to the references
+        bs, _, _ = memory.shape
+        # prepare input for decoder: encoder-side class scores and box proposals for every position
+        output_memory, output_anchors = self._get_encoder_output_anchors(
+            memory, spatial_shapes, memory_mask)
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact = self.enc_bbox_head(
+            output_memory) + output_anchors
+
+        _, topk_ind = paddle.topk(  # rank positions by their highest class score and keep num_queries of them
+            enc_outputs_class.max(-1), self.num_queries, axis=1)
+        # extract region proposal boxes: build (batch, position) index pairs for gather_nd
+        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)
+        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
+        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
+        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
+                                                  topk_ind)  # unsigmoided.
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)  # computed before the concat below, so it excludes denoising boxes
+        if denoising_bbox_unact is not None:
+            reference_points_unact = paddle.concat(
+                [denoising_bbox_unact, reference_points_unact], 1)
+        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
+
+        # extract region features: learned queries, or detached encoder features at the selected positions
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = paddle.gather_nd(output_memory, topk_ind).detach()
+        if denoising_class is not None:
+            target = paddle.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact.detach(  # the reference points are returned detached
+        ), enc_topk_bboxes, enc_topk_logits
\ No newline at end of file
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
new file mode 100644
index 0000000..144f2fa
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
@@ -0,0 +1,85 @@
+# Multi-scale deformable attention自定义OP编译
+该自定义OP参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)文档实现。
+
+## 1. 环境依赖
+- Paddle >= 2.3.2
+- gcc 8.2
+
+## 2. 安装
+请在当前路径下进行编译安装
+```
+cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
+python setup_ms_deformable_attn_op.py install
+```
+
+编译完成后即可使用，以下为`ms_deformable_attn`的使用示例
+```
+import paddle  # 先导入paddle，再引入自定义op
+from deformable_detr_ops import ms_deformable_attn
+
+# 构造fake input tensor
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+    return [value, sampling_locations, attention_weights]
+
+value, sampling_locations, attention_weights = get_test_tensors(c)
+
+output = ms_deformable_attn(value,
+                            spatial_shapes,
+                            level_start_index,
+                            sampling_locations,
+                            attention_weights)
+```
+
+## 3. 单元测试
+可以通过执行单元测试来确认自定义算子功能的正确性，执行单元测试的示例如下所示：
+```
+python test_ms_deformable_attn_op.py
+```
+运行成功后，打印如下：
+```
+*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
+*tensor1 True check_gradient_numerical(D=30)
+*tensor2 True check_gradient_numerical(D=30)
+*tensor3 True check_gradient_numerical(D=30)
+*tensor1 True check_gradient_numerical(D=32)
+*tensor2 True check_gradient_numerical(D=32)
+*tensor3 True check_gradient_numerical(D=32)
+*tensor1 True check_gradient_numerical(D=64)
+*tensor2 True check_gradient_numerical(D=64)
+*tensor3 True check_gradient_numerical(D=64)
+*tensor1 True check_gradient_numerical(D=71)
+*tensor2 True check_gradient_numerical(D=71)
+*tensor3 True check_gradient_numerical(D=71)
+*tensor1 True check_gradient_numerical(D=128)
+*tensor2 True check_gradient_numerical(D=128)
+*tensor3 True check_gradient_numerical(D=128)
+*tensor1 True check_gradient_numerical(D=1024)
+*tensor2 True check_gradient_numerical(D=1024)
+*tensor3 True check_gradient_numerical(D=1024)
+*tensor1 True check_gradient_numerical(D=1025)
+*tensor2 True check_gradient_numerical(D=1025)
+*tensor3 True check_gradient_numerical(D=1025)
+*tensor1 True check_gradient_numerical(D=2048)
+*tensor2 True check_gradient_numerical(D=2048)
+*tensor3 True check_gradient_numerical(D=2048)
+*tensor1 True check_gradient_numerical(D=3096)
+*tensor2 True check_gradient_numerical(D=3096)
+*tensor3 True check_gradient_numerical(D=3096)
+```
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
new file mode 100644
index 0000000..d1758ad
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/extension.h"
+
+#include <vector>
+
+// Forward declarations of the GPU forward/backward functions (defined outside this file); they are registered as the op kernels below.
+std::vector<paddle::Tensor>
+MSDeformableAttnCUDAForward(const paddle::Tensor &value,
+                            const paddle::Tensor &value_spatial_shapes,
+                            const paddle::Tensor &value_level_start_index,
+                            const paddle::Tensor &sampling_locations,
+                            const paddle::Tensor &attention_weights);
+
+std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(  // same inputs as the forward pass plus the gradient of its output
+    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
+    const paddle::Tensor &value_level_start_index,
+    const paddle::Tensor &sampling_locations,
+    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
+
+//// CPU not implemented
+
+std::vector<std::vector<int64_t>>  // one shape per op output; the op has a single output
+MSDeformableAttnInferShape(std::vector<int64_t> value_shape,  // read at indices 0, 2 and 3
+                           std::vector<int64_t> value_spatial_shapes_shape,  // unused
+                           std::vector<int64_t> value_level_start_index_shape,  // unused
+                           std::vector<int64_t> sampling_locations_shape,  // read at index 1
+                           std::vector<int64_t> attention_weights_shape) {  // unused
+  return {{value_shape[0], sampling_locations_shape[1],  // 3-D output: value dim 0, sampling_locations dim 1,
+           value_shape[2] * value_shape[3]}};  // and the product of value dims 2 and 3
+}
+
+std::vector<paddle::DataType>  // one dtype per op output
+MSDeformableAttnInferDtype(paddle::DataType value_dtype,
+                           paddle::DataType value_spatial_shapes_dtype,  // unused
+                           paddle::DataType value_level_start_index_dtype,  // unused
+                           paddle::DataType sampling_locations_dtype,  // unused
+                           paddle::DataType attention_weights_dtype) {  // unused
+  return {value_dtype};  // the output takes the dtype of `value`
+}
+
+PD_BUILD_OP(ms_deformable_attn)  // registers the forward op under the name ms_deformable_attn
+    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",  // five inputs, in the parameter order of the forward kernel
+             "AttentionWeights"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))  // only the CUDA function is registered as kernel
+    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
+
+PD_BUILD_GRAD_OP(ms_deformable_attn)  // registers the backward op paired with ms_deformable_attn
+    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",  // the forward inputs plus the gradient of "Out"
+             "AttentionWeights", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),  // a gradient slot is declared for every forward input
+              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
+              paddle::Grad("AttentionWeights")})
+    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
new file mode 100644
index 0000000..d5a8d16
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
@@ -0,0 +1,1073 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/extension.h"
+
+#define CUDA_KERNEL_LOOP(i, n)                                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                 \
+       i += blockDim.x * gridDim.x)
+
+const int CUDA_NUM_THREADS = 1024; // block size passed to the launch in MSDeformableAttnCUDAForward
+inline int GET_BLOCKS(const int N, const int num_threads) { // number of blocks of num_threads needed to cover N items
+  return (N + num_threads - 1) / num_threads; // integer division rounded up
+}
+
+// forward bilinear: interpolate channel c of head m at fractional position (h, w) from its four integer neighbours
+template <typename data_t>
+__device__ data_t deformable_attn_bilinear_forward(
+    const data_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &channels, const data_t &h, const data_t &w,
+    const int &m, const int &c) {
+  const int h_low = floor(h); // lower integer neighbour along each axis
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const data_t lh = h - h_low; // fractional distance from the low neighbour
+  const data_t lw = w - w_low;
+  const data_t hh = 1 - lh, hw = 1 - lw; // complementary distances
+
+  const int w_stride = nheads * channels; // elements between horizontally adjacent positions
+  const int h_stride = width * w_stride; // elements between vertically adjacent rows
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c; // offset of (head m, channel c) inside one position
+
+  data_t v1 = 0; // a neighbour that fails its bounds test contributes 0
+  if (h_low >= 0 && w_low >= 0) { // upper bounds are not re-checked; the kernel in this file only calls with h < height and w < width
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+  }
+  data_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1) {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+  }
+  data_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0) {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+  }
+  data_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1) {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+  }
+
+  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; // bilinear weights of v1..v4
+
+  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+// forward kernel: each index in [0, n) produces one output element, the weighted sum of its sampled values
+template <typename data_t>
+__global__ void deformable_attn_cuda_kernel_forward(
+    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,
+    const int64_t *data_level_start_index, const data_t *data_sampling_loc,
+    const data_t *data_attn_weight, const int batch_size,
+    const int value_length, const int num_heads, const int channels,
+    const int num_levels, const int query_length, const int num_points,
+    data_t *output_data_ptr) {
+  CUDA_KERNEL_LOOP(index, n) {
+    int _temp = index; // peel (channel, head, query, batch) off the flat index, fastest dimension first
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % query_length; // not used below
+    _temp /= query_length;
+    const int b_col = _temp;
+
+    data_t *data_ptr = output_data_ptr + index;
+    int data_weight_ptr = sampling_index * num_levels * num_points; // first weight of this (batch, query, head)
+    int data_loc_w_ptr = data_weight_ptr << 1; // two location values per weight
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride; // start of this batch item in data_value
+    data_t col = 0; // accumulator for the output element
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col]; // int64 values narrowed to int
+      const int spatial_h_ptr = l_col << 1; // spatial shapes are stored as (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +
+                                                   level_start_id * qid_stride);
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; // stored order is (w, h)
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        const data_t h_im = loc_h * spatial_h - 0.5; // location scaled by the level size, shifted by -0.5
+        const data_t w_im = loc_w * spatial_w - 0.5;
+
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { // samples outside (-1, size) add nothing
+          col += deformable_attn_bilinear_forward(
+                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
+                     h_im, w_im, m_col, c_col) *
+                 weight;
+        }
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_ptr = col;
+  }
+}
+
+#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
+// forward: checks that all inputs are GPU tensors, launches the float kernel on
+// value's stream and returns {output} of shape (batch, query, heads * channels).
+std::vector<paddle::Tensor>
+MSDeformableAttnCUDAForward(const paddle::Tensor &value,
+                            const paddle::Tensor &value_spatial_shapes,
+                            const paddle::Tensor &value_level_start_index,
+                            const paddle::Tensor &sampling_locations,
+                            const paddle::Tensor &attention_weights) {
+  CHECK_INPUT_GPU(value);
+  CHECK_INPUT_GPU(value_spatial_shapes);
+  CHECK_INPUT_GPU(value_level_start_index);
+  CHECK_INPUT_GPU(sampling_locations);
+  CHECK_INPUT_GPU(attention_weights);
+  // value dims are read as (batch_size, value_length, num_heads, channels).
+  const int batch_size = value.shape()[0];
+  const int value_length = value.shape()[1];
+  const int num_heads = value.shape()[2];
+  const int channels = value.shape()[3];
+
+  const int num_levels = value_spatial_shapes.shape()[0];
+  const int query_length = sampling_locations.shape()[1];
+  const int num_points = sampling_locations.shape()[4];
+  // Allocate on value's own device; a default-constructed GPUPlace() need not match it.
+  auto output = paddle::full({batch_size, query_length, num_heads * channels},
+                             0, value.dtype(), value.place());
+  const int num_kernels = batch_size * query_length * num_heads * channels;
+  if (num_kernels == 0) return {output}; // a launch with zero blocks is an invalid configuration
+  deformable_attn_cuda_kernel_forward<float>
+      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
+         value.stream()>>>(num_kernels, value.data<float>(),
+                           value_spatial_shapes.data<int64_t>(),
+                           value_level_start_index.data<int64_t>(),
+                           sampling_locations.data<float>(),
+                           attention_weights.data<float>(), batch_size,
+                           value_length, num_heads, channels, num_levels,
+                           query_length, num_points, output.data<float>());
+  return {output};
+}
+
+// backward bilinear: for one sample, atomically scatters the value gradient and stores the location/weight gradients through the given pointers
+template <typename data_t>
+__device__ void deformable_attn_bilinear_backward(
+    const data_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &channels, const data_t &h, const data_t &w,
+    const int &m, const int &c, const data_t &top_grad,
+    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  const int h_low = floor(h); // lower integer neighbour along each axis
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const data_t lh = h - h_low; // fractional distance from the low neighbour
+  const data_t lw = w - w_low;
+  const data_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels; // elements between horizontally adjacent positions
+  const int h_stride = width * w_stride; // elements between vertically adjacent rows
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c; // offset of (head m, channel c) inside one position
+
+  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; // bilinear weights of v1..v4
+  const data_t top_grad_value = top_grad * attn_weight; // upstream gradient reaching the interpolated value
+  data_t grad_h_weight = 0, grad_w_weight = 0; // derivative of the interpolated value w.r.t. h and w
+
+  data_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0) {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value + ptr1, w1 * top_grad_value); // atomic accumulation into grad_value
+  }
+  data_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1) {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
+  }
+  data_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0) {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
+  }
+  data_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1) {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
+  }
+
+  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); // same interpolated value as the forward pass
+  *grad_attn_weight = top_grad * val; // plain stores: the destination slot is overwritten, not accumulated
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value; // slot 0 holds the w gradient, scaled by width
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; // slot 1 holds the h gradient, scaled by height
+}
+
+template <typename data_t> // variant of the bilinear backward step that accumulates every result with atomicAdd
+__device__ void deformable_attn_bilinear_backward_gm(
+    const data_t *&bottom_data, const int &height, const int &width,
+    const int &nheads, const int &channels, const data_t &h, const data_t &w,
+    const int &m, const int &c, const data_t &top_grad,
+    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  const int h_low = floor(h); // lower integer neighbour along each axis
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const data_t lh = h - h_low; // fractional distance from the low neighbour
+  const data_t lw = w - w_low;
+  const data_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels; // elements between horizontally adjacent positions
+  const int h_stride = width * w_stride; // elements between vertically adjacent rows
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c; // offset of (head m, channel c) inside one position
+
+  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; // bilinear weights of v1..v4
+  const data_t top_grad_value = top_grad * attn_weight; // upstream gradient reaching the interpolated value
+  data_t grad_h_weight = 0, grad_w_weight = 0; // derivative of the interpolated value w.r.t. h and w
+
+  data_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0) {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
+  }
+  data_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1) {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
+  }
+  data_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0) {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
+  }
+  data_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1) {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
+  }
+
+  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); // same interpolated value as the forward pass
+  atomicAdd(grad_attn_weight, top_grad * val); // accumulated, unlike the plain stores in deformable_attn_bilinear_backward
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); // slot 0: w gradient, scaled by width
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); // slot 1: h gradient, scaled by height
+}
+
+// backward kernels
+// channels > 1024 (original note; the dispatch code is outside this view): block-level tree reduction in dynamic shared memory, merged with atomicAdd
+template <typename data_t>
+__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks(
+    const int n, const data_t *grad_col, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  CUDA_KERNEL_LOOP(index, n) { // NOTE(review): threads with index >= n skip the body and never reach the barriers below
+    extern __shared__ int _s[]; // size comes from the launch; 3 * blockDim.x data_t elements are indexed below
+    data_t *cache_grad_sampling_loc = (data_t *)_s; // first 2 * blockDim.x elements: (w, h) gradient per thread
+    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; // next blockDim.x elements: weight gradient per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index; // peel (channel, head, query, batch) off the flat index, fastest dimension first
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % query_length; // not used below
+    _temp /= query_length;
+    const int b_col = _temp;
+
+    const data_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_points;
+    int data_loc_w_ptr = data_weight_ptr << 1; // two location values per weight
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1; // NOTE(review): advances the kernel parameter itself; the shift would compound if this loop body ran twice in one thread
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1; // spatial shapes are stored as (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset =
+          data_value_ptr_init_offset + level_start_id * qid_stride;
+      const data_t *data_value_ptr = data_value + value_ptr_offset;
+      data_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; // stored order is (w, h)
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        const data_t h_im = loc_h * spatial_h - 0.5;
+        const data_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; // clear this thread's slots so skipped samples reduce as 0
+        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight + threadIdx.x) = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+          deformable_attn_bilinear_backward(
+              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+              cache_grad_sampling_loc + (threadIdx.x << 1),
+              cache_grad_attn_weight + threadIdx.x);
+        }
+
+        __syncthreads(); // all per-thread partials must be written before the reduction reads them
+
+        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; // spre is the active length before each halving step
+             s >>= 1, spre >>= 1) {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] +=
+                cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre) { // true only for tid 0 when spre is odd: fold in the leftover last element
+              cache_grad_attn_weight[tid] +=
+                  cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] +=
+                  cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] +=
+                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0) { // block total is added atomically; presumably several blocks can target the same slot - confirm at the launch site
+          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+        }
+        __syncthreads(); // keep the cache intact until thread 0 has read it
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+// Backward kernel, global-memory variant: each index in [0, n) handles one element
+// of grad_col and accumulates all of its gradients with atomicAdd.
+template <typename data_t>
+__global__ void deformable_attn_cuda_kernel_backward_gm(
+    const int n, const data_t *grad_col, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // Peel (channel, head, query, batch) off the flat index, fastest first.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= query_length; // the query coordinate itself is not needed
+    const int b_col = _temp;
+    const data_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_points;
+    int data_loc_w_ptr = data_weight_ptr << 1; // two location values per weight
+    // Per-iteration cursors: advancing the kernel parameters themselves would
+    // carry the offset over into a later pass of the strided loop.
+    data_t *grad_loc_ptr = grad_sampling_loc + data_loc_w_ptr;
+    data_t *grad_weight_ptr = grad_attn_weight + data_weight_ptr;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1; // spatial shapes are (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset =
+          data_value_ptr_init_offset + level_start_id * qid_stride;
+      const data_t *data_value_ptr = data_value + value_ptr_offset;
+      data_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; // (w, h) order
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        const data_t h_im = loc_h * spatial_h - 0.5;
+        const data_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+          deformable_attn_bilinear_backward_gm(
+              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+              grad_loc_ptr, grad_weight_ptr);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_ptr += 1;
+        grad_loc_ptr += 2;
+      }
+    }
+  }
+}
+
+// channels <= 1024 (original note; the dispatch code is outside this view): static shared memory sized by blockSize, serial sum by thread 0
+template <typename data_t, unsigned int blockSize>
+__global__ void
+deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1(
+    const int n, const data_t *grad_col, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  CUDA_KERNEL_LOOP(index, n) { // NOTE(review): threads with index >= n skip the body and never reach the barriers below
+    __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; // (w, h) gradient per thread; assumes blockDim.x <= blockSize - confirm at the launch site
+    __shared__ data_t cache_grad_attn_weight[blockSize]; // weight gradient per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index; // peel (channel, head, query, batch) off the flat index, fastest dimension first
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % query_length; // not used below
+    _temp /= query_length;
+    const int b_col = _temp;
+
+    const data_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_points;
+    int data_loc_w_ptr = data_weight_ptr << 1; // two location values per weight
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1; // NOTE(review): advances the kernel parameter itself; the shift would compound if this loop body ran twice in one thread
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1; // spatial shapes are stored as (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset =
+          data_value_ptr_init_offset + level_start_id * qid_stride;
+      const data_t *data_value_ptr = data_value + value_ptr_offset;
+      data_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; // stored order is (w, h)
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        const data_t h_im = loc_h * spatial_h - 0.5;
+        const data_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; // clear this thread's slots so skipped samples sum as 0
+        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight + threadIdx.x) = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+          deformable_attn_bilinear_backward(
+              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+              cache_grad_sampling_loc + (threadIdx.x << 1),
+              cache_grad_attn_weight + threadIdx.x);
+        }
+
+        __syncthreads(); // all per-thread partials must be written before thread 0 sums them
+        if (tid == 0) {
+          data_t _grad_w = cache_grad_sampling_loc[0],
+                 _grad_h = cache_grad_sampling_loc[1],
+                 _grad_a = cache_grad_attn_weight[0];
+          int sid = 2;
+          for (unsigned int tid = 1; tid < blockSize; ++tid) { // this tid shadows the outer one; sums all blockSize entries serially
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+
+          *grad_sampling_loc = _grad_w; // plain stores, not atomics: presumably one block owns each slot - confirm at the launch site
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads(); // keep the cache intact until thread 0 has finished reading it
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename data_t, unsigned int blockSize> // static shared memory sized by blockSize, reduced by repeated halving
+__global__ void
+deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2(
+    const int n, const data_t *grad_col, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  CUDA_KERNEL_LOOP(index, n) { // NOTE(review): threads with index >= n skip the body and never reach the barriers below
+    __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; // (w, h) gradient per thread; assumes blockDim.x <= blockSize - confirm at the launch site
+    __shared__ data_t cache_grad_attn_weight[blockSize]; // weight gradient per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index; // peel (channel, head, query, batch) off the flat index, fastest dimension first
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % query_length; // not used below
+    _temp /= query_length;
+    const int b_col = _temp;
+
+    const data_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_points;
+    int data_loc_w_ptr = data_weight_ptr << 1; // two location values per weight
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1; // NOTE(review): advances the kernel parameter itself; the shift would compound if this loop body ran twice in one thread
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1; // spatial shapes are stored as (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset =
+          data_value_ptr_init_offset + level_start_id * qid_stride;
+      const data_t *data_value_ptr = data_value + value_ptr_offset;
+      data_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; // stored order is (w, h)
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        const data_t h_im = loc_h * spatial_h - 0.5;
+        const data_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; // clear this thread's slots so skipped samples reduce as 0
+        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight + threadIdx.x) = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+          deformable_attn_bilinear_backward(
+              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+              cache_grad_sampling_loc + (threadIdx.x << 1),
+              cache_grad_attn_weight + threadIdx.x);
+        }
+
+        __syncthreads(); // all per-thread partials must be written before the reduction reads them
+
+        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { // plain halving: covers every entry only when blockSize is a power of two
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] +=
+                cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0) { // plain stores, not atomics: presumably one block owns each slot - confirm at the launch site
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads(); // keep the cache intact until thread 0 has read it
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename data_t> // dynamic shared memory sized by the launch, serial sum by thread 0
+__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1(
+    const int n, const data_t *grad_col, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  CUDA_KERNEL_LOOP(index, n) { // NOTE(review): threads with index >= n skip the body and never reach the barriers below
+    extern __shared__ int _s[]; // size comes from the launch; 3 * blockDim.x data_t elements are indexed below
+    data_t *cache_grad_sampling_loc = (data_t *)_s; // first 2 * blockDim.x elements: (w, h) gradient per thread
+    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; // next blockDim.x elements: weight gradient per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index; // peel (channel, head, query, batch) off the flat index, fastest dimension first
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % query_length; // not used below
+    _temp /= query_length;
+    const int b_col = _temp;
+
+    const data_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_points;
+    int data_loc_w_ptr = data_weight_ptr << 1; // two location values per weight
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1; // NOTE(review): advances the kernel parameter itself; the shift would compound if this loop body ran twice in one thread
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1; // spatial shapes are stored as (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset =
+          data_value_ptr_init_offset + level_start_id * qid_stride;
+      const data_t *data_value_ptr = data_value + value_ptr_offset;
+      data_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; // stored order is (w, h)
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        const data_t h_im = loc_h * spatial_h - 0.5;
+        const data_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; // clear this thread's slots so skipped samples sum as 0
+        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight + threadIdx.x) = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+          deformable_attn_bilinear_backward(
+              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+              cache_grad_sampling_loc + (threadIdx.x << 1),
+              cache_grad_attn_weight + threadIdx.x);
+        }
+
+        __syncthreads(); // all per-thread partials must be written before thread 0 sums them
+        if (tid == 0) {
+          data_t _grad_w = cache_grad_sampling_loc[0],
+                 _grad_h = cache_grad_sampling_loc[1],
+                 _grad_a = cache_grad_attn_weight[0];
+          int sid = 2;
+          for (unsigned int tid = 1; tid < blockDim.x; ++tid) { // this tid shadows the outer one; sums all blockDim.x entries serially
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+
+          *grad_sampling_loc = _grad_w; // plain stores, not atomics: presumably one block owns each slot - confirm at the launch site
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads(); // keep the cache intact until thread 0 has finished reading it
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+// Backward kernel that reduces per-thread partial gradients through
+// dynamically sized shared memory.
+//
+// Each loop index is decoded into (b_col, q_col, m_col, c_col). For every
+// (level, point) pair each thread writes its partial sampling-location and
+// attention-weight gradients into its own shared-memory slots, the block sums
+// the slots with a halving reduction, and thread 0 stores the totals to
+// grad_sampling_loc / grad_attn_weight.
+//
+// The dynamic shared-memory buffer must hold 3 * blockDim.x elements of
+// data_t: 2 * blockDim.x sampling-location slots followed by blockDim.x
+// attention-weight slots.
+//
+// NOTE(review): thread 0 uses plain stores through pointers derived from its
+// own sampling_index, so this appears to require that all threads of a block
+// share one sampling_index (e.g. blockDim.x == channels) -- confirm against
+// the launcher.
+template <typename data_t>
+__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2(
+    const int n, const data_t *grad_col, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // Shared-memory layout: [2 * blockDim.x location slots | blockDim.x
+    // weight slots], reinterpreted from the raw int buffer as data_t.
+    extern __shared__ int _s[];
+    data_t *cache_grad_sampling_loc = (data_t *)_s;
+    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Peel the flat index apart, innermost dimension first:
+    // channel, head, query, batch.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    // Index with the channel dimension removed: (batch, query, head).
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % query_length;
+    _temp /= query_length;
+    const int b_col = _temp;
+
+    const data_t top_grad = grad_col[index];
+
+    // Offsets of the first (level, point) entry for this sampling_index; the
+    // location arrays hold two values per entry, hence the shift by one.
+    int data_weight_ptr = sampling_index * num_levels * num_points;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col) {
+      const int level_start_id = data_level_start_index[l_col];
+      // data_spatial_shapes stores (h, w) pairs, one pair per level.
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset =
+          data_value_ptr_init_offset + level_start_id * qid_stride;
+      const data_t *data_value_ptr = data_value + value_ptr_offset;
+      data_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_points; ++p_col) {
+        // Locations are stored as (w, h) pairs.
+        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const data_t weight = data_attn_weight[data_weight_ptr];
+
+        // Location scaled by the level's size, shifted by half a cell.
+        const data_t h_im = loc_h * spatial_h - 0.5;
+        const data_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's slots so a skipped sample contributes zero to
+        // the block-wide sum below.
+        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight + threadIdx.x) = 0;
+        // Only samples strictly inside (-1, size) on both axes are processed.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+          deformable_attn_bilinear_backward(
+              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
+              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
+              cache_grad_sampling_loc + (threadIdx.x << 1),
+              cache_grad_attn_weight + threadIdx.x);
+        }
+
+        // All slots must be written before any thread starts summing them.
+        __syncthreads();
+
+        // Halving reduction. `spre` is the number of live slots before the
+        // step and `s` is half of it (rounded down); when `spre` is odd the
+        // leftover slot at index 2 * s is folded in by thread 0.
+        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
+             s >>= 1, spre >>= 1) {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] +=
+                cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre) {
+              cache_grad_attn_weight[tid] +=
+                  cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] +=
+                  cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] +=
+                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+
+        // Slot 0 now holds the block total; thread 0 publishes it.
+        if (tid == 0) {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        // Keep the slots untouched until thread 0 has read them; the next
+        // iteration overwrites them.
+        __syncthreads();
+
+        // Advance to the next (level, point) entry.
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+// backward branch
+//
+// Host-side dispatcher for the backward pass. Launches exactly one backward
+// kernel on `stream`, picking the variant from `channels`:
+//   * channels > 1024 and a multiple of 1024 -> shm_reduce_v2_multi_blocks
+//   * channels > 1024 otherwise               -> gm
+//   * 1, 2, 4, 8, 16, 32                      -> blocksize_aware_reduce_v1
+//   * 64, 128, 256, 512, 1024                 -> blocksize_aware_reduce_v2
+//   * any other value below 64                -> shm_reduce_v1
+//   * any other value up to 1024              -> shm_reduce_v2
+//
+// One work item is scheduled per (batch, query, head, channel) element and the
+// block size is min(channels, CUDA_NUM_THREADS). Kernels that take dynamic
+// shared memory are given 3 * num_threads elements of data_t.
+//
+// grad_value, grad_sampling_loc and grad_attn_weight are the output buffers
+// handed to the kernel. The launch is asynchronous and its status is not
+// checked here.
+template <typename data_t>
+void deformable_attn_cuda_backward(
+    cudaStream_t stream, const data_t *grad_out, const data_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const data_t *data_sampling_loc, const data_t *data_attn_weight,
+    const int batch_size, const int value_length, const int num_heads,
+    const int channels, const int num_levels, const int query_length,
+    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
+    data_t *grad_attn_weight) {
+  const int num_kernels = batch_size * query_length * num_heads * channels;
+  if (num_kernels <= 0) {
+    // Nothing to differentiate. Launching anyway would request zero blocks
+    // and/or zero threads per block, which is not a valid launch
+    // configuration, so leave the output buffers untouched instead.
+    return;
+  }
+  const int num_threads =
+      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
+  // Same value as num_kernels; kept as a separate name for the grid-size
+  // computation.
+  const int num_actual_kernels =
+      batch_size * query_length * num_heads * channels;
+  if (channels > 1024) {
+    if ((channels & 1023) == 0) {
+      deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks<data_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+             num_threads * 3 * sizeof(data_t), stream>>>(
+              num_kernels, grad_out, data_value, data_spatial_shapes,
+              data_level_start_index, data_sampling_loc, data_attn_weight,
+              batch_size, value_length, num_heads, channels, num_levels,
+              query_length, num_points, grad_value, grad_sampling_loc,
+              grad_attn_weight);
+    } else {
+      deformable_attn_cuda_kernel_backward_gm<data_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+    }
+  } else {
+    // Power-of-two channel counts use kernels specialised on the block size
+    // at compile time; everything else falls back to the generic reducers.
+    switch (channels) {
+    case 1:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
+                                                                         1>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 2:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
+                                                                         2>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 4:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
+                                                                         4>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 8:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
+                                                                         8>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 16:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
+                                                                         16>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 32:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t,
+                                                                         32>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 64:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
+                                                                         64>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 128:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
+                                                                         128>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 256:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
+                                                                         256>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 512:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
+                                                                         512>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    case 1024:
+      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t,
+                                                                         1024>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
+             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
+                       data_level_start_index, data_sampling_loc,
+                       data_attn_weight, batch_size, value_length, num_heads,
+                       channels, num_levels, query_length, num_points,
+                       grad_value, grad_sampling_loc, grad_attn_weight);
+      break;
+    default:
+      // Non-power-of-two channel counts: the block size is only known at run
+      // time, so the reduction buffer is passed as dynamic shared memory.
+      if (channels < 64) {
+        deformable_attn_cuda_kernel_backward_shm_reduce_v1<data_t>
+            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+               num_threads * 3 * sizeof(data_t), stream>>>(
+                num_kernels, grad_out, data_value, data_spatial_shapes,
+                data_level_start_index, data_sampling_loc, data_attn_weight,
+                batch_size, value_length, num_heads, channels, num_levels,
+                query_length, num_points, grad_value, grad_sampling_loc,
+                grad_attn_weight);
+      } else {
+        deformable_attn_cuda_kernel_backward_shm_reduce_v2<data_t>
+            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+               num_threads * 3 * sizeof(data_t), stream>>>(
+                num_kernels, grad_out, data_value, data_spatial_shapes,
+                data_level_start_index, data_sampling_loc, data_attn_weight,
+                batch_size, value_length, num_heads, channels, num_levels,
+                query_length, num_points, grad_value, grad_sampling_loc,
+                grad_attn_weight);
+      }
+    }
+  }
+}
+
+// backward
+//
+// Custom-op entry point for the backward pass of multi-scale deformable
+// attention on GPU.
+//
+// Dimensions are read from the inputs as follows:
+//   value              -> [batch_size, value_length, num_heads, channels]
+//   value_spatial_shapes.shape()[0] -> num_levels
+//   sampling_locations.shape()[1]   -> query_length
+//   sampling_locations.shape()[4]   -> num_points
+//
+// Returns five tensors, in order: gradients for value, spatial shapes, level
+// start index, sampling locations and attention weights. Only the first,
+// fourth and fifth are written by the kernel; the other two stay zero-filled.
+//
+// The kernel is instantiated for float only, and the buffers are accessed via
+// data<float>().
+std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
+    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
+    const paddle::Tensor &value_level_start_index,
+    const paddle::Tensor &sampling_locations,
+    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) {
+
+  CHECK_INPUT_GPU(value);
+  CHECK_INPUT_GPU(value_spatial_shapes);
+  CHECK_INPUT_GPU(value_level_start_index);
+  CHECK_INPUT_GPU(sampling_locations);
+  CHECK_INPUT_GPU(attention_weights);
+  CHECK_INPUT_GPU(grad_out);
+
+  const int batch_size = value.shape()[0];
+  const int value_length = value.shape()[1];
+  const int num_heads = value.shape()[2];
+  const int channels = value.shape()[3];
+
+  const int num_levels = value_spatial_shapes.shape()[0];
+  const int query_length = sampling_locations.shape()[1];
+  const int num_points = sampling_locations.shape()[4];
+
+  // Allocate every gradient on the device that holds `value`. The kernel is
+  // launched on value.stream(), so a default-constructed GPUPlace could put
+  // the buffers on a different device than the one the kernel runs on.
+  const auto &place = value.place();
+
+  auto grad_value = paddle::full(value.shape(), 0, value.dtype(), place);
+  // NOTE(review): these two placeholders are shaped and typed like `value`,
+  // not like the integer inputs they correspond to; kept as-is because the
+  // op registration that consumes them is not visible here.
+  auto grad_spatial_shapes =
+      paddle::full(value.shape(), 0, value.dtype(), place);
+  auto grad_level_start_index =
+      paddle::full(value.shape(), 0, value.dtype(), place);
+  auto grad_sampling_locations = paddle::full(
+      sampling_locations.shape(), 0, sampling_locations.dtype(), place);
+  auto grad_attention_weights = paddle::full(
+      attention_weights.shape(), 0, attention_weights.dtype(), place);
+
+  deformable_attn_cuda_backward<float>(
+      value.stream(), grad_out.data<float>(), value.data<float>(),
+      value_spatial_shapes.data<int64_t>(),
+      value_level_start_index.data<int64_t>(), sampling_locations.data<float>(),
+      attention_weights.data<float>(), batch_size, value_length, num_heads,
+      channels, num_levels, query_length, num_points, grad_value.data<float>(),
+      grad_sampling_locations.data<float>(),
+      grad_attention_weights.data<float>());
+
+  return {grad_value, grad_spatial_shapes, grad_level_start_index,
+          grad_sampling_locations, grad_attention_weights};
+}
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
new file mode 100644
index 0000000..7c3c386
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
@@ -0,0 +1,7 @@
+from paddle.utils.cpp_extension import CUDAExtension, setup
+
+if __name__ == "__main__":
+    setup(
+        name='deformable_detr_ops',
+        ext_modules=CUDAExtension(
+            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
new file mode 100644
index 0000000..94a0573
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import os
+import sys
+import random
+import numpy as np
+import paddle
+# add python path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.modeling.transformers.utils import deformable_attention_core_func
+ms_deform_attn_core_paddle = deformable_attention_core_func
+
+try:
+    gpu_index = int(sys.argv[1])
+except:
+    gpu_index = 0
+print(f'Use gpu {gpu_index} to test...')
+paddle.set_device(f'gpu:{gpu_index}')
+
+try:
+    from deformable_detr_ops import ms_deformable_attn
+except Exception as e:
+    print('import deformable_detr_ops error', e)
+    sys.exit(-1)
+
+paddle.seed(1)
+random.seed(1)
+np.random.seed(1)
+
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+
+    return [value, sampling_locations, attention_weights]
+
+
+@paddle.no_grad()
+def check_forward_equal_with_paddle_float():
+    value, sampling_locations, attention_weights = get_test_tensors(c)
+
+    output_paddle = ms_deform_attn_core_paddle(
+        value, spatial_shapes, level_start_index, sampling_locations,
+        attention_weights).detach().cpu()
+    output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index,
+                                     sampling_locations,
+                                     attention_weights).detach().cpu()
+    fwdok = paddle.allclose(
+        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
+    max_abs_err = (output_cuda - output_paddle).abs().max().item()
+    max_rel_err = (
+        (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()
+
+    print(
+        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
+    )
+
+
+def check_gradient_numerical(channels=4):
+    value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors(
+        channels)
+    value_paddle.stop_gradient = False
+    sampling_locations_paddle.stop_gradient = False
+    attention_weights_paddle.stop_gradient = False
+
+    value_cuda = value_paddle.detach().clone()
+    sampling_locations_cuda = sampling_locations_paddle.detach().clone()
+    attention_weights_cuda = attention_weights_paddle.detach().clone()
+    value_cuda.stop_gradient = False
+    sampling_locations_cuda.stop_gradient = False
+    attention_weights_cuda.stop_gradient = False
+
+    output_paddle = ms_deform_attn_core_paddle(
+        value_paddle, spatial_shapes, level_start_index,
+        sampling_locations_paddle, attention_weights_paddle)
+    output_paddle.sum().backward()
+
+    output_cuda = ms_deformable_attn(value_cuda, spatial_shapes,
+                                     level_start_index, sampling_locations_cuda,
+                                     attention_weights_cuda)
+    output_cuda.sum().backward()
+
+    res = paddle.allclose(
+        value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item()
+    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')
+
+    res = paddle.allclose(
+        sampling_locations_paddle.grad,
+        sampling_locations_cuda.grad,
+        rtol=1e-2,
+        atol=1e-3).item()
+    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')
+
+    res = paddle.allclose(
+        attention_weights_paddle.grad,
+        attention_weights_cuda.grad,
+        rtol=1e-2,
+        atol=1e-3).item()
+    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
+
+
+if __name__ == '__main__':
+    check_forward_equal_with_paddle_float()
+
+    for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]:
+        check_gradient_numerical(channels)
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py b/rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
new file mode 100644
index 0000000..7e0c77c
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
@@ -0,0 +1,287 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.ops import get_act_fn
+from ..shape_spec import ShapeSpec
+from ..backbones.csp_darknet import BaseConv
+from ..backbones.cspresnet import RepVggBlock
+from ppdet.modeling.transformers.detr_transformer import TransformerEncoder
+from ..initializer import xavier_uniform_, linear_init_
+from ..layers import MultiHeadAttention
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+__all__ = ['HybridEncoder']
+
+
+class CSPRepLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=3,
+                 expansion=1.0,
+                 bias=False,
+                 act="silu"):
+        super(CSPRepLayer, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.bottlenecks = nn.Sequential(*[
+            RepVggBlock(
+                hidden_channels, hidden_channels, act=act)
+            for _ in range(num_blocks)
+        ])
+        if hidden_channels != out_channels:
+            self.conv3 = BaseConv(
+                hidden_channels,
+                out_channels,
+                ksize=1,
+                stride=1,
+                bias=bias,
+                act=act)
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+        return self.conv3(x_1 + x_2)
+
+
+@register
+class TransformerLayer(nn.Layer):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(TransformerLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.activation = getattr(F, activation)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        q = k = self.with_pos_embed(src, pos_embed)
+        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
+
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+@register
+@serializable
+class HybridEncoder(nn.Layer):
+    """Multi-level feature encoder combining a transformer with conv fusion.
+
+    Every input level is projected to ``hidden_dim`` channels, the levels
+    listed in ``use_encoder_idx`` are refined by a transformer encoder, and
+    the levels are then fused first from the last level to the first
+    ("top-down") and afterwards from the first to the last ("bottom-up").
+    """
+    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
+    __inject__ = ['encoder_layer']
+
+    def __init__(self,
+                 in_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 hidden_dim=256,
+                 use_encoder_idx=[2],
+                 num_encoder_layers=1,
+                 encoder_layer='TransformerLayer',
+                 pe_temperature=10000,
+                 expansion=1.0,
+                 depth_mult=1.0,
+                 act='silu',
+                 trt=False,
+                 eval_size=None):
+        """Build the projection, encoder and fusion sub-layers.
+
+        Args:
+            in_channels: channel count of each input feature map.
+            feat_strides: stride of each input feature map.
+            hidden_dim: channel count of every projected and output map.
+            use_encoder_idx: indices of the levels passed through the
+                transformer encoder.
+            num_encoder_layers: layers per transformer encoder; the encoder
+                stage is skipped in ``forward`` when this is 0.
+            encoder_layer: layer handed to each ``TransformerEncoder``.
+            pe_temperature: temperature of the sin-cos position embedding.
+            expansion: forwarded to every ``CSPRepLayer``.
+            depth_mult: each ``CSPRepLayer`` gets ``round(3 * depth_mult)``
+                blocks.
+            act: activation; resolved with ``get_act_fn`` when it is None, a
+                str or a dict, otherwise used as given.
+            trt: forwarded to ``get_act_fn``.
+            eval_size: optional fixed evaluation size, indexed as
+                (height, width); enables precomputed position embeddings.
+        """
+        super(HybridEncoder, self).__init__()
+        self.in_channels = in_channels
+        self.feat_strides = feat_strides
+        self.hidden_dim = hidden_dim
+        self.use_encoder_idx = use_encoder_idx
+        self.num_encoder_layers = num_encoder_layers
+        self.pe_temperature = pe_temperature
+        self.eval_size = eval_size
+
+        # channel projection
+        # One 1x1 conv + batch norm per input level.
+        self.input_proj = nn.LayerList()
+        for in_channel in in_channels:
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2D(
+                        in_channel, hidden_dim, kernel_size=1, bias_attr=False),
+                    nn.BatchNorm2D(
+                        hidden_dim,
+                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                        bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))
+        # encoder transformer
+        # One encoder per entry of use_encoder_idx.
+        self.encoder = nn.LayerList([
+            TransformerEncoder(encoder_layer, num_encoder_layers)
+            for _ in range(len(use_encoder_idx))
+        ])
+
+        act = get_act_fn(
+            act, trt=trt) if act is None or isinstance(act,
+                                                       (str, dict)) else act
+        # top-down fpn
+        # One lateral conv and one fusion block per pair of adjacent levels.
+        self.lateral_convs = nn.LayerList()
+        self.fpn_blocks = nn.LayerList()
+        for idx in range(len(in_channels) - 1, 0, -1):
+            self.lateral_convs.append(
+                BaseConv(
+                    hidden_dim, hidden_dim, 1, 1, act=act))
+            self.fpn_blocks.append(
+                CSPRepLayer(
+                    hidden_dim * 2,
+                    hidden_dim,
+                    round(3 * depth_mult),
+                    act=act,
+                    expansion=expansion))
+
+        # bottom-up pan
+        # One stride-2 conv and one fusion block per pair of adjacent levels.
+        self.downsample_convs = nn.LayerList()
+        self.pan_blocks = nn.LayerList()
+        for idx in range(len(in_channels) - 1):
+            self.downsample_convs.append(
+                BaseConv(
+                    hidden_dim, hidden_dim, 3, stride=2, act=act))
+            self.pan_blocks.append(
+                CSPRepLayer(
+                    hidden_dim * 2,
+                    hidden_dim,
+                    round(3 * depth_mult),
+                    act=act,
+                    expansion=expansion))
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Precompute position embeddings when ``eval_size`` is set.
+
+        For every encoded level the embedding for the map size implied by
+        ``eval_size`` and the level's stride is stored as the attribute
+        ``pos_embed{idx}``.
+        """
+        if self.eval_size:
+            for idx in self.use_encoder_idx:
+                stride = self.feat_strides[idx]
+                pos_embed = self.build_2d_sincos_position_embedding(
+                    self.eval_size[1] // stride, self.eval_size[0] // stride,
+                    self.hidden_dim, self.pe_temperature)
+                setattr(self, f'pos_embed{idx}', pos_embed)
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(w,
+                                           h,
+                                           embed_dim=256,
+                                           temperature=10000.):
+        """Build a fixed 2D sine-cosine position embedding.
+
+        Args:
+            w: grid width.
+            h: grid height.
+            embed_dim: embedding width; must be divisible by 4.
+            temperature: base of the frequency schedule.
+
+        Returns:
+            Tensor of shape ``[1, w * h, embed_dim]`` holding, per position,
+            sin and cos of the width coordinate followed by sin and cos of
+            the height coordinate.
+        """
+        grid_w = paddle.arange(int(w), dtype=paddle.float32)
+        grid_h = paddle.arange(int(h), dtype=paddle.float32)
+        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
+        assert embed_dim % 4 == 0, \
+            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        # A quarter of the channels for each of sin(w), cos(w), sin(h), cos(h).
+        pos_dim = embed_dim // 4
+        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        omega = 1. / (temperature**omega)
+
+        # Outer product of the flattened coordinates with the frequencies.
+        out_w = grid_w.flatten()[..., None] @omega[None]
+        out_h = grid_h.flatten()[..., None] @omega[None]
+
+        return paddle.concat(
+            [
+                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
+                paddle.cos(out_h)
+            ],
+            axis=1)[None, :, :]
+
+    def forward(self, feats, for_mot=False):
+        """Encode and fuse the input feature maps.
+
+        Args:
+            feats: one feature map per entry of ``in_channels``.
+            for_mot: accepted but not used in this method.
+
+        Returns:
+            List with one fused feature map per input level, in input order.
+        """
+        assert len(feats) == len(self.in_channels)
+        # get projection features
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        # encoder
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.use_encoder_idx):
+                h, w = proj_feats[enc_ind].shape[2:]
+                # flatten [B, C, H, W] to [B, HxW, C]
+                src_flatten = proj_feats[enc_ind].flatten(2).transpose(
+                    [0, 2, 1])
+                # Build the embedding for the actual map size while training
+                # or when no fixed eval size exists; otherwise reuse the one
+                # precomputed in _reset_parameters.
+                if self.training or self.eval_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        w, h, self.hidden_dim, self.pe_temperature)
+                else:
+                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
+                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
+                # Restore the [B, C, H, W] layout.
+                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(
+                    [-1, self.hidden_dim, h, w])
+
+        # top-down fpn
+        # Start from the last level and walk towards the first, upsampling by
+        # 2 and fusing with the next level; results are inserted at the front
+        # so inner_outs ends up in input order.
+        inner_outs = [proj_feats[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = proj_feats[idx - 1]
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
+                feat_heigh)
+            inner_outs[0] = feat_heigh
+
+            upsample_feat = F.interpolate(
+                feat_heigh, scale_factor=2., mode="nearest")
+            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
+                paddle.concat(
+                    [upsample_feat, feat_low], axis=1))
+            inner_outs.insert(0, inner_out)
+
+        # bottom-up pan
+        # Walk back from the first level to the last, downsampling with a
+        # stride-2 conv and fusing with the matching top-down result.
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsample_convs[idx](feat_low)
+            out = self.pan_blocks[idx](paddle.concat(
+                [downsample_feat, feat_height], axis=1))
+            outs.append(out)
+
+        return outs
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        """Derive ``in_channels`` and ``feat_strides`` from ``input_shape``."""
+        return {
+            'in_channels': [i.channels for i in input_shape],
+            'feat_strides': [i.stride for i in input_shape]
+        }
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` per level: ``hidden_dim`` channels, same stride."""
+        return [
+            ShapeSpec(
+                channels=self.hidden_dim, stride=self.feat_strides[idx])
+            for idx in range(len(self.in_channels))
+        ]
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/matchers.py b/rtdetr_paddle/ppdet/modeling/transformers/matchers.py
new file mode 100644
index 0000000..72459a3
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/matchers.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from scipy.optimize import linear_sum_assignment
+
+from ppdet.core.workspace import register, serializable
+from ..losses.iou_loss import GIoULoss
+from .utils import bbox_cxcywh_to_xyxy
+
+__all__ = ['HungarianMatcher']
+
+
+@register
+@serializable
+class HungarianMatcher(nn.Layer):
+    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']
+
+    def __init__(self,
+                 matcher_coeff={
+                     'class': 1,
+                     'bbox': 5,
+                     'giou': 2,
+                     'mask': 1,
+                     'dice': 1
+                 },
+                 use_focal_loss=False,
+                 with_mask=False,
+                 num_sample_points=12544,
+                 alpha=0.25,
+                 gamma=2.0):
+        r"""Hungarian matcher over a weighted sum of class, L1 box, GIoU and optional mask/dice costs.
+        Args:
+            matcher_coeff (dict): Cost weights keyed by 'class', 'bbox', 'giou', 'mask', 'dice' (copied).
+        """
+        super(HungarianMatcher, self).__init__()
+        self.matcher_coeff = dict(matcher_coeff)  # copy so the shared default dict is never aliased
+        self.use_focal_loss = use_focal_loss  # sigmoid scores + focal-style class cost instead of softmax
+        self.with_mask = with_mask  # also add mask BCE and dice costs
+        self.num_sample_points = num_sample_points  # random points sampled per mask for the mask costs
+        self.alpha = alpha  # focal class-cost balance factor
+        self.gamma = gamma  # focal class-cost focusing exponent
+
+        self.giou_loss = GIoULoss()
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None):
+        r"""Match predictions to ground truth per image by minimum total cost.
+        Args:
+            boxes (Tensor): [b, query, 4], cxcywh (converted to xyxy for the GIoU cost)
+            logits (Tensor): [b, query, num_classes]
+            gt_bbox (List(Tensor)): list[[n, 4]]
+            gt_class (List(Tensor)): list[[n, 1]]
+            masks (Tensor|None): [b, query, h, w], required when `with_mask` is True
+            gt_mask (List(Tensor)|None): list[[n, H, W]], required when `with_mask` is True
+        Raises: ValueError if `with_mask` is True and `masks` or `gt_mask` is None.
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, len(index_i) = len(index_j) = min(num_queries, num_target_boxes);
+            every pair is two empty int64 tensors when the whole batch has no ground truth.
+        """
+        bs, num_queries = boxes.shape[:2]
+
+        num_gts = [len(a) for a in gt_class]
+        if sum(num_gts) == 0:
+            return [(paddle.to_tensor(
+                [], dtype=paddle.int64), paddle.to_tensor(
+                    [], dtype=paddle.int64)) for _ in range(bs)]
+
+        # We flatten to compute the cost matrices in a batch
+        # [batch_size * num_queries, num_classes]
+        logits = logits.detach()
+        out_prob = F.sigmoid(logits.flatten(
+            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
+        # [batch_size * num_queries, 4]
+        out_bbox = boxes.detach().flatten(0, 1)
+
+        # Also concat the target labels and boxes
+        tgt_ids = paddle.concat(gt_class).flatten()
+        tgt_bbox = paddle.concat(gt_bbox)
+
+        # Classification cost: keep only the score columns of the target classes
+        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
+        if self.use_focal_loss:
+            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
+                1 - out_prob + 1e-8).log())
+            pos_cost_class = self.alpha * (
+                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
+            cost_class = pos_cost_class - neg_cost_class
+        else:
+            cost_class = -out_prob
+
+        # Compute the L1 cost between boxes
+        cost_bbox = (
+            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)
+
+        # Compute the giou cost between boxes
+        cost_giou = self.giou_loss(
+            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
+            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)
+
+        # Final cost matrix
+        C = self.matcher_coeff['class'] * cost_class + \
+            self.matcher_coeff['bbox'] * cost_bbox + \
+            self.matcher_coeff['giou'] * cost_giou
+        # Compute the mask cost and dice cost
+        if self.with_mask:
+            if masks is None or gt_mask is None:
+                raise ValueError('Make sure the input has `mask` and `gt_mask`')
+            # all masks share the same set of points for efficient matching
+            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
+            sample_points = 2.0 * sample_points - 1.0
+
+            out_mask = F.grid_sample(
+                masks.detach(), sample_points, align_corners=False).squeeze(-2)
+            out_mask = out_mask.flatten(0, 1)
+
+            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
+            sample_points = paddle.concat([
+                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
+                if b > 0
+            ])
+            tgt_mask = F.grid_sample(
+                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])
+
+            with paddle.amp.auto_cast(enable=False):
+                # binary cross entropy cost
+                pos_cost_mask = F.binary_cross_entropy_with_logits(
+                    out_mask, paddle.ones_like(out_mask), reduction='none')
+                neg_cost_mask = F.binary_cross_entropy_with_logits(
+                    out_mask, paddle.zeros_like(out_mask), reduction='none')
+                cost_mask = paddle.matmul(
+                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
+                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
+                cost_mask /= self.num_sample_points
+
+                # dice cost
+                out_mask = F.sigmoid(out_mask)
+                numerator = 2 * paddle.matmul(
+                    out_mask, tgt_mask, transpose_y=True)
+                denominator = out_mask.sum(
+                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
+                cost_dice = 1 - (numerator + 1) / (denominator + 1)
+
+                C = C + self.matcher_coeff['mask'] * cost_mask + \
+                    self.matcher_coeff['dice'] * cost_dice
+
+        C = C.reshape([bs, num_queries, -1])
+        C = [a.squeeze(0) for a in C.chunk(bs)]
+        sizes = [a.shape[0] for a in gt_bbox]
+        indices = [
+            linear_sum_assignment(c.split(sizes, -1)[i].numpy())  # image i against its own targets only
+            for i, c in enumerate(C)
+        ]
+        return [(paddle.to_tensor(
+            i, dtype=paddle.int64), paddle.to_tensor(
+                j, dtype=paddle.int64)) for i, j in indices]
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py b/rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
new file mode 100644
index 0000000..a2c3260
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+
+from ppdet.core.workspace import register, serializable
+
+
+@register
+@serializable
+class PositionEmbedding(nn.Layer):
+    def __init__(self,
+                 num_pos_feats=128,
+                 temperature=10000,
+                 normalize=True,
+                 scale=2 * math.pi,
+                 embed_type='sine',
+                 num_embeddings=50,
+                 offset=0.,
+                 eps=1e-6):
+        """2-D position embedding: fixed sine/cosine features or learned row/column tables."""
+        super(PositionEmbedding, self).__init__()
+        assert embed_type in ['sine', 'learned']
+        self.embed_type, self.offset, self.eps = embed_type, offset, eps
+
+        if embed_type == 'learned':
+            # One table per axis; forward() looks up indices 0..h-1 and 0..w-1.
+            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
+            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
+        elif embed_type == 'sine':
+            self.num_pos_feats = num_pos_feats
+            self.temperature = temperature
+            self.normalize = normalize
+            self.scale = scale
+        else:
+            raise ValueError(f"{self.embed_type} is not supported.")
+
+    def forward(self, mask):
+        """
+        Args:
+            mask (Tensor): [B, H, W]; its cumulative sums over H and W are the raw positions
+        Returns:
+            pos (Tensor): 'sine' -> [B, H, W, 2 * num_pos_feats]; 'learned' -> [1, H, W, 2 * num_pos_feats]
+        """
+        kind = self.embed_type
+        if kind == 'learned':
+            h, w = mask.shape[-2:]
+            col_feats = self.col_embed(paddle.arange(w))
+            row_feats = self.row_embed(paddle.arange(h))
+            # Broadcast column features down the rows and row features across the columns.
+            grid = paddle.concat(
+                [
+                    col_feats.unsqueeze(0).tile([h, 1, 1]),
+                    row_feats.unsqueeze(1).tile([1, w, 1]),
+                ],
+                axis=-1)
+            return grid.unsqueeze(0)
+        if kind != 'sine':
+            raise ValueError(f"not supported {self.embed_type}")
+
+        def interleave(angles):
+            # sin of the even channels and cos of the odd channels, re-interleaved on the last axis
+            pair = (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos())
+            return paddle.stack(pair, axis=4).flatten(3)
+
+        # Running sums along H (rows) and W (cols) act as the raw coordinates.
+        rows = mask.cumsum(1)
+        cols = mask.cumsum(2)
+        if self.normalize:
+            # Divide by the final cumulative value on each axis, then stretch by `scale`.
+            rows = (rows + self.offset) / (rows[:, -1:, :] + self.eps) * self.scale
+            cols = (cols + self.offset) / (cols[:, :, -1:] + self.eps) * self.scale
+
+        # Channel pairs (2k, 2k + 1) share the exponent 2k / num_pos_feats.
+        exponent = 2 * (paddle.arange(self.num_pos_feats) // 2).astype('float32')
+        freqs = self.temperature**(exponent / self.num_pos_feats)
+
+        feats_x = interleave(cols.unsqueeze(-1) / freqs)
+        feats_y = interleave(rows.unsqueeze(-1) / freqs)
+        return paddle.concat((feats_y, feats_x), axis=3)
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py b/rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
new file mode 100644
index 0000000..3eccdec
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
@@ -0,0 +1,523 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .deformable_transformer import MSDeformableAttention
+from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
+                           bias_init_with_prob)
+from .utils import (_get_clones, get_sine_pos_embed,
+                    get_contrastive_denoising_training_group, inverse_sigmoid, MLP)
+
+__all__ = ['RTDETRTransformer']
+
+
+class PPMSDeformableAttention(MSDeformableAttention):  # uses the parent's projection layers; only forward is defined here
+    def forward(self,
+                query,
+                reference_points,
+                value,
+                value_spatial_shapes,
+                value_level_start_index,
+                value_mask=None):
+        """Project `value`, predict per-query sampling offsets/weights, and run the deformable-attention core.
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area; a last dim of 4 is also accepted (see the branch below)
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
+            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+        Raises: ValueError if the last dim of `reference_points` is neither 2 nor 4.
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
+            value *= value_mask  # zero out the padded positions
+        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])
+
+        sampling_offsets = self.sampling_offsets(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
+        attention_weights = self.attention_weights(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
+        attention_weights = F.softmax(attention_weights).reshape(  # softmax over the joint levels * points axis
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])
+
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = paddle.to_tensor(value_spatial_shapes)
+            offset_normalizer = offset_normalizer.flip([1]).reshape(  # (H, W) -> (W, H) per level
+                [1, 1, 1, self.num_levels, 1, 2])
+            sampling_locations = reference_points.reshape([
+                bs, Len_q, 1, self.num_levels, 1, 2
+            ]) + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = (  # first two entries are the anchor point; last two scale the offsets
+                reference_points[:, :, None, :, None, :2] + sampling_offsets /
+                self.num_points * reference_points[:, :, None, :, None, 2:] *
+                0.5)
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".
+                format(reference_points.shape[-1]))
+
+        if not isinstance(query, paddle.Tensor):  # NOTE(review): presumably the static-graph/export path - confirm
+            from ppdet.modeling.transformers.utils import deformable_attention_core_func
+            output = deformable_attention_core_func(
+                value, value_spatial_shapes, value_level_start_index,
+                sampling_locations, attention_weights)
+        else:
+            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
+            value_level_start_index = paddle.to_tensor(value_level_start_index)
+            output = self.ms_deformable_attn_core(  # core op provided by the parent class; takes tensor shapes/indices
+                value, value_spatial_shapes, value_level_start_index,
+                sampling_locations, attention_weights)
+        output = self.output_proj(output)
+
+        return output
+
+
+class TransformerDecoderLayer(nn.Layer):  # self-attn -> deformable cross-attn -> FFN, each with residual add + LayerNorm
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(TransformerDecoderLayer, self).__init__()
+
+        # self attention (LayerNorm parameters below are created with L2Decay(0.0))
+        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(
+            d_model,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+
+        # cross attention over the multi-level encoder memory
+        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
+                                                  n_points, 1.0)  # NOTE(review): 1.0 is presumably lr_mult - confirm
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(
+            d_model,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+
+        # ffn: d_model -> dim_feedforward -> d_model
+        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
+                                 bias_attr)
+        self.activation = getattr(F, activation)  # looked up by name in paddle.nn.functional
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
+                                 bias_attr)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(
+            d_model,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self._reset_parameters()
+
+    def _reset_parameters(self):  # re-initialise only the two FFN linears
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+        xavier_uniform_(self.linear1.weight)  # applied after linear_init_, so this is the final weight init
+        xavier_uniform_(self.linear2.weight)
+
+    def with_pos_embed(self, tensor, pos):  # add the positional embedding when one is given
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt):  # linear1 -> activation -> dropout3 -> linear2
+        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                attn_mask=None,
+                memory_mask=None,
+                query_pos_embed=None):
+        # self attention: queries/keys carry the positional embedding, values do not
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        if attn_mask is not None:
+            attn_mask = paddle.where(  # truthy -> 0.0, falsy -> -inf: boolean-like mask becomes additive
+                attn_mask.astype('bool'),
+                paddle.zeros(attn_mask.shape, tgt.dtype),
+                paddle.full(attn_mask.shape, float("-inf"), tgt.dtype))
+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # cross attention: deformable sampling of `memory` around `reference_points`
+        tgt2 = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
+            memory_spatial_shapes, memory_level_start_index, memory_mask)
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+
+        # ffn with residual connection and final LayerNorm
+        tgt2 = self.forward_ffn(tgt)
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+
+        return tgt
+
+
+class TransformerDecoder(nn.Layer):  # stack of decoder layers that refines reference boxes layer by layer
+    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+        super(TransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx  # negative values count from the last layer
+
+    def forward(self,
+                tgt,
+                ref_points_unact,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                bbox_head,
+                score_head,
+                query_pos_head,
+                attn_mask=None,
+                memory_mask=None):
+        output = tgt
+        dec_out_bboxes = []  # one entry per layer in training; only the eval_idx layer otherwise
+        dec_out_logits = []
+        ref_points_detach = F.sigmoid(ref_points_unact)
+        for i, layer in enumerate(self.layers):
+            ref_points_input = ref_points_detach.unsqueeze(2)  # extra axis in the level position expected by cross-attn
+            query_pos_embed = query_pos_head(ref_points_detach)
+
+            output = layer(output, ref_points_input, memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           attn_mask, memory_mask, query_pos_embed)
+
+            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(  # delta added in un-activated space
+                ref_points_detach))
+
+            if self.training:
+                dec_out_logits.append(score_head[i](output))
+                if i == 0:
+                    dec_out_bboxes.append(inter_ref_bbox)
+                else:
+                    dec_out_bboxes.append(  # uses the previous layer's non-detached boxes (`ref_points`)
+                        F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
+                            ref_points)))
+            elif i == self.eval_idx:
+                dec_out_logits.append(score_head[i](output))
+                dec_out_bboxes.append(inter_ref_bbox)
+                break  # layers after eval_idx are not run outside training
+
+            ref_points = inter_ref_bbox  # first read at the next iteration (i > 0), so always bound there
+            ref_points_detach = inter_ref_bbox.detach(
+            ) if self.training else inter_ref_bbox
+
+        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)  # leading axis indexes the collected layers
+
+
+@register
+class RTDETRTransformer(nn.Layer):  # input projection + encoder-driven query selection + denoising decoder
+    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 num_queries=300,
+                 position_embed_type='sine',
+                 backbone_feat_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 num_levels=3,
+                 num_decoder_points=4,
+                 nhead=8,
+                 num_decoder_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 num_denoising=100,
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learnt_init_query=True,
+                 eval_size=None,
+                 eval_idx=-1,
+                 eps=1e-2):  # eps: anchors with a coordinate within eps of 0 or 1 are marked invalid
+        super(RTDETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(backbone_feat_channels) <= num_levels
+        assert len(feat_strides) == len(backbone_feat_channels)
+        feat_strides = list(feat_strides) + [  # copy first: never grow the shared default / caller's list
+            feat_strides[-1] * 2**(k + 1) for k in range(num_levels - len(feat_strides))]
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.feat_strides = feat_strides  # one stride per level; extra levels double the previous stride
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_decoder_layers = num_decoder_layers
+        self.eval_size = eval_size  # when set, anchors are precomputed once in _reset_parameters
+
+        # backbone feature projection
+        self._build_input_proj_layer(backbone_feat_channels)
+
+        # Transformer module
+        decoder_layer = TransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
+            num_decoder_points)
+        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,
+                                          num_decoder_layers, eval_idx)
+
+        # denoising part
+        self.denoising_class_embed = nn.Embedding(
+            num_classes,
+            hidden_dim,
+            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+
+        # decoder embedding
+        self.learnt_init_query = learnt_init_query
+        if learnt_init_query:
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
+
+        # encoder head
+        self.enc_output = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(
+                hidden_dim,
+                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
+        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+
+        # decoder head: one score head and one box head per decoder layer
+        self.dec_score_head = nn.LayerList([
+            nn.Linear(hidden_dim, num_classes)
+            for _ in range(num_decoder_layers)
+        ])
+        self.dec_bbox_head = nn.LayerList([
+            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+            for _ in range(num_decoder_layers)
+        ])
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        # class and bbox head init
+        bias_cls = bias_init_with_prob(0.01)
+        linear_init_(self.enc_score_head)
+        constant_(self.enc_score_head.bias, bias_cls)
+        constant_(self.enc_bbox_head.layers[-1].weight)
+        constant_(self.enc_bbox_head.layers[-1].bias)
+        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
+            linear_init_(cls_)
+            constant_(cls_.bias, bias_cls)
+            constant_(reg_.layers[-1].weight)
+            constant_(reg_.layers[-1].bias)
+
+        linear_init_(self.enc_output[0])
+        xavier_uniform_(self.enc_output[0].weight)
+        if self.learnt_init_query:
+            xavier_uniform_(self.tgt_embed.weight)
+        xavier_uniform_(self.query_pos_head.layers[0].weight)
+        xavier_uniform_(self.query_pos_head.layers[1].weight)
+        for l in self.input_proj:
+            xavier_uniform_(l[0].weight)
+
+        # init encoder output anchors and valid_mask (fixed evaluation size only)
+        if self.eval_size:
+            self.anchors, self.valid_mask = self._generate_anchors()
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {'backbone_feat_channels': [i.channels for i in input_shape]}
+
+    def _build_input_proj_layer(self, backbone_feat_channels):
+        self.input_proj = nn.LayerList()  # one conv + batch-norm projection per level
+        for in_channels in backbone_feat_channels:
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels,
+                        self.hidden_dim,
+                        kernel_size=1,
+                        bias_attr=False)), ('norm', nn.BatchNorm2D(
+                            self.hidden_dim,
+                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
+        in_channels = backbone_feat_channels[-1]
+        for _ in range(self.num_levels - len(backbone_feat_channels)):  # extra levels: stride-2 3x3 conv
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels,
+                        self.hidden_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        bias_attr=False)), ('norm', nn.BatchNorm2D(
+                            self.hidden_dim,
+                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
+            in_channels = self.hidden_dim
+
+    def _get_encoder_input(self, feats):
+        # get projection features
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        if self.num_levels > len(proj_feats):
+            len_srcs = len(proj_feats)
+            for i in range(len_srcs, self.num_levels):
+                if i == len_srcs:  # first extra level reads the last raw feature, later ones chain
+                    proj_feats.append(self.input_proj[i](feats[-1]))
+                else:
+                    proj_feats.append(self.input_proj[i](proj_feats[-1]))
+
+        # get encoder inputs
+        feat_flatten = []
+        spatial_shapes = []
+        level_start_index = [0, ]
+        for i, feat in enumerate(proj_feats):
+            _, _, h, w = feat.shape
+            # [b, c, h, w] -> [b, h*w, c]
+            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
+            # [num_levels, 2]
+            spatial_shapes.append([h, w])
+            # [l], start index of each level
+            level_start_index.append(h * w + level_start_index[-1])
+
+        # [b, l, c]
+        feat_flatten = paddle.concat(feat_flatten, 1)
+        level_start_index.pop()  # drop the trailing total so one start index per level remains
+        return (feat_flatten, spatial_shapes, level_start_index)
+
+    def forward(self, feats, pad_mask=None, gt_meta=None):  # NOTE(review): pad_mask is accepted but not used here
+        # input projection and embedding
+        (memory, spatial_shapes,
+         level_start_index) = self._get_encoder_input(feats)
+
+        # prepare denoising training (training mode only; all four values are None otherwise)
+        if self.training:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(gt_meta,
+                                            self.num_classes,
+                                            self.num_queries,
+                                            self.denoising_class_embed.weight,
+                                            self.num_denoising,
+                                            self.label_noise_ratio,
+                                            self.box_noise_scale)
+        else:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
+
+        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
+            self._get_decoder_input(
+            memory, spatial_shapes, denoising_class, denoising_bbox_unact)
+
+        # decoder
+        out_bboxes, out_logits = self.decoder(
+            target,
+            init_ref_points_unact,
+            memory,
+            spatial_shapes,
+            level_start_index,
+            self.dec_bbox_head,
+            self.dec_score_head,
+            self.query_pos_head,
+            attn_mask=attn_mask)
+        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
+                dn_meta)
+
+    def _generate_anchors(self,
+                          spatial_shapes=None,
+                          grid_size=0.05,
+                          dtype="float32"):
+        if spatial_shapes is None:  # fall back to the fixed eval_size divided by each stride
+            spatial_shapes = [
+                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
+                for s in self.feat_strides
+            ]
+        anchors = []
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            grid_y, grid_x = paddle.meshgrid(
+                paddle.arange(
+                    end=h, dtype=dtype),
+                paddle.arange(
+                    end=w, dtype=dtype))
+            grid_xy = paddle.stack([grid_x, grid_y], -1)
+
+            valid_WH = paddle.to_tensor([w, h]).astype(dtype)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # normalised cell centres
+            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)  # anchor size doubles per level
+            anchors.append(
+                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
+
+        anchors = paddle.concat(anchors, 1)
+        valid_mask = ((anchors > self.eps) *
+                      (anchors < 1 - self.eps)).all(-1, keepdim=True)
+        anchors = paddle.log(anchors / (1 - anchors))  # logit, i.e. inverse sigmoid
+        anchors = paddle.where(valid_mask, anchors,
+                               paddle.to_tensor(float("inf")))
+        return anchors, valid_mask
+
+    def _get_decoder_input(self,
+                           memory,
+                           spatial_shapes,
+                           denoising_class=None,
+                           denoising_bbox_unact=None):
+        bs, _, _ = memory.shape
+        # prepare input for decoder (cached anchors are reused only in eval with a fixed eval_size)
+        if self.training or self.eval_size is None:
+            anchors, valid_mask = self._generate_anchors(spatial_shapes)
+        else:
+            anchors, valid_mask = self.anchors, self.valid_mask
+        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
+        output_memory = self.enc_output(memory)
+
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors
+
+        _, topk_ind = paddle.topk(  # keep the num_queries positions with the highest best-class score
+            enc_outputs_class.max(-1), self.num_queries, axis=1)
+        # extract region proposal boxes
+        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
+        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
+        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
+
+        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
+                                                  topk_ind)  # unsigmoided.
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = paddle.concat(  # denoising queries come first
+                [denoising_bbox_unact, reference_points_unact], 1)
+        if self.training:
+            reference_points_unact = reference_points_unact.detach()
+        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
+
+        # extract region features
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = paddle.gather_nd(output_memory, topk_ind)
+            if self.training:
+                target = target.detach()
+        if denoising_class is not None:
+            target = paddle.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
diff --git a/rtdetr_paddle/ppdet/modeling/transformers/utils.py b/rtdetr_paddle/ppdet/modeling/transformers/utils.py
new file mode 100644
index 0000000..d144704
--- /dev/null
+++ b/rtdetr_paddle/ppdet/modeling/transformers/utils.py
@@ -0,0 +1,481 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Modified from detrex (https://github.com/IDEA-Research/detrex)
+# Copyright 2022 The IDEA Authors. All rights reserved.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+__all__ = [
+    '_get_clones', 'bbox_cxcywh_to_xyxy',
+    'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid',
+    'deformable_attention_core_func', 'varifocal_loss_with_logits'
+]
+
+
+
+def bbox_area(boxes):  # (col 2 - col 0) * (col 3 - col 1) per row; presumably xyxy boxes
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def bbox_overlaps(boxes1, boxes2):
+    """
+    Pairwise IoU between two sets of boxes given as (x1, y1, x2, y2).
+
+    Args:
+        boxes1 (Tensor): boxes with shape [M, 4]
+        boxes2 (Tensor): boxes with shape [N, 4]
+
+    Return:
+        overlaps (Tensor): [M, N]; entry (i, j) is the IoU of boxes1[i]
+            and boxes2[j], and 0 where the two boxes do not intersect.
+    """
+    num_rows, num_cols = boxes1.shape[0], boxes2.shape[0]
+    if num_rows * num_cols == 0:
+        # one side is empty: nothing to compare
+        return paddle.zeros([num_rows, num_cols], dtype='float32')
+
+    # [M, 1, 4] view so each box in boxes1 broadcasts against all of boxes2
+    expanded = paddle.unsqueeze(boxes1, 1)
+    lower_right = paddle.minimum(expanded[:, :, 2:], boxes2[:, 2:])
+    upper_left = paddle.maximum(expanded[:, :, :2], boxes2[:, :2])
+    # a negative extent means no overlap along that axis
+    inter = (lower_right - upper_left).clip(min=0).prod(axis=2)
+
+    area1 = bbox_area(boxes1)
+    area2 = bbox_area(boxes2)
+    union = paddle.unsqueeze(area1, 1) + area2 - inter
+    # keep the ratio only where the intersection is positive; 0 elsewhere
+    iou = inter / union
+    return paddle.where(inter > 0, iou, paddle.zeros_like(inter))
+
+
+def _get_clones(module, N):  # LayerList holding N independent deep copies of `module`
+    return nn.LayerList([copy.deepcopy(module) for _ in range(N)])
+
+
+def bbox_cxcywh_to_xyxy(x):  # (cx, cy, w, h) -> (x1, y1, x2, y2) on the last axis
+    cxcy, wh = paddle.split(x, 2, axis=-1)  # two equal halves of the last axis
+    return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)
+
+
+def bbox_xyxy_to_cxcywh(x):  # (x1, y1, x2, y2) -> (cx, cy, w, h) on the last axis
+    x1, y1, x2, y2 = x.split(4, axis=-1)  # four equal slices of the last axis
+    return paddle.concat(
+        [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1)
+
+
+def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
+    """Focal loss on logits: mean over axis 1, then sum, over normalizer."""
+    prob = F.sigmoid(logit)
+    # probability the prediction assigns to each element's own label
+    p_t = prob * label + (1 - prob) * (1 - label)
+    bce = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
+    loss = bce * ((1 - p_t)**gamma)
+    if alpha >= 0:  # a negative alpha skips the alpha weighting
+        loss = (alpha * label + (1 - alpha) * (1 - label)) * loss
+    return loss.mean(1).sum() / normalizer
+
+
+def inverse_sigmoid(x, eps=1e-5):  # log(x / (1 - x)) with x first clipped to [0, 1]
+    x = x.clip(min=0., max=1.)
+    return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps))  # eps floors both terms
+
+
+def deformable_attention_core_func(value, value_spatial_shapes,
+                                   value_level_start_index, sampling_locations,
+                                   attention_weights):
+    """Sample `value` per level with grid_sample and weight-sum the samples.
+    Args:
+        value (Tensor): [bs, value_length, n_head, c]
+        value_spatial_shapes (Tensor|List): [n_levels, 2], iterated as (h, w)
+        value_level_start_index (Tensor|List): [n_levels]; not read here
+        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
+        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]
+
+    Returns:
+        output (Tensor): [bs, query_length, n_head * c]
+    """
+    bs, _, n_head, c = value.shape
+    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
+
+    split_shape = [h * w for h, w in value_spatial_shapes]  # tokens per level
+    value_list = value.split(split_shape, axis=1)
+    sampling_grids = 2 * sampling_locations - 1  # maps 0..1 onto -1..1
+    sampling_value_list = []
+    for level, (h, w) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[level].flatten(2).transpose(
+            [0, 2, 1]).reshape([bs * n_head, c, h, w])
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(
+            [0, 2, 1, 3, 4]).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(
+            value_l_,
+            sampling_grid_l_,
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
+        [bs * n_head, 1, Len_q, n_levels * n_points])
+    output = (paddle.stack(
+        sampling_value_list, axis=-2).flatten(-2) *
+              attention_weights).sum(-1).reshape([bs, n_head * c, Len_q])
+
+    return output.transpose([0, 2, 1])  # -> [bs, Len_q, n_head * c]
+
+
+def get_valid_ratio(mask):  # per-sample sum of the first column / first row over H / W
+    _, H, W = paddle.shape(mask)
+    valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H  # first column, summed over rows
+    valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W  # first row, summed over columns
+    # [b, 2], ordered (w, h)
+    return paddle.stack([valid_ratio_w, valid_ratio_h], -1)
+
+
+def get_denoising_training_group(targets, num_classes, num_queries,
+                                 class_embed, num_denoising=100,
+                                 label_noise_ratio=0.5, box_noise_scale=1.0):
+    """Build noised ground-truth queries and their attention mask.
+    Args:
+        targets (dict): reads "gt_class" and "gt_bbox", one entry per image.
+        num_classes (int): class count; also used as the padding class id.
+        num_queries (int): matching queries that follow the denoising ones.
+        class_embed (Tensor): embedding table; a zero row is appended for pad.
+        num_denoising (int): denoising query budget; <= 0 disables denoising.
+        label_noise_ratio (float): labels redrawn w.p. label_noise_ratio * 0.5.
+        box_noise_scale (float): scale of the random box perturbation.
+    Returns:
+        tuple: (input_query_class, input_query_bbox, attn_mask, dn_meta), or
+        four Nones when denoising is disabled or the batch has no gt.
+    """
+    if num_denoising <= 0:
+        return None, None, None, None
+    num_gts = [len(t) for t in targets["gt_class"]]
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_group = max(num_denoising // max_gt_num, 1)
+    # pad gt to max_num of a batch
+    bs = len(targets["gt_class"])
+    input_query_class = paddle.full(
+        [bs, max_gt_num], num_classes, dtype='int32')
+    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
+    pad_gt_mask = paddle.zeros([bs, max_gt_num])
+    for i, num_gt in enumerate(num_gts):
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
+            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
+            pad_gt_mask[i, :num_gt] = 1
+
+    input_query_class = input_query_class.tile([1, num_group])
+    input_query_bbox = input_query_bbox.tile([1, num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, num_group])
+
+    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]
+    dn_positive_idx = paddle.split(dn_positive_idx,
+                                   [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(max_gt_num * num_group)
+
+    if label_noise_ratio > 0:
+        input_query_class = input_query_class.flatten()
+        # half of bbox prob
+        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
+        # combine as booleans instead of multiplying bool by the float mask
+        is_real_gt = pad_gt_mask.flatten().astype('bool')
+        chosen_idx = paddle.nonzero(
+            paddle.logical_and(mask, is_real_gt)).squeeze(-1)
+        # randomly put a new one here
+        new_label = paddle.randint_like(
+            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class.scatter_(chosen_idx, new_label)
+        input_query_class.reshape_([bs, num_denoising])
+
+    if box_noise_scale > 0:
+        diff = paddle.concat(
+            [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]],
+            axis=-1) * box_noise_scale
+        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)
+        input_query_bbox += diff
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    class_embed = paddle.concat(
+        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
+    input_query_class = paddle.gather(
+        class_embed, input_query_class.flatten(),
+        axis=0).reshape([bs, num_denoising, -1])
+
+    tgt_size = num_denoising + num_queries
+    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    # reconstruct cannot see each other
+    for i in range(num_group):
+        start, end = max_gt_num * i, max_gt_num * (i + 1)
+        attn_mask[start:end, end:num_denoising] = True  # later groups
+        attn_mask[start:end, :start] = True  # earlier groups
+    attn_mask = ~attn_mask
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, dn_meta
+
+
+def get_contrastive_denoising_training_group(targets, num_classes,
+                                             num_queries, class_embed,
+                                             num_denoising=100,
+                                             label_noise_ratio=0.5,
+                                             box_noise_scale=1.0):
+    """Build positive/negative noised gt queries and their attention mask.
+
+    Each denoising group holds max_gt_num positive queries followed by
+    max_gt_num negative ones; negatives receive the larger box noise.
+
+    Args:
+        targets (dict): reads "gt_class" and "gt_bbox", one entry per image.
+        num_classes (int): class count; also used as the padding class id.
+        num_queries (int): matching queries that follow the denoising ones.
+        class_embed (Tensor): embedding table; a zero row is appended for pad.
+        num_denoising (int): denoising query budget; <= 0 disables denoising.
+        label_noise_ratio (float): labels redrawn w.p. label_noise_ratio * 0.5.
+        box_noise_scale (float): scale of the random box perturbation.
+
+    Returns:
+        tuple: (input_query_class, input_query_bbox, attn_mask, dn_meta), or
+        four Nones when denoising is disabled or the batch has no gt.
+    """
+    if num_denoising <= 0:
+        return None, None, None, None
+    num_gts = [len(t) for t in targets["gt_class"]]
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_group = max(num_denoising // max_gt_num, 1)
+    group_size = max_gt_num * 2  # positives, then negatives
+    # pad gt to max_num of a batch
+    bs = len(targets["gt_class"])
+    input_query_class = paddle.full(
+        [bs, max_gt_num], num_classes, dtype='int32')
+    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
+    pad_gt_mask = paddle.zeros([bs, max_gt_num])
+    for i, num_gt in enumerate(num_gts):
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
+            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_group])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
+    # positive and negative mask
+    negative_gt_mask = paddle.zeros([bs, group_size, 1])
+    negative_gt_mask[:, max_gt_num:] = 1
+    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]
+    dn_positive_idx = paddle.split(dn_positive_idx,
+                                   [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(group_size * num_group)
+
+    if label_noise_ratio > 0:
+        input_query_class = input_query_class.flatten()
+        pad_gt_mask = pad_gt_mask.flatten().astype('bool')  # bool, like `flip`
+        # half of bbox prob
+        flip = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
+        chosen_idx = paddle.nonzero(flip * pad_gt_mask).squeeze(-1)
+        # randomly put a new one here
+        new_label = paddle.randint_like(
+            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class.scatter_(chosen_idx, new_label)
+        input_query_class.reshape_([bs, num_denoising])
+        pad_gt_mask.reshape_([bs, num_denoising])
+
+    if box_noise_scale > 0:
+        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)
+        diff = paddle.tile(input_query_bbox[..., 2:] * 0.5,
+                           [1, 1, 2]) * box_noise_scale
+        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
+        rand_part = paddle.rand(input_query_bbox.shape)
+        # negatives: magnitude rand + 1; positives: magnitude rand
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
+            1 - negative_gt_mask)
+        rand_part *= rand_sign
+        known_bbox += rand_part * diff
+        known_bbox.clip_(min=0.0, max=1.0)
+        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    class_embed = paddle.concat(
+        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
+    input_query_class = paddle.gather(
+        class_embed, input_query_class.flatten(),
+        axis=0).reshape([bs, num_denoising, -1])
+
+    tgt_size = num_denoising + num_queries
+    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    # reconstruct cannot see each other
+    for i in range(num_group):
+        start, end = group_size * i, group_size * (i + 1)
+        attn_mask[start:end, end:num_denoising] = True  # later groups
+        attn_mask[start:end, :start] = True  # earlier groups
+    attn_mask = ~attn_mask
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, dn_meta
+
+
+def get_sine_pos_embed(pos_tensor,
+                       num_pos_feats=128,
+                       temperature=10000,
+                       exchange_xy=True):
+    """Generate a sine/cosine position embedding from a position tensor.
+
+    Args:
+        pos_tensor (Tensor): positions; each last-axis component is embedded
+            separately. Indexed as 3-D below (`x[:, :, i]`).
+        num_pos_feats (int): embedding size per component. Default: 128
+        temperature (int): The temperature used for scaling
+            the position embedding. Default: 10000.
+        exchange_xy (bool, optional): swap the embeddings of the first two
+            components, so input `[x, y]` gives `[pos(y), pos(x)]`.
+            Defaults: True.
+
+    Returns:
+        Tensor: per-component embeddings concatenated on axis 2.
+    """
+    scale = 2. * math.pi
+    dim_t = 2. * paddle.floor_divide(
+        paddle.arange(num_pos_feats), paddle.to_tensor(2))  # 0, 0, 2, 2, 4, 4, ...
+    dim_t = scale / temperature**(dim_t / num_pos_feats)
+
+    def sine_func(x):
+        x *= dim_t
+        return paddle.stack(
+            (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2)  # interleave sin/cos
+
+    pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)]
+    if exchange_xy:
+        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
+    pos_res = paddle.concat(pos_res, axis=2)
+    return pos_res
+
+
+def mask_to_box_coordinate(mask, normalize=False, format="xyxy",
+                           dtype="float32"):
+    """
+    Compute the bounding boxes around the provided mask.
+    Args:
+        mask (Tensor:bool): [b, c, h, w]
+        normalize (bool): divide the boxes by [w, h, w, h].
+        format (str): "xyxy", or "xywh", which returns the output of
+            bbox_xyxy_to_cxcywh applied to the xyxy boxes.
+        dtype (str): dtype of the coordinate grid and of the all-zero result.
+
+    Returns:
+        bbox (Tensor): [b, c, 4]; all zeros when no mask element is set.
+    """
+    assert mask.ndim == 4
+    assert format in ["xyxy", "xywh"]
+    if mask.sum() == 0:
+        return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype)
+
+    h, w = mask.shape[-2:]
+    rows = paddle.arange(end=h, dtype=dtype)
+    cols = paddle.arange(end=w, dtype=dtype)
+    grid_y, grid_x = paddle.meshgrid(rows, cols)
+
+    def _extent(grid):
+        # +1 puts the upper edge one past the largest set coordinate;
+        # 1e8 keeps unset pixels from winning the min
+        masked = grid * mask
+        upper = masked.flatten(-2).max(-1) + 1
+        lower = paddle.where(mask, masked,
+                             paddle.to_tensor(1e8)).flatten(-2).min(-1)
+        return lower, upper
+    x_min, x_max = _extent(grid_x)
+    y_min, y_max = _extent(grid_y)
+    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)
+    if normalize:
+        out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype)
+    return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox)
+
+
+def varifocal_loss_with_logits(pred_logits, gt_score, label,
+                               normalizer=1.0, alpha=0.75, gamma=2.0):
+    """Varifocal loss from logits: BCE against gt_score, weighted by
+    gt_score * label plus alpha * sigmoid(pred_logits)**gamma * (1 - label);
+    mean over axis 1, then sum, divided by normalizer.
+    """
+    neg_weight = alpha * F.sigmoid(pred_logits).pow(gamma) * (1 - label)
+    bce = F.binary_cross_entropy_with_logits(
+        pred_logits, gt_score, weight=neg_weight + gt_score * label,
+        reduction='none')
+    return bce.mean(1).sum() / normalizer
+
+
+
+
+from ..initializer import linear_init_
+
+class MLP(nn.Layer):
+    """Linear layers with ReLU between them (no activation after the last).
+    Based on https://github.com/facebookresearch/detr/blob/main/models/detr.py
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.LayerList(
+            nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:]))
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for linear in self.layers:
+            linear_init_(linear)
+
+    def forward(self, x):
+        last = self.num_layers - 1
+        for idx, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if idx < last else layer(x)
+        return x
+
diff --git a/rtdetr_paddle/ppdet/optimizer/__init__.py b/rtdetr_paddle/ppdet/optimizer/__init__.py
new file mode 100644
index 0000000..aa690dc
--- /dev/null
+++ b/rtdetr_paddle/ppdet/optimizer/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import optimizer
+from . import ema
+
+from .optimizer import *
+from .ema import *
diff --git a/rtdetr_paddle/ppdet/optimizer/ema.py b/rtdetr_paddle/ppdet/optimizer/ema.py
new file mode 100644
index 0000000..70d006b
--- /dev/null
+++ b/rtdetr_paddle/ppdet/optimizer/ema.py
@@ -0,0 +1,193 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import weakref
+from copy import deepcopy
+
+from .utils import get_bn_running_state_names
+
+__all__ = ['ModelEMA', 'SimpleModelEMA']
+
+
+class ModelEMA(object):
+    """
+    Exponential moving average of a model's state dict.
+
+    Args:
+        model (nn.Layer): Detector of model.
+        decay (float): The decay used for updating ema parameter.
+            Ema's parameter are updated with the formula:
+            `ema_param = decay * ema_param + (1 - decay) * cur_param`.
+            Defaults is 0.9998.
+        ema_decay_type (str): type in ['threshold', 'normal', 'exponential'],
+            'threshold' as default.
+        cycle_epoch (int): The epoch of interval to reset ema_param and
+            step. Defaults is -1, which means not reset. Its function is to
+            add a regular effect to ema, which is set according to experience
+            and is effective when the total training epoch is large.
+        ema_black_list (set|list|tuple, optional): Substrings of weight
+            names that are excluded from EMA calculation. Default: None.
+        ema_filter_no_grad (bool): Also exclude parameters whose
+            `stop_gradient` is set, unless their name is returned by
+            get_bn_running_state_names. Default: False.
+    """
+
+    def __init__(self, model, decay=0.9998, ema_decay_type='threshold',
+                 cycle_epoch=-1, ema_black_list=None,
+                 ema_filter_no_grad=False):
+        self.step = 0
+        self.epoch = 0
+        self.decay = decay
+        # decay used by the latest update(); apply() reads it for bias
+        # correction, so it must exist even before the first update().
+        self._decay = decay
+        self.ema_decay_type = ema_decay_type
+        self.cycle_epoch = cycle_epoch
+        self.ema_black_list = self._match_ema_black_list(
+            model.state_dict().keys(), ema_black_list)
+        bn_states_names = get_bn_running_state_names(model)
+        if ema_filter_no_grad:
+            for n, p in model.named_parameters():
+                if p.stop_gradient and n not in bn_states_names:
+                    self.ema_black_list.add(n)
+
+        # black-listed entries keep the model's tensor; others start at zero
+        self.state_dict = {
+            k: v if k in self.ema_black_list else paddle.zeros_like(v)
+            for k, v in model.state_dict().items()
+        }
+        # weak references let update() read the model without owning it
+        self._model_state = {
+            k: weakref.ref(p)
+            for k, p in model.state_dict().items()
+        }
+
+    def _compute_decay(self, step):
+        """Return the decay that update() uses at the given step count."""
+        if self.ema_decay_type == 'threshold':
+            return min(self.decay, (1 + step) / (10 + step))
+        if self.ema_decay_type == 'exponential':
+            return self.decay * (1 - math.exp(-(step + 1) / 2000))
+        return self.decay
+
+    def reset(self):
+        """Zero the averaged weights and restart the step/epoch counters."""
+        self.step = 0
+        self.epoch = 0
+        for k, v in self.state_dict.items():
+            if k not in self.ema_black_list:
+                self.state_dict[k] = paddle.zeros_like(v)
+
+    def resume(self, state_dict, step=0):
+        """Load saved averages (cast to the stored dtype) and step count."""
+        for k, v in state_dict.items():
+            if k in self.state_dict:
+                if self.state_dict[k].dtype == v.dtype:
+                    self.state_dict[k] = v
+                else:
+                    self.state_dict[k] = v.astype(self.state_dict[k].dtype)
+        self.step = step
+        # restore the decay of the last completed update so apply() works
+        # before the next update() call
+        self._decay = self._compute_decay(max(step - 1, 0))
+
+    def update(self, model=None):
+        """Blend current weights into the averages and advance the step."""
+        decay = self._decay = self._compute_decay(self.step)
+
+        if model is not None:
+            model_dict = model.state_dict()
+        else:
+            model_dict = {k: p() for k, p in self._model_state.items()}
+            assert all(
+                [v is not None for _, v in model_dict.items()]), 'python gc.'
+
+        for k, v in self.state_dict.items():
+            if k not in self.ema_black_list:
+                v = decay * v + (1 - decay) * model_dict[k]
+                v.stop_gradient = True
+                self.state_dict[k] = v
+        self.step += 1
+
+    def apply(self):
+        """Return bias-corrected averages; counts an epoch, may reset()."""
+        if self.step == 0:
+            return self.state_dict
+        state_dict = dict()
+        for k, v in self.state_dict.items():
+            if (k not in self.ema_black_list and
+                    self.ema_decay_type != 'exponential'):
+                v = v / (1 - self._decay**self.step)
+            v.stop_gradient = True
+            state_dict[k] = v
+        self.epoch += 1
+        if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch:
+            self.reset()
+
+        return state_dict
+
+    def _match_ema_black_list(self, weight_name, ema_black_list=None):
+        """Return the names in weight_name containing any black-list key."""
+        keys = ema_black_list or ()
+        return {n for n in weight_name if any(key in n for key in keys)}
+
+
+class SimpleModelEMA(object):
+    """
+    Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
+    Keep a moving average of everything in the model state_dict (parameters and buffers).
+    This is intended to allow functionality like
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+    A smoothed version of the weights is necessary for some training schemes to perform well.
+    This class is sensitive where it is initialized in the sequence of model init,
+    GPU assignment and distributed training wrappers.
+    """
+
+    def __init__(self, model=None, decay=0.9996):
+        """
+        Args:
+            model (nn.Layer): model to apply EMA; a deep copy is kept.
+            decay (float): ema decay rate.
+        """
+        self.model = deepcopy(model)
+        self.decay = decay
+        # defined up front so `step` is readable before resume() is called
+        self.step = 0
+
+    def update(self, model, decay=None):
+        """Move the floating-point EMA entries toward `model`'s values."""
+        decay = self.decay if decay is None else decay
+        with paddle.no_grad():
+            state = {}
+            msd = model.state_dict()
+            for k, v in self.model.state_dict().items():
+                if paddle.is_floating_point(v):
+                    v *= decay
+                    v += (1.0 - decay) * msd[k].detach()
+                state[k] = v
+            self.model.set_state_dict(state)
+
+    def resume(self, state_dict, step=0):
+        """Load floating-point entries from `state_dict` and set `step`."""
+        state = {
+            k: state_dict[k].detach() if paddle.is_floating_point(v) else v
+            for k, v in self.model.state_dict().items()
+        }
+        self.model.set_state_dict(state)
+        self.step = step
diff --git a/rtdetr_paddle/ppdet/optimizer/optimizer.py b/rtdetr_paddle/ppdet/optimizer/optimizer.py
new file mode 100644
index 0000000..37d1cf0
--- /dev/null
+++ b/rtdetr_paddle/ppdet/optimizer/optimizer.py
@@ -0,0 +1,350 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import math
+import paddle
+import paddle.nn as nn
+
+import paddle.optimizer as optimizer
+import paddle.regularizer as regularizer
+
+from ppdet.core.workspace import register, serializable
+import copy
+
+__all__ = ['LearningRate', 'OptimizerBuilder']
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+
+@serializable
+class CosineDecay(object):
+    """
+    Cosine learning rate decay
+
+    Args:
+        max_epochs (int): max epochs for the training process.
+            if you combine cosine decay with warmup, it is recommended that
+            max_iters (max_epochs * step_per_epoch) is much larger than the warmup iters
+        use_warmup (bool): whether to use warmup. Default: True.
+        min_lr_ratio (float): minimum learning rate ratio. Default: 0.
+        last_plateau_epochs (int): use minimum learning rate in
+            the last few epochs. Default: 0.
+    """
+
+    def __init__(self,
+                 max_epochs=1000,
+                 use_warmup=True,
+                 min_lr_ratio=0.,
+                 last_plateau_epochs=0):
+        self.max_epochs = max_epochs
+        self.use_warmup = use_warmup
+        self.min_lr_ratio = min_lr_ratio
+        self.last_plateau_epochs = last_plateau_epochs
+
+    def __call__(self,
+                 base_lr=None,
+                 boundary=None,
+                 value=None,
+                 step_per_epoch=None):  # returns a paddle.optimizer.lr scheduler
+        assert base_lr is not None, "either base LR or values should be provided"
+
+        max_iters = self.max_epochs * int(step_per_epoch)
+        last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch)
+        min_lr = base_lr * self.min_lr_ratio
+        if boundary is not None and value is not None and self.use_warmup:
+            # use warmup: the given boundary/value lists are extended in place
+            warmup_iters = len(boundary)
+            for i in range(int(boundary[-1]), max_iters):
+                boundary.append(i)
+                if i < max_iters - last_plateau_iters:
+                    decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos(
+                        (i - warmup_iters) * math.pi /
+                        (max_iters - warmup_iters - last_plateau_iters)) + 1)
+                    value.append(decayed_lr)
+                else:
+                    value.append(min_lr)
+            return optimizer.lr.PiecewiseDecay(boundary, value)
+        elif last_plateau_iters > 0:
+            # warmup not used, but `last_plateau_epochs` > 0: build the table from scratch
+            boundary = []
+            value = []
+            for i in range(max_iters):
+                if i < max_iters - last_plateau_iters:
+                    decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos(
+                        i * math.pi / (max_iters - last_plateau_iters)) + 1)
+                    value.append(decayed_lr)
+                else:
+                    value.append(min_lr)
+                if i > 0:
+                    boundary.append(i)
+            return optimizer.lr.PiecewiseDecay(boundary, value)
+
+        return optimizer.lr.CosineAnnealingDecay(
+            base_lr, T_max=max_iters, eta_min=min_lr)
+
+
+@serializable
+class PiecewiseDecay(object):
+    """
+    Multi step learning rate decay
+
+    Args:
+        gamma (float | list): decay factor(s); a scalar g is expanded to
+            [g, g / 10, g / 100, ...], one entry per milestone
+        milestones (list): epochs at which to decay learning rate
+        values (list | None): explicit learning rates; when given, `gamma`
+            is not used and len(values) must be len(milestones) + 1
+        use_warmup (bool): whether to extend the boundary list passed in
+    """
+
+    def __init__(self,
+                 gamma=[0.1, 0.01],
+                 milestones=[8, 11],
+                 values=None,
+                 use_warmup=True):
+        super(PiecewiseDecay, self).__init__()
+        if type(gamma) is list:
+            self.gamma = gamma
+        else:
+            self.gamma = [gamma / 10**i for i in range(len(milestones))]
+        self.milestones = milestones
+        self.values = values
+        self.use_warmup = use_warmup
+
+    def __call__(self,
+                 base_lr=None,
+                 boundary=None,
+                 value=None,
+                 step_per_epoch=None):
+        if boundary is None or not self.use_warmup:
+            # do not use LinearWarmup: lr is base_lr during step[0, boundary[0]]
+            boundary = [int(step_per_epoch) * i for i in self.milestones]
+            value = [base_lr]
+        else:
+            boundary.extend([int(step_per_epoch) * i for i in self.milestones])
+
+        if self.values is not None:
+            # self.values was set directly in the config
+            assert len(self.milestones) + 1 == len(self.values)
+            return optimizer.lr.PiecewiseDecay(boundary, self.values)
+
+        value = [base_lr] if value is None else value
+        value.extend(base_lr * factor for factor in self.gamma)
+
+        return optimizer.lr.PiecewiseDecay(boundary, value)
+
+
+@serializable
+class LinearWarmup(object):
+    """
+    Warm up learning rate linearly
+
+    Args:
+        steps (int): warm up steps
+        start_factor (float): initial learning rate factor
+        epochs (int|None): use epochs as warm up steps, the priority
+            of `epochs` is higher than `steps`. Default: None.
+    """
+
+    def __init__(self, steps=500, start_factor=1. / 3, epochs=None):
+        super(LinearWarmup, self).__init__()
+        self.steps = steps
+        self.start_factor = start_factor
+        self.epochs = epochs
+
+    def __call__(self, base_lr, step_per_epoch):
+        """Return (boundary, value): iters 1..N and the N + 1 piecewise lrs."""
+        warmup_steps = self.epochs * step_per_epoch \
+            if self.epochs is not None else self.steps
+        # range() needs an int (fractional `epochs` gives a float); keep >= 1
+        warmup_steps = max(int(warmup_steps), 1)
+        boundary = list(range(1, warmup_steps + 1))
+        value = []
+        for i in range(warmup_steps + 1):
+            # factor goes linearly from start_factor (i == 0) to 1 (last step)
+            alpha = i / warmup_steps
+            factor = self.start_factor * (1 - alpha) + alpha
+            lr = base_lr * factor
+            value.append(lr)
+        return boundary, value
+
+
+@serializable
+class ExpWarmup(object):
+    """
+    Warm up learning rate in exponential mode
+    Args:
+        steps (int): warm up steps.
+        epochs (int|None): use epochs as warm up steps, the priority
+            of `epochs` is higher than `steps`. Default: None.
+        power (int): Exponential coefficient. Default: 2.
+    """
+
+    def __init__(self, steps=1000, epochs=None, power=2):
+        super(ExpWarmup, self).__init__()
+        self.steps = steps
+        self.epochs = epochs
+        self.power = power
+
+    def __call__(self, base_lr, step_per_epoch):
+        """Return (boundary, value): iters 1..N and the N + 1 piecewise lrs."""
+        warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps
+        # range() needs an int (fractional `epochs` gives a float); keep >= 1
+        warmup_steps = max(int(warmup_steps), 1)
+        boundary = list(range(1, warmup_steps + 1))
+        value = [
+            base_lr * (i / float(warmup_steps))**self.power
+            for i in range(warmup_steps + 1)
+        ]
+        return boundary, value
+
+
+@register
+class LearningRate(object):
+    """
+    Learning Rate configuration
+
+    Args:
+        base_lr (float): base learning rate
+        schedulers (list): learning rate schedulers; index 0 is the decay
+            schedule, index 1 the warmup used when the decay enables it
+    """
+    __category__ = 'optim'
+
+    def __init__(self,
+                 base_lr=0.01,
+                 schedulers=[PiecewiseDecay(), LinearWarmup()]):
+        super(LearningRate, self).__init__()
+        self.base_lr = base_lr
+        self.schedulers = []
+
+        this_module = sys.modules[__name__]
+        for sched in copy.deepcopy(schedulers):
+            if isinstance(sched, dict):
+                # dict entry: "name" selects a scheduler class of this
+                # module, the remaining keys are its constructor arguments
+                sched_cls = getattr(this_module, sched.pop("name"))
+                sched = sched_cls(**sched)
+            self.schedulers.append(sched)
+
+    def __call__(self, step_per_epoch):
+        assert len(self.schedulers) >= 1
+        decay = self.schedulers[0]
+        if not decay.use_warmup:
+            return decay(base_lr=self.base_lr,
+                         step_per_epoch=step_per_epoch)
+
+        # TODO: split warmup & decay
+        # the warmup table is built first, then handed to the decay
+        # schedule, which continues it
+        warmup = self.schedulers[1]
+        boundary, value = warmup(self.base_lr, step_per_epoch)
+        decay_lr = decay(self.base_lr, boundary, value, step_per_epoch)
+        return decay_lr
+
+
+@register
+class OptimizerBuilder():
+    """
+    Build optimizer handles
+    Args:
+        clip_grad_by_norm (float|None): threshold for global-norm clipping
+        clip_grad_by_value (float|None): clip gradients to [-|v|, |v|];
+            only consulted when `clip_grad_by_norm` is None
+        regularizer (dict|None): `type` (e.g. 'L2' -> regularizer.L2Decay)
+            and `factor` of the weight decay
+        optimizer (dict): `type` names a class in paddle.optimizer, the
+            other keys are passed to it as keyword arguments
+    """
+    __category__ = 'optim'
+
+    def __init__(self,
+                 clip_grad_by_norm=None,
+                 clip_grad_by_value=None,
+                 regularizer={'type': 'L2',
+                              'factor': .0001},
+                 optimizer={'type': 'Momentum',
+                            'momentum': .9}):
+        self.clip_grad_by_norm = clip_grad_by_norm
+        self.clip_grad_by_value = clip_grad_by_value
+        self.regularizer = regularizer
+        self.optimizer = optimizer
+
+    def __call__(self, learning_rate, model=None):
+        # gradient clipping: by-norm takes precedence over by-value
+        grad_clip = None
+        if self.clip_grad_by_norm is not None:
+            grad_clip = nn.ClipGradByGlobalNorm(
+                clip_norm=self.clip_grad_by_norm)
+        elif self.clip_grad_by_value is not None:
+            bound = abs(self.clip_grad_by_value)
+            grad_clip = nn.ClipGradByValue(min=-bound, max=bound)
+
+        regularization = None
+        if self.regularizer and self.regularizer != 'None':
+            reg_type = self.regularizer['type'] + 'Decay'
+            reg_factor = self.regularizer['factor']
+            regularization = getattr(regularizer, reg_type)(reg_factor)
+
+        optim_args = self.optimizer.copy()
+        optim_type = optim_args.pop('type')
+        if optim_type != 'AdamW':
+            # every optimizer except AdamW gets the regularizer as weight_decay
+            optim_args['weight_decay'] = regularization
+
+        op = getattr(optimizer, optim_type)
+
+        if 'param_groups' not in optim_args:
+            params = [p for p in model.parameters() if p.trainable is True]
+        else:
+            assert isinstance(optim_args['param_groups'], list), ''
+            param_groups = optim_args.pop('param_groups')
+            params, visited = [], set()
+            for group in param_groups:
+                assert isinstance(group,
+                                  dict) and 'params' in group and isinstance(
+                                      group['params'], list), ''
+                # trainable parameters whose name contains any of the group's keys
+                matched = {
+                    name: param
+                    for name, param in model.named_parameters()
+                    if any(key in name for key in group['params']) and
+                    param.trainable is True
+                }
+                resolved = group.copy()
+                resolved['params'] = list(matched.values())
+                params.append(resolved)
+                visited.update(matched)
+
+            # trainable parameters not claimed by any group
+            ext_params = [
+                param for name, param in model.named_parameters()
+                if name not in visited and param.trainable is True
+            ]
+            num_model_params = len(model.parameters())
+            if len(ext_params) < num_model_params:
+                params.append({'params': ext_params})
+            elif len(ext_params) > num_model_params:
+                raise RuntimeError
+
+        return op(learning_rate=learning_rate,
+                  parameters=params,
+                  grad_clip=grad_clip,
+                  **optim_args)
diff --git a/rtdetr_paddle/ppdet/optimizer/utils.py b/rtdetr_paddle/ppdet/optimizer/utils.py
new file mode 100644
index 0000000..6fc6dbd
--- /dev/null
+++ b/rtdetr_paddle/ppdet/optimizer/utils.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.nn as nn
+
+from typing import List
+
+
+def get_bn_running_state_names(model: nn.Layer) -> List[str]:
+    """Collect `<sublayer>._mean` / `<sublayer>._variance` names of all BN sublayers.
+    """
+    bn_types = (nn.BatchNorm2D, nn.SyncBatchNorm)
+    state_names = []
+    for prefix, layer in model.named_sublayers():
+        if not isinstance(layer, bn_types):
+            continue
+        assert hasattr(layer, '_mean'), f'assert {layer} has _mean'
+        assert hasattr(layer, '_variance'), f'assert {layer} has _variance'
+        state_names += [f'{prefix}._mean', f'{prefix}._variance']
+
+    return state_names
diff --git a/rtdetr_paddle/ppdet/utils/__init__.py b/rtdetr_paddle/ppdet/utils/__init__.py
new file mode 100644
index 0000000..d0c32e2
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/rtdetr_paddle/ppdet/utils/cam_utils.py b/rtdetr_paddle/ppdet/utils/cam_utils.py
new file mode 100644
index 0000000..d2f7a47
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/cam_utils.py
@@ -0,0 +1,343 @@
+import numpy as np
+import cv2
+import os
+import sys
+import glob
+from ppdet.utils.logger import setup_logger
+import copy
+logger = setup_logger('ppdet_cam')
+
+import paddle
+from ppdet.engine import Trainer
+
+
+def get_test_images(infer_dir, infer_img):
+    """
+    Get image path list in TEST mode (`infer_img` wins over `infer_dir`)
+    """
+    assert infer_img is not None or infer_dir is not None, \
+        "--infer_img or --infer_dir should be set"
+    assert infer_img is None or os.path.isfile(infer_img), \
+            "{} is not a file".format(infer_img)
+    assert infer_dir is None or os.path.isdir(infer_dir), \
+            "{} is not a directory".format(infer_dir)
+
+    # infer_img has a higher priority
+    if infer_img and os.path.isfile(infer_img):
+        return [infer_img]
+
+    infer_dir = os.path.abspath(infer_dir)
+    assert os.path.isdir(infer_dir), \
+        "infer_dir {} is not a directory".format(infer_dir)
+    lower_exts = ['jpg', 'jpeg', 'png', 'bmp']
+    patterns = lower_exts + [ext.upper() for ext in lower_exts]
+    # set: the same path may match both the lower- and upper-case pattern
+    found = set()
+    for ext in patterns:
+        found.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
+    images = list(found)
+
+    assert len(images) > 0, "no image found in {}".format(infer_dir)
+    logger.info("Found {} inference images in total.".format(len(images)))
+    return images
+
+
+def compute_ious(boxes1, boxes2):
+    """Compute the pairwise IoU matrix of two sets of boxes.
+
+        Args:
+            boxes1 (numpy ndarray with shape N,4): boxes as (xmin,ymin,xmax,ymax)
+            boxes2 (numpy ndarray with shape M,4): boxes as (xmin,ymin,xmax,ymax)
+        Returns:
+            IoU matrix with shape (N,M); entry [i, j] is the IoU between
+            boxes1[i] and boxes2[j], clipped to [0, 1].
+    """
+
+    def _areas(boxes):
+        # degenerate boxes (max < min) count as zero area
+        wh = np.maximum(0.0, boxes[:, 2:] - boxes[:, :2])
+        return wh[:, 0] * wh[:, 1]
+
+    # broadcast (N,1,2) against (M,2) to get (N,M,2) corner arrays
+    top_left = np.maximum(boxes1[:, None, :2], boxes2[:, :2])
+    bottom_right = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])
+    overlap_wh = np.maximum(0.0, bottom_right - top_left)
+    overlap = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]  # (N,M)
+
+    # lower bound on the union avoids division by zero
+    union = np.maximum(
+        _areas(boxes1)[:, None] + _areas(boxes2) - overlap, 1e-8)
+    ious = np.clip(overlap / union, 0.0, 1.0)
+    return ious
+
+
+def grad_cam(feat, grad):
+    """
+    Weight each feature channel by its spatially averaged gradient.
+
+    Args:
+        feat:  CxHxW
+        grad:  CxHxW
+    Returns:
+           cam: HxW, the negated channel mean clamped at zero
+    """
+    channel_weights = grad.mean((1, 2), keepdims=True)
+    weighted_mean = (feat * channel_weights).mean(axis=0)
+    return np.maximum(-weighted_mean, 0)
+
+
+def resize_cam(explanation, resize_shape) -> np.ndarray:
+    """Normalize a 2D CAM to [0, 1], resize it and render it as an RGB heatmap.
+
+    Args:
+        explanation: 2D array of CAM values.
+        resize_shape: (width, height) target size passed to cv2.resize.
+
+    Returns:
+        uint8 RGB heatmap produced with the JET colormap.
+    """
+    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
+                                        f"Currently support 2D explanation results for visualization. " \
+                                        "Reduce higher dimensions to 2D for visualization."
+
+    lo, hi = explanation.min(), explanation.max()
+    # a constant map would divide by zero and yield NaNs; render it as all zeros
+    explanation = (explanation - lo) / (hi - lo) if hi > lo else np.zeros_like(explanation, dtype=np.float32)
+    explanation = cv2.resize(explanation, resize_shape)
+    explanation = np.uint8(255 * explanation)
+    explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET)
+    explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB)
+
+    return explanation
+
+
+class BBoxCAM:
+    def __init__(self, FLAGS, cfg):
+        """Build the test-mode trainer, hook the target layer, create FLAGS.cam_out."""
+        self.FLAGS = FLAGS
+        self.cfg = cfg
+        # build model
+        self.trainer = self.build_trainer(cfg)
+        # num_class
+        self.num_class = cfg.num_classes
+        # set hook for extraction of featuremaps and grads
+        self.set_hook(cfg)
+        # In these networks, the bbox array shape before nms contain num_class,
+        # the nms_keep_idx of the bbox need to divide the num_class
+        self.nms_idx_need_divid_numclass_arch = ['FasterRCNN', 'MaskRCNN', 'CascadeRCNN']
+
+        # cam image output_dir; an existing directory is fine (exist_ok), any
+        # other failure is reported accurately instead of being silently
+        # mislabelled as "already exists"
+        try:
+            os.makedirs(FLAGS.cam_out, exist_ok=True)
+        except OSError as e:
+            logger.warning('Failed to create cam_out dir {}: {}'.format(FLAGS.cam_out, e))
+
+    def build_trainer(self, cfg):
+        """Create a test-mode Trainer with weights loaded and its nms set to return indices."""
+        trainer = Trainer(cfg, mode='test')
+        trainer.load_weights(cfg.weights)
+        model = trainer.model
+
+        # make the model provide extra_data from before nms
+        model.use_extra_data = True
+        # pick the nms module that has to record the bbox index before nms
+        arch = cfg.architecture
+        if arch in ('FasterRCNN', 'MaskRCNN'):
+            nms = model.bbox_post_process.nms
+        elif arch in ('YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead'):
+            if model.post_process is not None:
+                # anchor based YOLOs: YOLOv3,PP-YOLO
+                nms = model.post_process.nms
+            else:
+                # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
+                nms = model.yolo_head.nms
+        elif arch in ('BlazeFace', 'SSD'):
+            nms = model.post_process.nms
+        elif arch == 'RetinaNet':
+            nms = model.head.nms
+        else:
+            print(arch + ' is not supported for cam temporarily!')
+            sys.exit()
+        # Todo: Unify the head/post_process name in each model
+        nms.return_index = True
+
+        return trainer
+
+    def set_hook(self, cfg):
+        """Register a forward-post hook storing the target layer's output in self.target_feats."""
+        self.target_feats = {}
+        self.target_layer_name = cfg.target_feature_layer_name
+        # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor
+
+        def hook(layer, input, output):
+            self.target_feats[layer._layer_name_for_hook] = output
+
+        # NOTE(review): exec evaluates the config-provided layer path as Python
+        # code; only use configs from trusted sources.
+        try:
+            exec('self.trainer.'+self.target_layer_name+'._layer_name_for_hook = self.target_layer_name')
+            exec('self.trainer.'+self.target_layer_name+'.register_forward_post_hook(hook)')
+        except Exception:  # not a bare except: let Ctrl-C / SystemExit propagate
+            print("Error! "
+                  "The target_layer_name--"+self.target_layer_name+" is not in model! "
+                  "Please check the spelling and "
+                  "the network's architecture!")
+            sys.exit()
+
+    def get_bboxes(self):
+        """Run inference on the images selected by FLAGS; return the first prediction result."""
+        flags = self.FLAGS
+        image_paths = get_test_images(flags.infer_dir, flags.infer_img)
+
+        predictions = self.trainer.predict(
+            image_paths,
+            draw_threshold=flags.draw_threshold,
+            output_dir=flags.output_dir,
+            save_results=flags.save_results,
+            visualize=False)
+        return predictions[0]
+
+    def get_bboxes_cams(self):  # saves one <index>.jpg Grad-CAM overlay per kept bbox into FLAGS.cam_out
+        # Get the bboxes prediction (result after nms) of the input
+        inference_result = self.get_bboxes()
+
+        # read input image
+        # Todo: Support folder multi-images process
+        from PIL import Image
+        img = np.array(Image.open(self.cfg.infer_img))  # NOTE(review): read from cfg, get_bboxes uses FLAGS.infer_img - confirm they agree
+
+        # data for calculating bbox grad_cam
+        extra_data = inference_result['extra_data']
+        """
+        Example of Faster_RCNN based architecture:
+            extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
+                       'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1]
+                      }
+        Example of YOLOv3 based architecture:
+            extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400]
+                       'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1]
+                      }
+        """
+
+        # array index of the predicted bbox before nms
+        if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch:
+            # some network's bbox array shape before nms may be like [num_of_bboxes_before_nms, num_classes, 4],
+            # we need to divide by num_classes to get the before_nms_index;
+            # currently, this only includes the rcnn architectures (fasterrcnn, maskrcnn, cascadercnn);
+            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy(
+            ) // self.num_class  # num_class
+        else :
+            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()
+
+        # Calculate and visualize the heatmap of each predicted bbox
+        for index, target_bbox in enumerate(inference_result['bbox']):
+            # target_bbox: [cls, score, x1, y1, x2, y2]
+            # filter bboxes with low predicted scores
+            if target_bbox[1] < self.FLAGS.draw_threshold:
+                continue
+
+            target_bbox_before_nms = int(before_nms_indexes[index])
+
+            if len(extra_data['scores'].shape)==2:
+                score_out = extra_data['scores'][target_bbox_before_nms]
+            else:
+                score_out = extra_data['scores'][0, :, target_bbox_before_nms]
+            """
+            There are two kinds array shape of bbox score output :
+                1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
+                2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000]
+            """
+
+
+            # construct one_hot label and do backward to get the gradients
+            predicted_label = paddle.argmax(score_out)
+            label_onehot = paddle.nn.functional.one_hot(
+                predicted_label, num_classes=len(score_out))
+            label_onehot = label_onehot.squeeze()
+            target = paddle.sum(score_out * label_onehot)
+            target.backward(retain_graph=True)  # presumably retained so later bboxes can run backward again
+
+
+            if 'backbone' in self.target_layer_name or \
+                    'neck' in self.target_layer_name: # backbone/neck level feature
+                if isinstance(self.target_feats[self.target_layer_name], list):
+                    # when the featuremap consists of multiple scales,
+                    # take the featuremap of the last scale
+                    # Todo: fuse the cam result from multiscale featuremaps
+                    if self.target_feats[self.target_layer_name][
+                            -1].shape[-1]==1:
+                        """
+                        if the last level featuremap is 1x1 size,
+                        we take the second last one
+                        """
+                        cam_grad = self.target_feats[self.target_layer_name][
+                            -2].grad.squeeze().cpu().numpy()
+                        cam_feat = self.target_feats[self.target_layer_name][
+                            -2].squeeze().cpu().numpy()
+                    else:
+                        cam_grad = self.target_feats[self.target_layer_name][
+                            -1].grad.squeeze().cpu().numpy()
+                        cam_feat = self.target_feats[self.target_layer_name][
+                            -1].squeeze().cpu().numpy()
+                else:
+                    cam_grad = self.target_feats[
+                        self.target_layer_name].grad.squeeze().cpu().numpy()
+                    cam_feat = self.target_feats[
+                        self.target_layer_name].squeeze().cpu().numpy()
+            else: # roi level feature
+                cam_grad = self.target_feats[
+                    self.target_layer_name].grad.squeeze().cpu().numpy()[target_bbox_before_nms]
+                cam_feat = self.target_feats[
+                    self.target_layer_name].squeeze().cpu().numpy()[target_bbox_before_nms]
+
+            # grad_cam:
+            exp = grad_cam(cam_feat, cam_grad)
+
+            if 'backbone' in self.target_layer_name or \
+                    'neck' in self.target_layer_name:
+                """
+                when use backbone/neck featuremap, 
+                we first do the cam on whole image, 
+                and then set the area outside the predic bbox to 0
+                """
+                # resize the cam image to the input image size
+                resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))
+                mask = np.zeros((img.shape[0], img.shape[1], 3))
+                mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[2]):
+                     int(target_bbox[4]), :] = 1
+                resized_exp = resized_exp * mask
+                # add the bbox cam back to the input image
+                overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)
+            elif 'roi' in self.target_layer_name:
+                # get the bbox part of the image
+                bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(target_bbox[5]),
+                                         int(target_bbox[2]):int(target_bbox[4]), :])
+                # resize the cam image to the bbox size
+                resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0]))
+                # add the bbox cam back to the bbox image
+                bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6)
+                # put the bbox_cam image to the original image
+                overlay_vis = copy.deepcopy(img)
+                overlay_vis[int(target_bbox[3]):int(target_bbox[5]),
+                    int(target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis
+            else:
+                print(
+                    'Only supported cam for  backbone/neck feature and roi feature,  the others are not supported temporarily!'
+                )
+                sys.exit()
+
+            # put the bbox rectangle on image
+            cv2.rectangle(
+                overlay_vis, (int(target_bbox[2]), int(target_bbox[3])),
+                (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2)
+
+            # save visualization result
+            cam_image = Image.fromarray(overlay_vis)
+            cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg')
+
+            # clear gradients after each bbox grad_cam
+            target.clear_gradient()
+            for n, v in self.trainer.model.named_sublayers():
+                v.clear_gradients()
diff --git a/rtdetr_paddle/ppdet/utils/check.py b/rtdetr_paddle/ppdet/utils/check.py
new file mode 100644
index 0000000..7690ade
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/check.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import paddle
+import six
+import paddle.version as paddle_version
+
+from .logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = [
+    'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version',
+    'check_config'
+]
+
+
+def check_mlu(use_mlu):
+    """
+    Check that the installed paddlepaddle build can honour ``use_mlu``.
+
+    Logs an error and exits the process with status 1 when ``use_mlu`` is
+    truthy but ``paddle.is_compiled_with_mlu()`` reports no MLU support.
+
+    Args:
+        use_mlu (bool): value of the ``use_mlu`` config option.
+    """
+    err = "Config use_mlu cannot be set as true while you are " \
+          "using paddlepaddle cpu/gpu/xpu/npu version ! \nPlease try: \n" \
+          "\t1. Install paddlepaddle-mlu to run model on MLU \n" \
+          "\t2. Set use_mlu as false in config file to run " \
+          "model on CPU/GPU/XPU/NPU"
+
+    try:
+        if use_mlu and not paddle.is_compiled_with_mlu():
+            logger.error(err)
+            sys.exit(1)
+    except Exception as e:
+        # Best effort: the probe itself may fail (for instance when the
+        # paddle build lacks this API). Keep going, but leave a trace instead
+        # of swallowing the failure silently. SystemExit is not an Exception
+        # subclass, so the exit above is never intercepted here.
+        logger.warning(
+            "Failed to verify MLU support, skip the use_mlu check: {}".format(
+                e))
+
+
+def check_npu(use_npu):
+    """
+    Check that an NPU custom device is available when ``use_npu`` is set.
+
+    Logs an error and exits the process with status 1 when ``use_npu`` is
+    truthy but ``'npu'`` is not listed by
+    ``paddle.device.get_all_custom_device_type()``.
+
+    Args:
+        use_npu (bool): value of the ``use_npu`` config option.
+    """
+    err = "Config use_npu cannot be set as true while you are " \
+          "using paddlepaddle version without paddle-custom-npu " \
+          "installed! \nPlease try: \n" \
+          "\t1. Install paddle-custom-npu to run model on NPU \n" \
+          "\t2. Set use_npu as false in config file to run " \
+          "model on other devices supported."
+
+    try:
+        if use_npu and 'npu' not in paddle.device.get_all_custom_device_type(
+        ):
+            logger.error(err)
+            sys.exit(1)
+    except Exception as e:
+        # Best effort: the probe itself may fail (for instance when the
+        # paddle build lacks this API). Keep going, but leave a trace instead
+        # of swallowing the failure silently. SystemExit is not an Exception
+        # subclass, so the exit above is never intercepted here.
+        logger.warning(
+            "Failed to verify NPU support, skip the use_npu check: {}".format(
+                e))
+
+
+def check_xpu(use_xpu):
+    """
+    Check that the installed paddlepaddle build can honour ``use_xpu``.
+
+    Logs an error and exits the process with status 1 when ``use_xpu`` is
+    truthy but ``paddle.is_compiled_with_xpu()`` reports no XPU support.
+
+    Args:
+        use_xpu (bool): value of the ``use_xpu`` config option.
+    """
+    err = "Config use_xpu cannot be set as true while you are " \
+          "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \
+          "\t1. Install paddlepaddle-xpu to run model on XPU \n" \
+          "\t2. Set use_xpu as false in config file to run " \
+          "model on CPU/GPU/NPU"
+
+    try:
+        if use_xpu and not paddle.is_compiled_with_xpu():
+            logger.error(err)
+            sys.exit(1)
+    except Exception as e:
+        # Best effort: the probe itself may fail (for instance when the
+        # paddle build lacks this API). Keep going, but leave a trace instead
+        # of swallowing the failure silently. SystemExit is not an Exception
+        # subclass, so the exit above is never intercepted here.
+        logger.warning(
+            "Failed to verify XPU support, skip the use_xpu check: {}".format(
+                e))
+
+
+def check_gpu(use_gpu):
+    """
+    Check that the installed paddlepaddle build can honour ``use_gpu``.
+
+    Logs an error and exits the process with status 1 when ``use_gpu`` is
+    truthy but ``paddle.is_compiled_with_cuda()`` reports no CUDA support.
+
+    Args:
+        use_gpu (bool): value of the ``use_gpu`` config option.
+    """
+    err = "Config use_gpu cannot be set as true while you are " \
+          "using paddlepaddle cpu version ! \nPlease try: \n" \
+          "\t1. Install paddlepaddle-gpu to run model on GPU \n" \
+          "\t2. Set use_gpu as false in config file to run " \
+          "model on CPU"
+
+    try:
+        if use_gpu and not paddle.is_compiled_with_cuda():
+            logger.error(err)
+            sys.exit(1)
+    except Exception as e:
+        # Best effort: the probe itself may fail (for instance when the
+        # paddle build lacks this API). Keep going, but leave a trace instead
+        # of swallowing the failure silently. SystemExit is not an Exception
+        # subclass, so the exit above is never intercepted here.
+        logger.warning(
+            "Failed to verify GPU support, skip the use_gpu check: {}".format(
+                e))
+
+
+def check_version(version='2.2'):
+    """
+    Raise when the installed paddlepaddle is older than ``version``.
+
+    The installed ``major.minor.patch.rc`` fields are compared, component by
+    component, against the dot-separated ``version``; only as many components
+    as ``version`` provides are looked at. Components are compared as
+    integers, so that e.g. an installed ``2.10`` satisfies a required ``2.2``
+    (a plain string comparison would order ``'10'`` before ``'2'``).
+
+    Args:
+        version (str): minimum required version, e.g. ``'2.2'``.
+
+    Raises:
+        Exception: if the installed version is lower than ``version``.
+    """
+    err = "PaddlePaddle version {} or higher is required, " \
+          "or a suitable develop version is satisfied as well. \n" \
+          "Please make sure the version is good with your code.".format(version)
+
+    version_installed = [
+        paddle_version.major, paddle_version.minor, paddle_version.patch,
+        paddle_version.rc
+    ]
+
+    # An all-zero version is accepted unconditionally
+    # (presumably this is how a develop build reports itself).
+    if version_installed == ['0', '0', '0', '0']:
+        return
+
+    # zip() stops at the shorter sequence, i.e. only the components that the
+    # requirement actually specifies are compared.
+    for installed, required in zip(version_installed, version.split('.')):
+        try:
+            installed, required = int(installed), int(required)
+        except (TypeError, ValueError):
+            # Non-numeric component: fall back to comparing the raw text.
+            installed, required = str(installed), str(required)
+        if installed > required:
+            return
+        if installed < required:
+            raise Exception(err)
+
+
+def check_config(cfg):
+    """
+    Validate the loaded configuration and fill in defaults.
+
+    The keys ``architecture`` and ``num_classes`` are mandatory: when one of
+    them is absent an error is logged and the process exits with status 1.
+    ``log_iter`` is optional and is set to 20 when not present.
+
+    Args:
+        cfg: config object supporting ``in`` tests and attribute assignment.
+
+    Returns:
+        The same ``cfg`` object.
+    """
+    required_keys = ('architecture', 'num_classes')
+    missing_msg = "'{}' not specified in config file. Please set it in config file."
+    try:
+        for name in required_keys:
+            if name in cfg:
+                continue
+            logger.error(missing_msg.format(name))
+            sys.exit(1)
+    except Exception:
+        # Failures of the check itself are ignored; SystemExit still
+        # propagates because it does not derive from Exception.
+        pass
+
+    if 'log_iter' not in cfg:
+        cfg.log_iter = 20
+    return cfg
diff --git a/rtdetr_paddle/ppdet/utils/checkpoint.py b/rtdetr_paddle/ppdet/utils/checkpoint.py
new file mode 100644
index 0000000..f3dafd4
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/checkpoint.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os
+import numpy as np
+import paddle
+import paddle.nn as nn
+from .download import get_weights_path
+
+from .logger import setup_logger
+logger = setup_logger(__name__)
+
+
+def is_url(path):
+    """
+    Tell whether ``path`` is a remote location rather than a local file.
+
+    Args:
+        path (string): candidate string.
+
+    Returns:
+        bool: True if ``path`` starts with ``http://``, ``https://`` or
+        ``ppdet://``.
+    """
+    url_schemes = ('http://', 'https://', 'ppdet://')
+    return path.startswith(url_schemes)
+
+
+def _strip_postfix(path):
+    """
+    Drop a checkpoint extension from ``path``.
+
+    Only an empty extension or one of ``.pdparams`` / ``.pdopt`` /
+    ``.pdmodel`` is accepted; anything else fails the assertion.
+    """
+    stem, postfix = os.path.splitext(path)
+    known_postfixes = ('', '.pdparams', '.pdopt', '.pdmodel')
+    assert postfix in known_postfixes, \
+            "Unknown postfix {} from weights".format(postfix)
+    return stem
+
+
+def load_weight(model, weight, optimizer=None, ema=None, exchange=True):
+    """
+    Restore model weights, and optionally optimizer / EMA state, from disk.
+
+    Args:
+        model: network to load into. If it exposes both ``modelTeacher`` and
+            ``modelStudent`` the two sub-models are loaded separately through
+            ``match_state_dict``; otherwise every key of ``model.state_dict()``
+            must be present in the checkpoint.
+        weight (str): checkpoint path or URL; a known extension is stripped
+            and ``.pdparams`` / ``.pdema`` / ``.pdopt`` are appended as needed.
+        optimizer: optional optimizer whose state is restored from ``.pdopt``
+            when that file exists.
+        ema: optional EMA helper; used only when a ``.pdema`` file exists.
+        exchange (bool): when True the ``.pdparams`` file is treated as the
+            EMA state and ``.pdema`` as the model weights (the swapped layout
+            written by ``save_model``); when False the files are read as named.
+
+    Returns:
+        int: ``last_epoch`` stored in the optimizer state, or 0.
+
+    Raises:
+        ValueError: if the ``.pdparams`` file does not exist.
+        AssertionError: if some model keys are missing from the checkpoint.
+    """
+    if is_url(weight):
+        weight = get_weights_path(weight)
+
+    path = _strip_postfix(weight)
+    pdparam_path = path + '.pdparams'
+    if not os.path.exists(pdparam_path):
+        raise ValueError("Model pretrain path {} does not "
+                         "exists.".format(pdparam_path))
+
+    if ema is not None and os.path.exists(path + '.pdema'):
+        if exchange:
+            # Exchange model and ema_model to load
+            logger.info('Exchange model and ema_model to load:')
+            ema_state_dict = paddle.load(pdparam_path)
+            logger.info('Loading ema_model weights from {}'.format(path +
+                                                                   '.pdparams'))
+            param_state_dict = paddle.load(path + '.pdema')
+            logger.info('Loading model weights from {}'.format(path + '.pdema'))
+        else:
+            ema_state_dict = paddle.load(path + '.pdema')
+            logger.info('Loading ema_model weights from {}'.format(path +
+                                                                   '.pdema'))
+            param_state_dict = paddle.load(pdparam_path)
+            logger.info('Loading model weights from {}'.format(path +
+                                                               '.pdparams'))
+    else:
+        ema_state_dict = None
+        param_state_dict = paddle.load(pdparam_path)
+
+    if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'):
+        print('Loading pretrain weights for Teacher-Student framework.')
+        print('Loading pretrain weights for Student model.')
+        student_model_dict = model.modelStudent.state_dict()
+        student_param_state_dict = match_state_dict(
+            student_model_dict, param_state_dict, mode='student')
+        model.modelStudent.set_dict(student_param_state_dict)
+        print('Loading pretrain weights for Teacher model.')
+        teacher_model_dict = model.modelTeacher.state_dict()
+
+        teacher_param_state_dict = match_state_dict(
+            teacher_model_dict, param_state_dict, mode='teacher')
+        model.modelTeacher.set_dict(teacher_param_state_dict)
+
+    else:
+        model_dict = model.state_dict()
+        model_weight = {}
+        incorrect_keys = 0
+        for key in model_dict.keys():
+            if key in param_state_dict:
+                model_weight[key] = param_state_dict[key]
+            else:
+                logger.info('Unmatched key: {}'.format(key))
+                incorrect_keys += 1
+        # Adjacent literals instead of a backslash continuation inside the
+        # string, which used to embed the source indentation in the message.
+        assert incorrect_keys == 0, (
+            "Load weight {} incorrectly, "
+            "{} keys unmatched, please check again.".format(weight,
+                                                            incorrect_keys))
+        model.set_dict(model_weight)
+        # Report completion only once the weights are actually in the model.
+        logger.info('Finish resuming model weights: {}'.format(pdparam_path))
+
+    last_epoch = 0
+    if optimizer is not None and os.path.exists(path + '.pdopt'):
+        optim_state_dict = paddle.load(path + '.pdopt')
+        # to solve resume bug, will it be fixed in paddle 2.0
+        # Fill entries the checkpoint lacks from the live optimizer state;
+        # the state dict is built once rather than on every iteration.
+        current_optim_state = optimizer.state_dict()
+        for key, value in current_optim_state.items():
+            if key not in optim_state_dict:
+                optim_state_dict[key] = value
+        if 'last_epoch' in optim_state_dict:
+            last_epoch = optim_state_dict.pop('last_epoch')
+        optimizer.set_state_dict(optim_state_dict)
+
+        if ema_state_dict is not None:
+            ema.resume(ema_state_dict,
+                       optim_state_dict['LR_Scheduler']['last_epoch'])
+    elif ema_state_dict is not None:
+        ema.resume(ema_state_dict)
+    return last_epoch
+
+
+def match_state_dict(model_state_dict, weight_state_dict, mode='default'):
+    """
+    Pair every model parameter with the best-fitting pretrained weight.
+
+    A weight key is a candidate for a model key when the two are equal or
+    when the model key ends with ``'.' + weight_key`` (in the default mode a
+    leading ``'backbone.'`` is first stripped from weight keys that start
+    with ``'backbone.res5'``). Among the candidates the longest weight key
+    wins. For example the model key
+    ``'backbone.res2.res2a.branch2a.conv.weight'`` is matched to
+    ``'res2.res2a.branch2a.conv.weight'`` rather than to
+    ``'branch2a.conv.weight'``.
+
+    In ``'student'`` / ``'teacher'`` mode, weight keys that start with
+    ``'modelTeacher'`` / ``'modelStudent'`` respectively are ignored and the
+    suffix test is applied in both directions.
+
+    Args:
+        model_state_dict (dict): name -> parameter of the model.
+        weight_state_dict (dict): name -> value of the pretrained weights.
+        mode (str): ``'student'``, ``'teacher'`` or anything else for the
+            default matching rule.
+
+    Returns:
+        dict: model key -> matched weight value; entries whose shapes differ
+        are left out.
+
+    Raises:
+        ValueError: if one weight is the best match of two model keys.
+    """
+
+    def _suffix_either_way(a, b):
+        return a == b or a.endswith("." + b) or b.endswith("." + a)
+
+    def _match_teacher(a, b):
+        # student params are never candidates for the teacher
+        return not b.startswith('modelStudent') and _suffix_either_way(a, b)
+
+    def _match_student(a, b):
+        # teacher params are never candidates for the student
+        return not b.startswith('modelTeacher') and _suffix_either_way(a, b)
+
+    def _match_default(a, b):
+        # drop the 9-character 'backbone.' prefix of res5 weights
+        stripped = b[9:] if b.startswith('backbone.res5') else b
+        return a == stripped or a.endswith("." + stripped)
+
+    match_op = {
+        'student': _match_student,
+        'teacher': _match_teacher,
+    }.get(mode, _match_default)
+
+    model_keys = sorted(model_state_dict.keys())
+    weight_keys = sorted(weight_state_dict.keys())
+
+    # match_matrix[row, col] holds the length of the weight key when it is a
+    # candidate for the model key, 0 otherwise.
+    match_matrix = np.zeros([len(model_keys), len(weight_keys)])
+    for row, m_k in enumerate(model_keys):
+        for col, w_k in enumerate(weight_keys):
+            if match_op(m_k, w_k):
+                match_matrix[row, col] = len(w_k)
+    max_id = match_matrix.argmax(1)
+    # rows without any candidate are flagged with -1
+    max_id[match_matrix.max(1) == 0] = -1
+
+    first_weight_key = weight_keys[0]
+    if first_weight_key.startswith(('modelStudent', 'modelTeacher')):
+        # teacher/student checkpoint: report model params left unmatched
+        not_load_weight_name = [
+            model_keys[match_idx] for match_idx, w_id in enumerate(max_id)
+            if w_id == -1
+        ]
+        if not_load_weight_name:
+            logger.info('{} in model is not matched with pretrained weights, '
+                        'and its will be trained from scratch'.format(
+                            not_load_weight_name))
+    else:
+        # plain checkpoint: report pretrained weights that nobody uses
+        load_id = set(max_id)
+        load_id.discard(-1)
+        not_load_weight_name = [
+            weight_keys[idx] for idx in range(len(weight_keys))
+            if idx not in load_id
+        ]
+        if not_load_weight_name:
+            logger.info('{} in pretrained weight is not used in the model, '
+                        'and its will not be loaded'.format(
+                            not_load_weight_name))
+
+    matched_keys = {}
+    result_state_dict = {}
+    for model_key, weight_id in zip(model_keys, max_id):
+        if weight_id == -1:
+            continue
+        weight_key = weight_keys[weight_id]
+        weight_value = weight_state_dict[weight_key]
+        model_value_shape = list(model_state_dict[model_key].shape)
+
+        if list(weight_value.shape) != model_value_shape:
+            logger.info(
+                'The shape {} in pretrained weight {} is unmatched with '
+                'the shape {} in model {}. And the weight {} will not be '
+                'loaded'.format(weight_value.shape, weight_key,
+                                model_value_shape, model_key, weight_key))
+            continue
+
+        assert model_key not in result_state_dict
+        result_state_dict[model_key] = weight_value
+        if weight_key in matched_keys:
+            raise ValueError('Ambiguity weight {} loaded, it matches at least '
+                             '{} and {} in the model'.format(
+                                 weight_key, model_key, matched_keys[
+                                     weight_key]))
+        matched_keys[weight_key] = model_key
+    return result_state_dict
+
+
+def load_pretrain_weight(model, pretrain_weight, ARSL_eval=False):
+    """
+    Initialise ``model`` from a pretrained ``.pdparams`` file.
+
+    Args:
+        model: network to initialise. When it has both ``modelTeacher`` and
+            ``modelStudent`` attributes (and ``ARSL_eval`` is False) the same
+            matched weights are loaded into both sub-models.
+        pretrain_weight (str): path or URL of the pretrained weights.
+        ARSL_eval (bool): when True, student and teacher weights are matched
+            separately (``mode='student'`` / ``mode='teacher'``) from the
+            same checkpoint.
+
+    Raises:
+        ValueError: if nothing exists at the resolved path.
+    """
+    if is_url(pretrain_weight):
+        pretrain_weight = get_weights_path(pretrain_weight)
+
+    path = _strip_postfix(pretrain_weight)
+    found = (os.path.isdir(path) or os.path.isfile(path) or
+             os.path.exists(path + '.pdparams'))
+    if not found:
+        raise ValueError("Model pretrain path `{}` does not exists. "
+                         "If you don't want to load pretrain model, "
+                         "please delete `pretrain_weights` field in "
+                         "config file.".format(path))
+
+    if ARSL_eval:
+        # Evaluation of a teacher/student checkpoint: each sub-model picks
+        # its own parameters out of the shared file.
+        weights_path = path + '.pdparams'
+        param_state_dict = paddle.load(weights_path)
+        student_model_dict = model.modelStudent.state_dict()
+        student_param_state_dict = match_state_dict(
+            student_model_dict, param_state_dict, mode='student')
+        model.modelStudent.set_dict(student_param_state_dict)
+        print('Loading pretrain weights for Teacher model.')
+        teacher_model_dict = model.modelTeacher.state_dict()
+
+        teacher_param_state_dict = match_state_dict(
+            teacher_model_dict, param_state_dict, mode='teacher')
+        model.modelTeacher.set_dict(teacher_param_state_dict)
+        logger.info('Finish loading model weights: {}'.format(weights_path))
+        return
+
+    teacher_student_flag = hasattr(model, 'modelTeacher') and hasattr(
+        model, 'modelStudent')
+    if teacher_student_flag:
+        print('Loading pretrain weights for Teacher-Student framework.')
+        print('Assert Teacher model has the same structure with Student model.')
+        model_dict = model.modelStudent.state_dict()
+    else:
+        model_dict = model.state_dict()
+
+    weights_path = path + '.pdparams'
+    param_state_dict = match_state_dict(model_dict, paddle.load(weights_path))
+    # Cast every matched value whose dtype differs from the model parameter.
+    for name in list(param_state_dict):
+        value = param_state_dict[name]
+        if isinstance(value, np.ndarray):
+            value = paddle.to_tensor(value)
+        if model_dict[name].dtype != value.dtype:
+            param_state_dict[name] = value.astype(model_dict[name].dtype)
+
+    if teacher_student_flag:
+        model.modelStudent.set_dict(param_state_dict)
+        model.modelTeacher.set_dict(param_state_dict)
+    else:
+        model.set_dict(param_state_dict)
+    logger.info('Finish loading model weights: {}'.format(weights_path))
+
+
+def save_model(model,
+               optimizer,
+               save_dir,
+               save_name,
+               last_epoch,
+               ema_model=None):
+    """
+    Write a checkpoint (weights and optimizer state) to disk.
+
+    Nothing is written unless ``paddle.distributed.get_rank()`` is 0.
+
+    Args:
+        model (nn.Layer|dict): the layer, or its state_dict, to save.
+        optimizer (paddle.optimizer.Optimizer): optimizer whose state goes
+            into ``<save_name>.pdopt`` together with ``last_epoch``.
+        save_dir (str): target directory, created when missing.
+        save_name (str): file name prefix inside ``save_dir``.
+        last_epoch (int): the epoch index.
+        ema_model (dict|None): EMA state_dict. When given together with a
+            dict ``model``, the EMA weights are stored as ``.pdparams`` and
+            the plain model weights as ``.pdema``.
+    """
+    if paddle.distributed.get_rank() != 0:
+        return
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    prefix = os.path.join(save_dir, save_name)
+    params_file = prefix + ".pdparams"
+
+    # weights
+    if isinstance(model, nn.Layer):
+        paddle.save(model.state_dict(), params_file)
+    else:
+        assert isinstance(model,
+                          dict), 'model is not a instance of nn.layer or dict'
+        if ema_model is not None:
+            assert isinstance(ema_model,
+                              dict), ("ema_model is not a instance of dict, "
+                                      "please call model.state_dict() to get.")
+            # model and ema_model swap places on disk
+            paddle.save(ema_model, params_file)
+            paddle.save(model, prefix + ".pdema")
+        else:
+            paddle.save(model, params_file)
+
+    # optimizer state, tagged with the epoch it belongs to
+    optim_state = optimizer.state_dict()
+    optim_state['last_epoch'] = last_epoch
+    paddle.save(optim_state, prefix + ".pdopt")
+    logger.info("Save checkpoint: {}".format(save_dir))
diff --git a/rtdetr_paddle/ppdet/utils/cli.py b/rtdetr_paddle/ppdet/utils/cli.py
new file mode 100644
index 0000000..2c5acc0
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/cli.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from argparse import ArgumentParser, RawDescriptionHelpFormatter
+
+import yaml
+import re
+from ppdet.core.workspace import get_registered_modules, dump_value
+
+__all__ = ['ColorTTY', 'ArgsParser']
+
+
+class ColorTTY(object):
+    """
+    Wrap messages in ANSI escape sequences for colored terminal output.
+
+    Every name in ``self.colors`` is available as a method, e.g.
+    ``ColorTTY().red("text")``; the codes are 31 plus the index of the name
+    in that list.
+    """
+
+    def __init__(self):
+        super(ColorTTY, self).__init__()
+        self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan']
+
+    def __getattr__(self, attr):
+        # Only called when normal lookup fails. Read ``colors`` through
+        # __dict__ so that an instance without it (e.g. created without
+        # __init__) cannot recurse into __getattr__ forever.
+        colors = self.__dict__.get('colors', ())
+        if attr in colors:
+            color = colors.index(attr) + 31
+
+            def color_message(message):
+                return "\033[{}m{}\033[0m".format(color, message)
+
+            # cache on the instance so later lookups bypass __getattr__
+            setattr(self, attr, color_message)
+            return color_message
+        # Unknown names must raise; returning None made hasattr() succeed
+        # for every attribute.
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def bold(self, message):
+        """Return ``message`` wrapped in code ``01``."""
+        return self.with_code('01', message)
+
+    def with_code(self, code, message):
+        """Return ``message`` wrapped in the escape sequence for ``code``."""
+        return "\033[{}m{}\033[0m".format(code, message)
+
+
+class ArgsParser(ArgumentParser):
+    """
+    Argument parser with the ``-c/--config`` and ``-o/--opt`` options.
+
+    ``--opt`` takes ``key=value`` items; dotted keys such as ``a.b.c=1``
+    address nested dictionaries.
+    """
+
+    def __init__(self):
+        super(ArgsParser, self).__init__(
+            formatter_class=RawDescriptionHelpFormatter)
+        self.add_argument("-c", "--config", help="configuration file to use")
+        self.add_argument(
+            "-o", "--opt", nargs='*', help="set configuration options")
+
+    def parse_args(self, argv=None):
+        """Parse ``argv`` and replace ``args.opt`` by the parsed dict."""
+        args = super(ArgsParser, self).parse_args(argv)
+        assert args.config is not None, \
+            "Please specify --config=configure_file_path."
+        args.opt = self._parse_opt(args.opt)
+        return args
+
+    def _parse_opt(self, opts):
+        """
+        Turn ``key=value`` strings into a (possibly nested) dict.
+
+        Args:
+            opts (list[str]|None): items such as ``'a.b=1'``.
+
+        Returns:
+            dict: the parsed options; empty when ``opts`` is empty or None.
+        """
+        config = {}
+        if not opts:
+            return config
+        for s in opts:
+            s = s.strip()
+            k, v = s.split('=', 1)
+            # NOTE(review): yaml.Loader is not the safe loader; acceptable
+            # only as long as these strings come from the user's own command
+            # line. Consider yaml.safe_load if that assumption changes.
+            value = yaml.load(v, Loader=yaml.Loader)
+            keys = k.split('.')
+            cur = config
+            for key in keys[:-1]:
+                child = cur.get(key)
+                if not isinstance(child, dict):
+                    child = cur[key] = {}
+                # Reuse an existing sub-dict: resetting it here used to drop
+                # earlier options, e.g. 'a.b.c=1 a.b.d=2' lost 'c'.
+                cur = child
+            cur[keys[-1]] = value
+        return config
+
+
+def merge_args(config, args, exclude_args=('config', 'opt', 'slim_config')):
+    """
+    Copy parsed command-line arguments into ``config``.
+
+    Args:
+        config: mapping that receives the values (modified in place).
+        args: namespace whose attributes are copied.
+        exclude_args: attribute names that are not copied. The default is
+            an immutable tuple so it cannot be altered between calls.
+
+    Returns:
+        The same ``config`` object.
+    """
+    for k, v in vars(args).items():
+        if k not in exclude_args:
+            config[k] = v
+    return config
+
+
+def print_total_cfg(config):
+    """
+    Print the whole configuration as YAML, with status markers.
+
+    Entries that belong to a registered module (looked up by key, or by the
+    type name of the value) are expanded through that module's schema and
+    tagged ``<missing>``, ``<extraneous>``, ``<type mismatch>`` or
+    ``<module config missing>``; overridden keys are shown in green. All
+    other non-empty entries are dumped unchanged.
+
+    Args:
+        config: mapping of config key -> value.
+    """
+    modules = get_registered_modules()
+    color_tty = ColorTTY()
+    # Textual placeholder put in front of overridden keys; it is rewritten
+    # into an escape sequence after the YAML dump (last re.sub below).
+    green = '___{}___'.format(color_tty.colors.index('green') + 31)
+
+    styled = {}
+    for key in config.keys():
+        if not config[key]:  # empty schema
+            continue
+
+        if key not in modules and not hasattr(config[key], '__dict__'):
+            styled[key] = config[key]
+            continue
+        elif key in modules:
+            module = modules[key]
+        else:
+            type_name = type(config[key]).__name__
+            if type_name not in modules:
+                # No schema is registered for this object. Dump it as a
+                # plain value rather than falling through with `module`
+                # unbound or still pointing at the previous entry.
+                styled[key] = config[key]
+                continue
+            module = modules[type_name].copy()
+            module.update({
+                k: v
+                for k, v in config[key].__dict__.items()
+                if k in module.schema
+            })
+            key += " ({})".format(type_name)
+        default = module.find_default_keys()
+        missing = module.find_missing_keys()
+        mismatch = module.find_mismatch_keys()
+        extra = module.find_extra_keys()
+        dep_missing = []
+        for dep in module.inject:
+            if isinstance(module[dep], str) and module[dep] != '<value>':
+                if module[dep] not in modules:  # not a valid module
+                    dep_missing.append(dep)
+                else:
+                    dep_mod = modules[module[dep]]
+                    # empty dict but mandatory
+                    if not dep_mod and dep_mod.mandatory():
+                        dep_missing.append(dep)
+        override = list(
+            set(module.keys()) - set(default) - set(extra) - set(dep_missing))
+        replacement = {}
+        for name in set(override + default + extra + mismatch + missing):
+            new_name = name
+            if name in missing:
+                value = "<missing>"
+            else:
+                value = module[name]
+
+            if name in extra:
+                value = dump_value(value) + " <extraneous>"
+            elif name in mismatch:
+                value = dump_value(value) + " <type mismatch>"
+            elif name in dep_missing:
+                value = dump_value(value) + " <module config missing>"
+            elif name in override and value != '<missing>':
+                mark = green
+                new_name = mark + name
+            replacement[new_name] = value
+        styled[key] = replacement
+    buffer = yaml.dump(styled, default_flow_style=False, default_style='')
+    # Colorize the markers. The ESC byte is written as an explicit \033 so
+    # the sequences do not depend on raw control characters in the source.
+    buffer = re.sub(r"<missing>", "\033[31m<missing>\033[0m", buffer)
+    buffer = re.sub(r"<extraneous>", "\033[33m<extraneous>\033[0m", buffer)
+    buffer = re.sub(r"<type mismatch>", "\033[31m<type mismatch>\033[0m",
+                    buffer)
+    buffer = re.sub(r"<module config missing>",
+                    "\033[31m<module config missing>\033[0m", buffer)
+    # '___<code>___<key>:' -> '<ESC>[<code>m<key><ESC>[0m:'
+    buffer = re.sub(r"___(\d+)___(.*?):", "\033[\\1m\\2\033[0m:", buffer)
+    print(buffer)
diff --git a/rtdetr_paddle/ppdet/utils/colormap.py b/rtdetr_paddle/ppdet/utils/colormap.py
new file mode 100644
index 0000000..67c68dc
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/colormap.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+
+
+def colormap(rgb=False):
+    """
+    Build the table of detection colors.
+
+    The values are copied from
+    https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py
+
+    Args:
+        rgb (bool): keep the channel order of the table when True; reverse
+            the three channels of every color when False.
+
+    Returns:
+        np.ndarray: int32 array with one row of three channel values
+        (0..255) per color.
+    """
+    flat_values = [
+        0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494,
+        0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078,
+        0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000,
+        0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667,
+        0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000,
+        1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000,
+        0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667,
+        0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333,
+        0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333,
+        0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000,
+        1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167,
+        0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000,
+        0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286,
+        0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714,
+        0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000
+    ]
+    # one row per color, scaled from [0, 1] to [0, 255] in float32
+    table = np.array(flat_values).astype(np.float32).reshape((-1, 3)) * 255
+    if rgb:
+        return table.astype('int32')
+    # reverse the channel order of every row
+    return table[:, ::-1].astype('int32')
diff --git a/rtdetr_paddle/ppdet/utils/download.py b/rtdetr_paddle/ppdet/utils/download.py
new file mode 100644
index 0000000..8fb95af
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/download.py
@@ -0,0 +1,559 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path as osp
+import sys
+import yaml
+import time
+import shutil
+import requests
+import tqdm
+import hashlib
+import base64
+import binascii
+import tarfile
+import zipfile
+import errno
+
+from paddle.utils.download import _get_unique_endpoints
+from ppdet.core.workspace import BASE_KEY
+from .logger import setup_logger
+from .voc_utils import create_list
+
+logger = setup_logger(__name__)
+
+__all__ = [
+    'get_weights_path', 'get_dataset_path', 'get_config_path',
+    'download_dataset', 'create_voc_list'
+]
+
+WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights")  # root of cached weights
+DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset")  # root of cached datasets
+CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs")  # root of cached configs
+
+# DATASETS maps dataset_name -> (download_info, sub_dirs), where
+# download_info is a list of (url, md5sum) and sub_dirs a list of dir names.
+DATASETS = {
+    'coco': ([
+        (
+            'http://images.cocodataset.org/zips/train2017.zip',
+            'cced6f7f71b7629ddf16f17bbcfab6b2', ),
+        (
+            'http://images.cocodataset.org/zips/val2017.zip',
+            '442b8da7639aecaf257c1dceb8ba8c80', ),
+        (
+            'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
+            'f4bbac642086de4f52a3fdda2de5fa2c', ),
+    ], ["annotations", "train2017", "val2017"]),
+    'voc': ([
+        (
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',
+            '6cd6e144f989b92b3379bac3b3de84fd', ),
+        (
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',
+            'c52e279531787c972589f7e41ab4ae64', ),
+        (
+            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',
+            'b6e924de25625d8de591ea690078ad9f', ),
+        (
+            'https://paddledet.bj.bcebos.com/data/label_list.txt',
+            '5ae5d62183cfb6f6d3ac109359d06a1b', ),
+    ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]),
+    'wider_face': ([
+        (
+            'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip',
+            '3fedf70df600953d25982bcd13d91ba2', ),
+        (
+            'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip',
+            'dfa7d7e790efa35df3788964cf0bbaea', ),
+        (
+            'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip',
+            'a4a898d6193db4b9ef3260a68bad0dc7', ),
+    ], ["WIDER_train", "WIDER_val", "wider_face_split"]),
+    'fruit': ([(
+        'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar',
+        'baa8806617a54ccf3685fa7153388ae6', ), ],
+              ['Annotations', 'JPEGImages']),
+    'roadsign_voc': ([(
+        'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar',
+        '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']),
+    'roadsign_coco': ([(
+        'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar',
+        '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']),
+    'spine_coco': ([(
+        'https://paddledet.bj.bcebos.com/data/spine.tar',
+        '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']),
+    'coco_ce': ([(
+        'https://paddledet.bj.bcebos.com/data/coco_ce.tar',
+        'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], [])
+}
+
+DOWNLOAD_DATASETS_LIST = DATASETS.keys()  # names accepted for auto-download
+
+DOWNLOAD_RETRY_LIMIT = 3  # max download attempts per file in _download
+
+PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/'  # expansion of ppdet://
+
+
+# When running unit tests, multiple processes may try to create the
+# same cache directory simultaneously, so we cannot use an `if`
+# condition to check for the existence of the directory first;
+# instead, we use the filesystem as the synchronization mechanism by
+# catching the error raised when the directory already exists.
+def must_mkdirs(path):
+    """Create `path` recursively; an already-existing path is not an error."""
+    try:
+        os.makedirs(path)
+    except OSError as err:
+        if err.errno != errno.EEXIST:
+            raise
+
+
+def parse_url(url):
+    """Expand the `ppdet://` shorthand scheme into the real download prefix."""
+    return url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX)
+
+
+def get_weights_path(url):
+    """Return the local path of the weights file referenced by `url`.
+
+    The file is looked up under WEIGHTS_HOME and downloaded when missing.
+    """
+    weights_path, _found = get_path(parse_url(url), WEIGHTS_HOME)
+    return weights_path
+
+
+def get_config_path(url):
+    """Return the local path of the config file referenced by `url`.
+
+    The file is looked up under CONFIGS_HOME. When it is missing, the
+    cached configs directory is wiped, the configs archive matching the
+    installed ppdet version is downloaded and unpacked, and the lookup
+    is retried. The process exits with status 1 if it still fails.
+    """
+    path = map_path(parse_url(url), CONFIGS_HOME, path_depth=2)
+    if os.path.isfile(path):
+        return path
+
+    # The config is not cached: drop the cached configs before refetching.
+    if osp.isdir(CONFIGS_HOME):
+        shutil.rmtree(CONFIGS_HOME)
+
+    # Prefer the archive published for the installed ppdet version.
+    try:
+        from ppdet import __version__ as version
+    except ImportError:
+        version = None
+    if version:
+        cfg_url = "ppdet://configs/{}/configs.tar".format(version)
+    else:
+        cfg_url = "ppdet://configs/configs.tar"
+
+    # Download into the parent of CONFIGS_HOME and unpack the archive there.
+    cfg_fullname = _download_dist(parse_url(cfg_url), osp.dirname(CONFIGS_HOME))
+    _decompress_dist(cfg_fullname)
+
+    if not os.path.isfile(path):
+        logger.error("Get config {} failed after download, please contact us on " \
+            "https://github.com/PaddlePaddle/PaddleDetection/issues".format(path))
+        sys.exit(1)
+    return path
+
+
+def get_dataset_path(path, annotation, image_dir):
+    """Resolve the directory that holds the dataset configured at `path`.
+
+    `path` is returned as is when it holds a valid dataset. Otherwise its
+    lower-cased last component must name a downloadable dataset, which is
+    searched for under DATASET_HOME and downloaded there when missing.
+    """
+    if _dataset_exists(path, annotation, image_dir):
+        return path
+
+    data_name = os.path.split(path.strip().lower())[-1]
+    if data_name not in DOWNLOAD_DATASETS_LIST:
+        raise ValueError(
+            "Dataset {} is not valid for reason above, please check again.".
+            format(osp.realpath(path)))
+    logger.warning(
+        "Dataset {} is not valid for reason above, try searching {} or "
+        "downloading dataset...".format(osp.realpath(path), DATASET_HOME))
+
+    dataset = DATASETS.get(data_name)
+    if dataset is None:
+        raise ValueError("Dataset automaticly downloading Error.")
+    download_info, sub_dirs = dataset
+    logger.debug("Parse dataset_dir {} as dataset "
+                 "{}".format(path, data_name))
+    data_dir = osp.join(DATASET_HOME, data_name)
+
+    if data_name == "spine_coco" and \
+            _dataset_exists(data_dir, annotation, image_dir):
+        return data_dir
+
+    # For these datasets only the expected sub-directories are checked.
+    checked_by_sub_dirs = data_name in ['voc', 'fruit', 'roadsign_voc']
+    if checked_by_sub_dirs:
+        missing = False
+        for sub_dir in sub_dirs:
+            check_dir = osp.join(data_dir, sub_dir)
+            if osp.exists(check_dir):
+                logger.info("Found {}".format(check_dir))
+            else:
+                missing = True
+        if not missing:
+            return data_dir
+
+    # Those datasets were already checked above: download unconditionally.
+    for url, md5sum in download_info:
+        get_path(url, data_dir, md5sum, not checked_by_sub_dirs)
+
+    # voc should create list after download
+    if data_name == 'voc':
+        create_voc_list(data_dir)
+    return data_dir
+
+
+def create_voc_list(data_dir, devkit_subdir='VOCdevkit'):
+    """Create the VOC 2007/2012 file lists (via `create_list`).
+
+    NOTE: the auto-downloaded VOC dataset relies on the default VOC label
+    list, so no label_list.txt is generated here. For the default
+    labels, see ../data/source/voc.py
+    """
+    logger.debug("Create voc file list...")
+    voc_root = osp.join(data_dir, devkit_subdir)
+    create_list(voc_root, ['2007', '2012'], data_dir)
+    logger.debug("Create voc file list finished")
+
+
+def map_path(url, root_dir, path_depth=1):
+    """Map `url` to the local path of its content under `root_dir`.
+
+    The last `path_depth` url components are kept, minus '.zip'/'.tar'/'.gz'.
+    """
+    assert path_depth > 0, "path_depth should be a positive integer"
+    base = url
+    for _ in range(path_depth):
+        base = osp.dirname(base)
+    tail = osp.relpath(url, base)
+    tail = tail.replace('.zip', '').replace('.tar', '').replace('.gz', '')
+    return osp.join(root_dir, tail)
+
+
+def get_path(url, root_dir, md5sum=None, check_exist=True):
+    """Fetch `url` into `root_dir` unless its content is already there.
+
+    Args:
+        url (str): download url
+        root_dir (str): root dir for downloading, it should be
+            WEIGHTS_HOME or DATASET_HOME
+        md5sum (str): md5 sum of the download package
+        check_exist (bool): when False, download even if the target exists
+
+    Returns:
+        tuple: (local path, whether it was found without downloading)
+    """
+    fullpath = map_path(url, root_dir)
+
+    # Some archives decompress to a directory whose name differs from
+    # the archive name; map those to the directory actually produced.
+    renamed_targets = (
+        ("VOCtrainval_11-May-2012", "VOCdevkit/VOC2012"),
+        ("VOCtrainval_06-Nov-2007", "VOCdevkit/VOC2007"),
+        ("VOCtest_06-Nov-2007", "VOCdevkit/VOC2007"),
+        ("annotations_trainval", "annotations"), )
+    for archive_key, target in renamed_targets:
+        if archive_key in fullpath:
+            fullpath = osp.join(osp.split(fullpath)[0], target)
+
+    if check_exist and osp.exists(fullpath):
+        # Directories are accepted as is; files must pass the md5 check.
+        if osp.isfile(fullpath) and \
+                not _check_exist_file_md5(fullpath, md5sum, url):
+            os.remove(fullpath)
+        else:
+            logger.debug("Found {}".format(fullpath))
+            return fullpath, True
+
+    fullname = _download_dist(url, root_dir, md5sum)
+
+    # Weights ('.pdparams') and '.yml' files are stored as downloaded.
+    if osp.splitext(fullname)[-1] not in ('.pdparams', '.yml'):
+        _decompress_dist(fullname)
+
+    return fullpath, False
+
+
+def download_dataset(path, dataset=None):
+    """Download every package of the known dataset `dataset` into `path`."""
+    if dataset not in DATASETS:
+        logger.error("Unknown dataset {}, it should be "
+                     "{}".format(dataset, DATASETS.keys()))
+        return
+    for url, md5sum in DATASETS[dataset][0]:
+        get_path(url, path, md5sum, False)
+    logger.debug("Download dataset {} finished.".format(dataset))
+
+
+def _dataset_exists(path, annotation, image_dir):
+    """
+    Check that the user-defined dataset is valid: `path` exists and, when
+    given, `annotation` is a file and `image_dir` a directory under it.
+    """
+    if not osp.exists(path):
+        logger.warning("Config dataset_dir {} is not exits, "
+                       "dataset config is not valid".format(path))
+        return False
+
+    annotation_path = osp.join(path, annotation) if annotation else None
+    if annotation_path and not osp.isfile(annotation_path):
+        logger.warning("Config annotation {} is not a "
+                       "file, dataset config is not "
+                       "valid".format(annotation_path))
+        return False
+
+    image_path = osp.join(path, image_dir) if image_dir else None
+    if image_path and not osp.isdir(image_path):
+        logger.warning("Config image_dir {} is not a "
+                       "directory, dataset config is not "
+                       "valid".format(image_path))
+        return False
+    return True
+
+
+def _download(url, path, md5sum=None):
+    """
+    Download `url` into directory `path` and return the file's full name,
+    retrying up to DOWNLOAD_RETRY_LIMIT times until the md5 check passes.
+
+    url (str): download url
+    path (str): directory to download into
+    md5sum (str): expected md5 of the file, if known
+    """
+    must_mkdirs(path)
+
+    fname = osp.split(url)[-1]
+    fullname = osp.join(path, fname)
+    attempts = 0
+
+    while True:
+        if osp.exists(fullname) and \
+                _check_exist_file_md5(fullname, md5sum, url):
+            return fullname
+        if attempts >= DOWNLOAD_RETRY_LIMIT:
+            raise RuntimeError("Download from {} failed. "
+                               "Retry limit reached".format(url))
+        attempts += 1
+
+        logger.info("Downloading {} from {}".format(fname, url))
+
+        # NOTE: windows path join may incur \, which is invalid in url
+        if sys.platform == "win32":
+            url = url.replace('\\', '/')
+
+        req = requests.get(url, stream=True)
+        if req.status_code != 200:
+            raise RuntimeError("Downloading from {} failed with code "
+                               "{}!".format(url, req.status_code))
+
+        # Write to a temporary name first and rename once complete, so an
+        # interrupted download never leaves a partial file at `fullname`.
+        tmp_fullname = fullname + "_tmp"
+        total_size = req.headers.get('content-length')
+        chunks = req.iter_content(chunk_size=1024)
+        with open(tmp_fullname, 'wb') as f:
+            if total_size:
+                n_chunks = (int(total_size) + 1023) // 1024
+                for chunk in tqdm.tqdm(chunks, total=n_chunks, unit='KB'):
+                    f.write(chunk)
+            else:
+                for chunk in chunks:
+                    if chunk:
+                        f.write(chunk)
+        shutil.move(tmp_fullname, fullname)
+
+
+def _download_dist(url, path, md5sum=None):
+    """Download `url` into `path` once per node and return the file name.
+
+    With several trainers, only the trainer whose PADDLE_RANK_IN_NODE is
+    0 downloads; the others wait on a lock file until it has finished.
+    """
+    env = os.environ
+    if 'PADDLE_TRAINERS_NUM' not in env or 'PADDLE_TRAINER_ID' not in env:
+        return _download(url, path, md5sum)
+    if int(env['PADDLE_TRAINERS_NUM']) <= 1:
+        return _download(url, path, md5sum)
+
+    rank_id_curr_node = int(env.get("PADDLE_RANK_IN_NODE", 0))
+    fullname = osp.join(path, osp.split(url)[-1])
+    lock_path = fullname + '.download.lock'
+
+    must_mkdirs(path)
+
+    if not osp.exists(fullname):
+        with open(lock_path, 'w'):  # touch
+            os.utime(lock_path, None)
+        if rank_id_curr_node == 0:
+            try:
+                _download(url, path, md5sum)
+            finally:
+                # Release the lock even on failure so no trainer hangs.
+                os.remove(lock_path)
+        else:
+            while os.path.exists(lock_path):
+                time.sleep(0.5)
+    return fullname
+
+
+def _check_exist_file_md5(filename, md5sum, url):
+    """Check `filename` against `md5sum`, or, for a weights file without
+    a known md5sum, against the md5 advertised by `url`."""
+    if md5sum is None and filename.endswith('pdparams'):
+        return _md5check_from_url(filename, url)
+    return _md5check(filename, md5sum)
+
+
+def _md5check_from_url(filename, url):
+    """Check `filename` against the md5 sent in the 'content-md5' header.
+
+    For weights in bcebos URLs the header holds the base64-encoded MD5
+    value. When the header is absent, the check passes.
+    """
+    req = requests.get(url, stream=True)
+    content_md5 = req.headers.get('content-md5')
+    req.close()
+    if not content_md5:
+        return True
+    expected_md5 = binascii.hexlify(base64.b64decode(content_md5.strip('"')))
+    return _md5check(filename, expected_md5.decode())
+
+
+def _md5check(fullname, md5sum=None):
+    """Return True if `fullname` hashes to `md5sum` (or `md5sum` is None)."""
+    if md5sum is None:
+        return True
+    logger.debug("File {} md5 checking...".format(fullname))
+    digest = hashlib.md5()
+    with open(fullname, 'rb') as f:
+        block = f.read(4096)
+        while block:
+            digest.update(block)
+            block = f.read(4096)
+    calc_md5sum = digest.hexdigest()
+    if calc_md5sum != md5sum:
+        logger.warning("File {} md5 check failed, {}(calc) != "
+                       "{}(base)".format(fullname, calc_md5sum, md5sum))
+    return calc_md5sum == md5sum
+
+
+def _decompress(fname):
+    """
+    Decompress a zip or tar file next to itself, then delete the archive.
+    '.txt' files are left as is; other file types raise TypeError.
+    """
+    logger.info("Decompressing {}...".format(fname))
+
+    # Detect the type from the file name only, so that 'tar' or 'zip' in
+    # a parent directory name cannot be mistaken for the archive format.
+    base = osp.basename(fname)
+    if 'tar' in base:
+        opener = tarfile.open
+    elif 'zip' in base:
+        opener = zipfile.ZipFile
+    elif '.txt' in base:
+        return
+    else:
+        raise TypeError("Unsupport compress file type {}".format(fname))
+
+    # To survive an interrupted decompression, extract into a fresh `tmp`
+    # directory first, then merge its content next to the archive.
+    fpath = osp.split(fname)[0]
+    fpath_tmp = osp.join(fpath, 'tmp')
+    if osp.isdir(fpath_tmp):
+        shutil.rmtree(fpath_tmp)
+    os.makedirs(fpath_tmp)
+
+    with opener(fname) as archive:
+        archive.extractall(path=fpath_tmp)
+    for f in os.listdir(fpath_tmp):
+        _move_and_merge_tree(osp.join(fpath_tmp, f), osp.join(fpath, f))
+
+    shutil.rmtree(fpath_tmp)
+    os.remove(fname)
+
+
+def _decompress_dist(fname):
+    """Decompress `fname`; in multi-trainer runs only main trainers do so.
+
+    NOTE(dkp): this always runs right after _download_dist, whose
+    sub-trainers poll for the download lock every 0.5s. A tiny dataset
+    (e.g. coco_ce, spine_coco) may be fully decompressed within that
+    gap, so only the main trainers (those whose endpoint is returned by
+    _get_unique_endpoints) create the lock, and the sub-trainers sleep
+    1s (twice the polling gap) before waiting for its release.
+    """
+    env = os.environ
+    if 'PADDLE_TRAINERS_NUM' not in env or 'PADDLE_TRAINER_ID' not in env:
+        _decompress(fname)
+        return
+    if int(env['PADDLE_TRAINERS_NUM']) <= 1:
+        _decompress(fname)
+        return
+
+    from paddle.distributed import ParallelEnv
+    lock_path = fname + '.decompress.lock'
+    unique_endpoints = _get_unique_endpoints(ParallelEnv()
+                                             .trainer_endpoints[:])
+    if ParallelEnv().current_endpoint in unique_endpoints:
+        with open(lock_path, 'w'):  # touch
+            os.utime(lock_path, None)
+        try:
+            _decompress(fname)
+        finally:
+            # Release the lock even on failure so sub-trainers never hang.
+            os.remove(lock_path)
+    else:
+        time.sleep(1)
+        while os.path.exists(lock_path):
+            time.sleep(0.5)
+
+
+def _move_and_merge_tree(src, dst):
+    """
+    Move `src` to `dst`. When `dst` exists and `src` is a directory, merge
+    them: sub-directories are merged recursively and a file is only moved
+    if `dst` has no file of that name yet.
+    """
+    if osp.isfile(src) or not osp.exists(dst):
+        shutil.move(src, dst)
+        return
+
+    for entry in os.listdir(src):
+        src_entry = osp.join(src, entry)
+        dst_entry = osp.join(dst, entry)
+        if osp.isdir(src_entry) and osp.isdir(dst_entry):
+            _move_and_merge_tree(src_entry, dst_entry)
+        elif osp.isdir(src_entry):
+            # no directory of that name in dst yet: move it as a whole
+            shutil.move(src_entry, dst_entry)
+        elif osp.isfile(src_entry) and not osp.isfile(dst_entry):
+            # never overwrite a file that dst already has
+            shutil.move(src_entry, dst_entry)
diff --git a/rtdetr_paddle/ppdet/utils/fuse_utils.py b/rtdetr_paddle/ppdet/utils/fuse_utils.py
new file mode 100644
index 0000000..647fa99
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/fuse_utils.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import paddle
+import paddle.nn as nn
+
+__all__ = ['fuse_conv_bn']
+
+
+def fuse_conv_bn(model):
+    """Fuse Conv2D sublayers with BatchNorm2D sublayers, pairwise.
+
+    Sublayers are visited in `named_sublayers()` order; the most recent
+    Conv2D and BatchNorm2D names form a pair as soon as both are set.
+    """
+    was_training = model.training
+    if was_training:
+        model.eval()
+    fuse_list, conv_name, bn_name = [], None, None
+    for name, layer in model.named_sublayers():
+        conv_name = name if isinstance(layer, nn.Conv2D) else conv_name
+        bn_name = name if isinstance(layer, nn.BatchNorm2D) else bn_name
+        if conv_name and bn_name:
+            fuse_list.append([conv_name, bn_name])
+            conv_name, bn_name = None, None
+    model = fuse_layers(model, fuse_list)
+    if was_training:
+        model.train()
+    return model
+
+
+def find_parent_layer_and_sub_name(model, name):
+    """
+    Given the model and the dotted name of a layer, find the parent layer
+    and the sub_name of the layer.
+    For example, if name is 'block_1.convbn_1.conv_1', the parent layer is
+    'block_1.convbn_1' and the sub_name is `conv_1`.
+    A '.' only separates two levels when the text before it names an
+    attribute of the current parent; otherwise it is kept in the name.
+    Args:
+        model(paddle.nn.Layer): the model to search in.
+        name(string): the name of a layer
+
+    Returns:
+        parent_layer, subname
+    """
+    assert isinstance(model, nn.Layer), \
+            "The model must be the instance of paddle.nn.Layer."
+    assert len(name) > 0, "The input (name) should not be empty."
+
+    parent_layer = model
+    start = 0
+    for pos, char in enumerate(name):
+        if char != '.':
+            continue
+        candidate = name[start:pos]
+        if hasattr(parent_layer, candidate):
+            parent_layer = getattr(parent_layer, candidate)
+            start = pos + 1
+    return parent_layer, name[start:]
+
+
+class Identity(nn.Layer):
+    '''A pass-through layer used in place of layers that were fused away.'''
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()  # constructor arguments are accepted and ignored
+
+    def forward(self, input):
+        return input  # the output is the input, unchanged
+
+
+def fuse_layers(model, layers_to_fuse, inplace=False):
+    '''
+       Fuse the groups of layers named in layers_to_fuse.
+
+       Args:
+           model(nn.Layer): The model to be fused.
+           layers_to_fuse(list): Groups of names of the layers to fuse,
+               for example [["conv1", "bn1"], ["conv2", "bn2"]]. The
+               layers of each group are passed to `_fuse_func` and
+               replaced, in order, by the layers it returns.
+           inplace(bool): Whether apply fusing to the input model
+                          itself instead of to a deep copy of it.
+                          Default: False.
+
+       Return
+           fused_model(paddle.nn.Layer): The fused model.
+    '''
+    net = model if inplace else copy.deepcopy(model)
+    for names in layers_to_fuse:
+        layers = []
+        # Resolve the layer objects of the group.
+        for layer_name in names:
+            parent, attr = find_parent_layer_and_sub_name(net, layer_name)
+            layers.append(getattr(parent, attr))
+        replacements = _fuse_func(layers)
+        # Swap every original layer for its replacement, by name.
+        for layer_name, replacement in zip(names, replacements):
+            parent, attr = find_parent_layer_and_sub_name(net, layer_name)
+            setattr(parent, attr, replacement)
+    return net
+
+
+def _fuse_func(layer_list):
+    '''choose the fuser method, fuse layers and move their hooks over'''
+    types = tuple(type(m) for m in layer_list)
+    fusion_method = types_to_fusion_method.get(types, None)
+    fused_layer = fusion_method(*layer_list)
+    # Iterate over snapshots: the hook dicts are modified inside the loops.
+    for hid, hook in list(layer_list[0]._forward_pre_hooks.items()):
+        fused_layer.register_forward_pre_hook(hook)
+        del layer_list[0]._forward_pre_hooks[hid]
+    for hid, hook in list(layer_list[-1]._forward_post_hooks.items()):
+        fused_layer.register_forward_post_hook(hook)
+        del layer_list[-1]._forward_post_hooks[hid]
+    new_layers = [fused_layer]
+    for _ in layer_list[1:]:
+        identity = Identity()
+        identity.training = layer_list[0].training
+        new_layers.append(identity)
+    return new_layers
+
+
+def _fuse_conv_bn(conv, bn):
+    '''Fuse conv and bn; only eval mode is implemented (train raises).'''
+    assert conv.training == bn.training, \
+        "Conv and BN both must be in the same mode (train or eval)."
+    if not conv.training:
+        return _fuse_conv_bn_eval(conv, bn)
+    assert bn._num_features == conv._out_channels, \
+        'Output channel of Conv2d must match num_features of BatchNorm2d'
+    raise NotImplementedError
+
+
+def _fuse_conv_bn_eval(conv, bn):
+    '''Return a copy of conv with the eval-mode bn folded into its params.'''
+    assert not conv.training and not bn.training, "Fusion only for eval!"
+    fused = copy.deepcopy(conv)
+    new_weight, new_bias = _fuse_conv_bn_weights(
+        fused.weight, fused.bias, bn._mean, bn._variance, bn._epsilon,
+        bn.weight, bn.bias)
+    fused.weight.set_value(new_weight)
+    if fused.bias is None:
+        # the conv had no bias: create one to receive the fused bias
+        fused.bias = paddle.create_parameter(
+            shape=[fused._out_channels], is_bias=True, dtype=bn.bias.dtype)
+    fused.bias.set_value(new_bias)
+    return fused
+
+
+def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
+    '''Fold the bn mean/variance and affine params into conv weight and bias.
+
+    Missing conv bias / bn weight / bn bias count as zeros / ones / zeros.
+    '''
+    conv_b = paddle.zeros_like(bn_rm) if conv_b is None else conv_b
+    bn_w = paddle.ones_like(bn_rm) if bn_w is None else bn_w
+    bn_b = paddle.zeros_like(bn_rm) if bn_b is None else bn_b
+    inv_std = paddle.rsqrt(bn_rv + bn_eps)
+    per_channel_shape = [-1] + [1] * (len(conv_w.shape) - 1)
+    fused_w = conv_w * (bn_w * inv_std).reshape(per_channel_shape)
+    fused_b = (conv_b - bn_rm) * inv_std * bn_w + bn_b
+    return fused_w, fused_b
+
+
+types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, }  # layer types -> fuser
diff --git a/rtdetr_paddle/ppdet/utils/logger.py b/rtdetr_paddle/ppdet/utils/logger.py
new file mode 100644
index 0000000..51e2962
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/logger.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+
+import paddle.distributed as dist
+
+__all__ = ['setup_logger']
+
+logger_initialized = []  # names of the loggers already set up by setup_logger
+
+
+def setup_logger(name="ppdet", output=None):
+    """
+    Initialize logger `name` (once) and set its verbosity level to INFO.
+    Args:
+        output (str): a file name or a directory to save log. If None, will not save log file.
+            If ends with ".txt" or ".log", assumed to be a file name.
+            Otherwise, logs will be saved to `output/log.txt`.
+        name (str): the root module name of this logger
+
+    Returns:
+        logging.Logger: a logger
+    """
+    logger = logging.getLogger(name)
+    if name in logger_initialized:
+        return logger
+
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+    formatter = logging.Formatter(
+        "[%(asctime)s] %(name)s %(levelname)s: %(message)s",
+        datefmt="%m/%d %H:%M:%S")
+    # stdout logging: master only
+    local_rank = dist.get_rank()
+    if local_rank == 0:
+        ch = logging.StreamHandler(stream=sys.stdout)
+        ch.setLevel(logging.DEBUG)
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    # file logging: all workers
+    if output is not None:
+        is_file = output.endswith((".txt", ".log"))
+        filename = output if is_file else os.path.join(output, "log.txt")
+        if local_rank > 0:
+            filename = filename + ".rank{}".format(local_rank)
+        # the directory may already exist, or be empty for a bare file name
+        log_dir = os.path.dirname(filename)
+        if log_dir:
+            os.makedirs(log_dir, exist_ok=True)
+        fh = logging.FileHandler(filename, mode='a')
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(logging.Formatter())
+        logger.addHandler(fh)
+    logger_initialized.append(name)
+    return logger
diff --git a/rtdetr_paddle/ppdet/utils/profiler.py b/rtdetr_paddle/ppdet/utils/profiler.py
new file mode 100644
index 0000000..cae3773
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/profiler.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+
+# A global variable to record the number of calling times for profiler
+# functions. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+    '''
+    Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value;key3=value3".
+    For example:
+      "profile_path=model.profile"
+      "batch_range=[50, 60]; profile_path=model.profile"
+      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+
+    ProfilerOptions supports following key-value pair (other keys are ignored):
+      batch_range      - an integer list, e.g. [100, 110].
+      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave'.
+      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
+                         'AllOpDetail'.
+      profile_path     - a string, the path to save the serialized profile data,
+                         which can be used to generate a timeline.
+      exit_on_finished - a boolean.
+    '''
+
+    def __init__(self, options_str):
+        assert isinstance(options_str, str)
+
+        self._options = {
+            'batch_range': [10, 20],
+            'state': 'All',
+            'sorted_key': 'total',
+            'tracer_option': 'Default',
+            'profile_path': '/tmp/profile',
+            'exit_on_finished': True
+        }
+        self._parse_from_string(options_str)
+
+    def _parse_from_string(self, options_str):
+        for kv in options_str.replace(' ', '').split(';'):
+            if not kv:  # empty segment: empty string or a trailing ';'
+                continue
+            # Split on the first '=' only, so a value may itself contain '='.
+            key, _, value = kv.partition('=')
+            if key == 'batch_range':
+                items = value.replace('[', '').replace(']', '').split(',')
+                bounds = [int(item) for item in items]
+                if len(bounds) >= 2 and 0 <= bounds[0] < bounds[1]:
+                    self._options[key] = bounds
+            elif key == 'exit_on_finished':
+                self._options[key] = value.lower() in ("yes", "true", "t", "1")
+            elif key in ('state', 'sorted_key', 'tracer_option', 'profile_path'):
+                self._options[key] = value
+
+    def __getitem__(self, name):
+        if self._options.get(name, None) is None:
+            raise ValueError(
+                "ProfilerOptions does not have an option named %s." % name)
+        return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+    '''
+    Count one profiler step and start/stop PaddlePaddle's operator-level
+    profiler when the step counter reaches the ends of `batch_range`.
+
+    Args:
+      options_str - a string to initialize the ProfilerOptions (only the
+                    first call parses it). Default is None: profiler disabled.
+    '''
+    global _profiler_step_id, _profiler_options
+
+    if options_str is None:
+        return
+
+    if _profiler_options is None:
+        _profiler_options = ProfilerOptions(options_str)
+    opts = _profiler_options
+    first_step, last_step = opts['batch_range'][0], opts['batch_range'][1]
+
+    if _profiler_step_id == first_step:
+        paddle.utils.profiler.start_profiler(opts['state'],
+                                             opts['tracer_option'])
+    elif _profiler_step_id == last_step:
+        paddle.utils.profiler.stop_profiler(opts['sorted_key'],
+                                            opts['profile_path'])
+        if opts['exit_on_finished']:
+            sys.exit(0)
+
+    _profiler_step_id += 1
diff --git a/rtdetr_paddle/ppdet/utils/stats.py b/rtdetr_paddle/ppdet/utils/stats.py
new file mode 100644
index 0000000..524b7dc
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/stats.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import numpy as np
+
+__all__ = ['SmoothedValue', 'TrainingStats']
+
+
+class SmoothedValue(object):
+    """Running statistics of a scalar series: windowed (the last `window_size`
+    values) median/avg/max/value, plus a weighted average of the whole series.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        self.fmt = "{median:.4f} ({avg:.4f})" if fmt is None else fmt
+        self.deque = collections.deque(maxlen=window_size)
+        self.total = 0.
+        self.count = 0
+
+    def update(self, value, n=1):
+        # `value` enters the window once; `n` only weights the global average
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    @property
+    def median(self):  # median of the current window
+        return np.median(self.deque)
+
+    @property
+    def avg(self):  # mean of the current window
+        return np.mean(self.deque)
+
+    @property
+    def max(self):  # maximum of the current window
+        return np.max(self.deque)
+
+    @property
+    def value(self):  # most recently added value
+        return self.deque[-1]
+
+    @property
+    def global_avg(self):  # total / count over every update() so far
+        return self.total / self.count
+
+    def __str__(self):
+        shown = dict(median=self.median, avg=self.avg)
+        shown.update(max=self.max, value=self.value)
+        return self.fmt.format(**shown)
+
+
+class TrainingStats(object):
+    """Keeps one SmoothedValue meter per stat key and reports their medians."""
+
+    def __init__(self, window_size, delimiter=' '):
+        self.meters = None  # built lazily by the first update()
+        self.window_size = window_size
+        self.delimiter = delimiter
+
+    def update(self, stats):
+        # The first call fixes the set of tracked keys; on later calls extra
+        # keys in `stats` are ignored and every tracked key must be present.
+        if self.meters is None:
+            self.meters = {k: SmoothedValue(self.window_size) for k in stats}
+        for k, v in self.meters.items():
+            v.update(float(stats[k]))
+
+    def get(self, extras=None):
+        """Return an OrderedDict: `extras` first, then each meter's windowed
+        median formatted with '.6f'."""
+        stats = collections.OrderedDict(extras.items() if extras else ())
+        # `meters` is still None before the first update(): only extras then.
+        for k, v in (self.meters or {}).items():
+            stats[k] = format(v.median, '.6f')
+
+        return stats
+
+    def log(self, extras=None):
+        """Return get(extras) as "key: value" pairs joined by the delimiter."""
+        pairs = self.get(extras).items()
+        return self.delimiter.join(
+            "{}: {}".format(k, str(v)) for k, v in pairs)
diff --git a/rtdetr_paddle/ppdet/utils/visualizer.py b/rtdetr_paddle/ppdet/utils/visualizer.py
new file mode 100644
index 0000000..406589d
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/visualizer.py
@@ -0,0 +1,461 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+import PIL
+from PIL import Image, ImageDraw
+import cv2
+import math
+
+from .colormap import colormap
+from ppdet.utils.logger import setup_logger
+logger = setup_logger(__name__)
+
+__all__ = ['visualize_results']
+
+
+def visualize_results(image, bbox_res, mask_res, segm_res, keypoint_res,
+                      pose3d_res, im_id, catid2name, threshold=0.5):
+    """
+    Visualize detection results on `image`.
+    Every `*_res` that is not None is drawn, in the order bbox, mask, segm,
+    keypoint, pose3d; each stage receives the previous stage's output.
+    Returns the final image (the input itself when every result is None).
+    """
+    stages = (
+        (bbox_res, lambda img: draw_bbox(img, im_id, catid2name, bbox_res,
+                                         threshold)),
+        (mask_res, lambda img: draw_mask(img, im_id, mask_res, threshold)),
+        (segm_res, lambda img: draw_segm(img, im_id, catid2name, segm_res,
+                                         threshold)),
+        (keypoint_res, lambda img: draw_pose(img, keypoint_res, threshold)),
+        # only the first 3D pose is drawn, with its coordinates scaled by 1000
+        (pose3d_res, lambda img: draw_pose3d(
+            img, np.array(pose3d_res[0]['pose3d']) * 1000,
+            visual_thread=threshold)),
+    )
+    for result, draw in stages:
+        if result is not None:
+            image = draw(image)
+    return image
+
+
+def draw_mask(image, im_id, segms, threshold, alpha=0.7):
+    """
+    Alpha-blend onto `image` each mask of `segms` whose 'image_id' equals
+    `im_id` and whose 'score' is >= `threshold`; returns a new PIL image.
+    """
+    mask_color_id = 0
+    w_ratio = .4
+    color_list = colormap(rgb=True)
+    img_array = np.array(image).astype('float32')
+    for dt in np.array(segms):
+        if im_id != dt['image_id'] or dt['score'] < threshold:
+            continue
+        import pycocotools.mask as mask_util
+        mask = mask_util.decode(dt['segmentation']) * 255
+        # Work on a copy: the slice is a view, so lightening it in place also
+        # changed the palette row and a reused colour got lighter every time.
+        color_mask = color_list[mask_color_id % len(color_list), 0:3].copy()
+        mask_color_id += 1
+        for c in range(3):
+            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
+        idx = np.nonzero(mask)
+        img_array[idx[0], idx[1], :] *= 1.0 - alpha
+        img_array[idx[0], idx[1], :] += alpha * color_mask
+    return Image.fromarray(img_array.astype('uint8'))
+
+
+def draw_bbox(image, im_id, catid2name, bboxes, threshold):
+    """
+    Draw on `image`, in place, the boxes of image `im_id` scoring >= `threshold`
+    ([x, y, w, h] or 8 polygon values), each labelled "<class name> <score>".
+    """
+    draw = ImageDraw.Draw(image)
+
+    catid2color = {}
+    color_list = colormap(rgb=True)[:40]
+    for dt in np.array(bboxes):
+        if im_id != dt['image_id']:
+            continue
+        catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']
+        if score < threshold:
+            continue
+
+        if catid not in catid2color:
+            idx = np.random.randint(len(color_list))
+            catid2color[catid] = color_list[idx]
+        color = tuple(catid2color[catid])
+
+        if len(bbox) == 4:
+            xmin, ymin, w, h = bbox
+            xmax = xmin + w
+            ymax = ymin + h
+            draw.line(
+                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
+                 (xmin, ymin)],
+                width=2,
+                fill=color)
+        elif len(bbox) == 8:
+            x1, y1, x2, y2, x3, y3, x4, y4 = bbox
+            draw.line(
+                [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
+                width=2,
+                fill=color)
+            xmin = min(x1, x2, x3, x4)
+            ymin = min(y1, y2, y3, y4)
+        else:
+            logger.error('the shape of bbox must be [M, 4] or [M, 8]!')
+            # no box was drawn, so there is no anchor for a label either
+            continue
+
+        # draw label just above the top-left corner of the box
+        text = "{} {:.2f}".format(catid2name[catid], score)
+        left, top, right, bottom = draw.textbbox((0, 0), text)
+        tw, th = right - left, bottom - top
+
+        draw.rectangle(
+            [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
+        draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
+
+    return image
+
+
+def save_result(save_path, results, catid2name, threshold):
+    """
+    Write one image's results to the text file `save_path` (overwritten).
+
+    'bbox_res' takes precedence: one line per box with score >= threshold,
+      rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4
+      bbox: classname score x1 y1 w h
+    Otherwise each 'keypoint_res' entry is printed as [im_id, score, kpts].
+    With neither key only a notice is printed to stdout.
+    """
+    img_id = int(results["im_id"])
+    with open(save_path, 'w') as f:
+        if "bbox_res" in results:
+            for det in results["bbox_res"]:
+                catid, bbox, score = det['category_id'], det['bbox'], det['score']
+                if not score < threshold:
+                    head = '{} {} '.format(catid2name[catid], score)
+                    coords = ' '.join([str(e) for e in bbox])
+                    f.write(head + coords + '\n')
+        elif "keypoint_res" in results:
+            for det in results["keypoint_res"]:
+                kpts, det_score = det['keypoints'], det['score']
+                record = [img_id, det_score, kpts]
+                print(record, file=f)
+        else:
+            print("No valid results found, skip txt save")
+
+
+def draw_segm(image, im_id, catid2name, segms, threshold, alpha=0.7,
+              draw_box=True):
+    """
+    Alpha-blend onto `image` each mask of `segms` that belongs to `im_id`
+    and scores >= `threshold`. With `draw_box` the mask's bounding box and a
+    "<class name> <score>" caption are drawn; otherwise only the class name
+    is written at the mask's centre of mass. Returns a new PIL image.
+    """
+    mask_color_id = 0
+    w_ratio = .4
+    color_list = colormap(rgb=True)
+    img_array = np.array(image).astype('float32')
+    for dt in np.array(segms):
+        if im_id != dt['image_id']:
+            continue
+        segm, score, catid = dt['segmentation'], dt['score'], dt['category_id']
+        if score < threshold:
+            continue
+        import pycocotools.mask as mask_util
+        mask = mask_util.decode(segm) * 255
+        # Work on a copy: the slice is a view, so lightening it in place also
+        # changed the palette row and a reused colour got lighter every time.
+        color_mask = color_list[mask_color_id % len(color_list), 0:3].copy()
+        mask_color_id += 1
+        for c in range(3):
+            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
+        idx = np.nonzero(mask)
+        img_array[idx[0], idx[1], :] *= 1.0 - alpha
+        img_array[idx[0], idx[1], :] += alpha * color_mask
+
+        if not draw_box:
+            # `ndimage` was never imported by this module (NameError before);
+            # scipy is imported lazily, like pycocotools above.
+            from scipy import ndimage
+            # centre of mass comes back in array order: (row, column)
+            center_y, center_x = ndimage.center_of_mass(mask)
+            label_text = "{}".format(catid2name[catid])
+            vis_pos = (max(int(center_x) - 10, 0), int(center_y))
+            cv2.putText(img_array, label_text, vis_pos,
+                        cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255))
+        else:
+            # tight bounding box of the mask: first/last non-empty column/row
+            sum_x = np.sum(mask, axis=0)
+            x = np.where(sum_x > 0.5)[0]
+            sum_y = np.sum(mask, axis=1)
+            y = np.where(sum_y > 0.5)[0]
+            x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]
+            cv2.rectangle(img_array, (x0, y0), (x1, y1),
+                          tuple(color_mask.astype('int32').tolist()), 1)
+            bbox_text = '%s %.2f' % (catid2name[catid], score)
+            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
+            cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0],
+                                                y0 - t_size[1] - 3),
+                          tuple(color_mask.astype('int32').tolist()), -1)
+            cv2.putText(img_array, bbox_text, (x0, y0 - 2),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), 1,
+                        lineType=cv2.LINE_AA)
+
+    return Image.fromarray(img_array.astype('uint8'))
+
+
+def draw_pose(image, results, visual_thread=0.6, save_name='pose.jpg',
+              save_dir='output', returnimg=False, ids=None):
+    """
+    Draw 2D skeletons on `image` and return the result as a new PIL image.
+
+    Args:
+        image: input image (anything `np.array` accepts).
+        results (list[dict]): one dict per person; 'keypoints' holds a flat
+            list of (x, y, score) triplets. 17 keypoints use the COCO edge
+            list, any other count the MPII one.
+        visual_thread (float): keypoints scoring below it are not drawn.
+        ids (list[int]|None): optional per-person ids that pick the colour.
+        save_name, save_dir, returnimg: accepted but unused.
+    """
+    try:
+        import matplotlib.pyplot as plt
+        import matplotlib
+        plt.switch_backend('agg')
+    except Exception as e:
+        logger.error('Matplotlib not found, please install matplotlib.'
+                     'for example: `pip install matplotlib`.')
+        raise e
+
+    skeletons = np.array([item['keypoints'] for item in results])
+    kpt_nums = 17
+    if len(skeletons) > 0:
+        kpt_nums = int(skeletons.shape[1] / 3)
+    skeletons = skeletons.reshape(-1, kpt_nums, 3)
+    if kpt_nums == 17:  #plot coco keypoint
+        EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8),
+                 (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14),
+                 (13, 15), (14, 16), (11, 12)]
+    else:  #plot mpii keypoint
+        EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8),
+                 (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12),
+                 (8, 13)]
+    NUM_EDGES = len(EDGES)
+
+    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
+            [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
+            [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
+    # The unused `matplotlib.cm.get_cmap('hsv')` lookup was dropped: newer
+    # matplotlib releases no longer provide `cm.get_cmap`.
+    plt.figure()
+
+    img = np.array(image).astype('float32')
+
+    color_set = results['colors'] if 'colors' in results else None
+
+    def pick_color(part, person):
+        # palette colour by keypoint/edge index (or by `color_set`) when no
+        # ids are given, otherwise a colour derived from the person's id
+        if ids is None:
+            if color_set is None:
+                return colors[part]
+            return colors[color_set[person] % len(colors)]
+        # NOTE(review): `get_color` was called here but is neither defined nor
+        # imported in this module (NameError); this id hash is assumed to
+        # match PaddleDetection's deploy visualizer -- TODO confirm.
+        idx = ids[person] * 3
+        return ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)
+
+    if 'bbox' in results and ids is None:
+        bboxs = results['bbox']
+        for j, rect in enumerate(bboxs):
+            xmin, ymin, xmax, ymax = rect
+            color = colors[0] if color_set is None else colors[color_set[j] %
+                                                               len(colors)]
+            cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1)
+
+    canvas = img.copy()
+    for i in range(kpt_nums):
+        for j in range(len(skeletons)):
+            if skeletons[j][i, 2] < visual_thread:
+                continue
+            center = tuple(skeletons[j][i, 0:2].astype('int32'))
+            cv2.circle(canvas, center, 2, pick_color(i, j), thickness=-1)
+
+    stickwidth = 2
+    for i in range(NUM_EDGES):
+        for j in range(len(skeletons)):
+            edge = EDGES[i]
+            if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[
+                    1], 2] < visual_thread:
+                continue
+
+            cur_canvas = canvas.copy()
+            X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]]
+            Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]]
+            mX = np.mean(X)
+            mY = np.mean(Y)
+            length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5
+            angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
+            polygon = cv2.ellipse2Poly((int(mY), int(mX)),
+                                       (int(length / 2), stickwidth),
+                                       int(angle), 0, 360, 1)
+            cv2.fillConvexPoly(cur_canvas, polygon, pick_color(i, j))
+            canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
+    image = Image.fromarray(canvas.astype('uint8'))
+    plt.close()
+    return image
+
+
+def draw_pose3d(image, pose3d, pose2d=None, visual_thread=0.6,
+                save_name='pose3d.jpg', returnimg=True):
+    """
+    Render a 3D pose next to the input image in a 2x2 matplotlib figure:
+    the image (with the optional 2D pose) plus three views of the skeleton.
+
+    Args:
+        image: frame shown in the first subplot.
+        pose3d (np.ndarray): joints with 24 or 14 rows and x, y, z columns;
+            any other joint count is not drawn and None is returned.
+        pose2d (np.ndarray|None): optional 2D joints; a joint pair is drawn
+            only when column 2 of both joints is non-zero.
+        visual_thread (float): accepted but unused.
+        save_name (str): output path used when `returnimg` is False.
+        returnimg (bool): return the rendered PIL image instead of saving it.
+
+    Returns:
+        PIL.Image.Image | None: the RGB rendering when `returnimg` is not
+        False; otherwise None (the rendering is saved to `save_name`).
+    """
+    try:
+        import matplotlib.pyplot as plt
+        import matplotlib
+        plt.switch_backend('agg')
+    except Exception as e:
+        logger.error('Matplotlib not found, please install matplotlib.'
+                     'for example: `pip install matplotlib`.')
+        raise e
+
+    # each entry is [joint_a, joint_b, flag]; a truthy flag selects `lcolor`
+    if pose3d.shape[0] == 24:
+        joints_connectivity_dict = [
+            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1],
+            [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0],
+            [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1],
+            [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0],
+            [23, 21, 1]
+        ]
+    elif pose3d.shape[0] == 14:
+        joints_connectivity_dict = [
+            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0],
+            [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1],
+            [8, 12, 0], [9, 12, 1], [12, 13, 1]
+        ]
+    else:
+        print(
+            "not defined joints number :{}, cannot visualize because unknown of joint connectivity".
+            format(pose3d.shape[0]))
+        return
+
+    def draw3Dpose(pose3d, ax, lcolor="#3498db", rcolor="#e74c3c",
+                   add_labels=False):
+        for i in joints_connectivity_dict:
+            x, y, z = [
+                np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3)
+            ]
+            ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor)
+
+        RADIUS = 1000
+        center_xy = 2 if pose3d.shape[0] == 14 else 14
+        x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy,
+                                                                     2]
+        ax.set_xlim3d([-RADIUS + x, RADIUS + x])
+        ax.set_ylim3d([-RADIUS + y, RADIUS + y])
+        ax.set_zlim3d([-RADIUS + z, RADIUS + z])
+
+        ax.set_xlabel("x")
+        ax.set_ylabel("y")
+        ax.set_zlabel("z")
+
+    def draw2Dpose(pose2d, ax, lcolor="#3498db", rcolor="#e74c3c",
+                   add_labels=False):
+        for i in joints_connectivity_dict:
+            if pose2d[i[0], 2] and pose2d[i[1], 2]:
+                x, y = [
+                    np.array([pose2d[i[0], j], pose2d[i[1], j]])
+                    for j in range(2)
+                ]
+                ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor)
+
+    def draw_img_pose(pose3d, pose2d=None, frame=None, figsize=(12, 12),
+                      savepath=None):
+        fig = plt.figure(figsize=figsize, dpi=80)
+        fig.tight_layout()
+
+        ax = fig.add_subplot(221)
+        if frame is not None:
+            ax.imshow(frame, interpolation='nearest')
+        if pose2d is not None:
+            draw2Dpose(pose2d, ax)
+
+        ax = fig.add_subplot(222, projection='3d')
+        ax.view_init(45, 45)
+        draw3Dpose(pose3d, ax)
+        ax = fig.add_subplot(223, projection='3d')
+        ax.view_init(0, 0)
+        draw3Dpose(pose3d, ax)
+        ax = fig.add_subplot(224, projection='3d')
+        ax.view_init(0, 90)
+        draw3Dpose(pose3d, ax)
+
+        if savepath is not None:
+            plt.savefig(savepath)
+            plt.close()
+        else:
+            return fig
+
+    def fig2data(fig):
+        """Render a matplotlib figure and return it as an RGB PIL image."""
+        # draw the renderer
+        fig.canvas.draw()
+
+        # Get the ARGB buffer from the figure; image rows come first in it
+        w, h = fig.canvas.get_width_height()
+        # frombuffer/tobytes replace the deprecated fromstring/tostring
+        buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
+        buf = buf.reshape(h, w, 4)
+
+        # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode
+        buf = np.roll(buf, 3, axis=2)
+        image = Image.frombytes("RGBA", (w, h), buf.tobytes())
+        return image.convert("RGB")
+
+    fig = draw_img_pose(pose3d, pose2d, frame=image)
+    data = fig2data(fig)
+    plt.close(fig)  # the figure is no longer needed once rasterised
+    if returnimg is False:
+        data.save(save_name)
+    else:
+        return data
diff --git a/rtdetr_paddle/ppdet/utils/voc_utils.py b/rtdetr_paddle/ppdet/utils/voc_utils.py
new file mode 100644
index 0000000..cd6d9f9
--- /dev/null
+++ b/rtdetr_paddle/ppdet/utils/voc_utils.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path as osp
+import re
+import random
+
+__all__ = ['create_list']
+
+
+def create_list(devkit_dir, years, output_dir):
+    """
+    Collect the image/annotation pairs of every VOC year in `years` and
+    write them to `output_dir`, one "<image path> <annotation path>" line
+    per pair, as:
+        1. trainval.txt (shuffled)
+        2. test.txt
+
+    `devkit_dir` holds the `VOC<year>` folders; `output_dir` must already
+    exist.
+    """
+    collected = {'trainval.txt': [], 'test.txt': []}
+    for year in years:
+        trainval, test = _walk_voc_dir(devkit_dir, year, output_dir)
+        collected['trainval.txt'] += trainval
+        collected['test.txt'] += test
+
+    # only the trainval list is shuffled; test keeps the discovery order
+    random.shuffle(collected['trainval.txt'])
+    for list_name in ('trainval.txt', 'test.txt'):
+        with open(osp.join(output_dir, list_name), 'w') as flist:
+            for item in collected[list_name]:
+                flist.write(item[0] + ' ' + item[1] + '\n')
+
+
+def _get_voc_dir(devkit_dir, year, type):  # -> <devkit_dir>/VOC<year>/<type>
+    return osp.join(devkit_dir, 'VOC' + year, type)
+
+
+def _walk_voc_dir(devkit_dir, year, output_dir):
+    """Return (trainval, test) lists of (image path, annotation path) pairs,
+    relative to `output_dir`, from the `*_trainval.txt` / `*_test.txt` lists
+    under ImageSets/Main of one VOC year. An image id is emitted only once."""
+    filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main')
+    ann_rel = osp.relpath(
+        _get_voc_dir(devkit_dir, year, 'Annotations'), output_dir)
+    img_rel = osp.relpath(
+        _get_voc_dir(devkit_dir, year, 'JPEGImages'), output_dir)
+    trainval_list, test_list = [], []
+    added = set()
+
+    for root, _, files in os.walk(filelist_dir):
+        for fname in files:
+            if re.match(r'[a-z]+_trainval\.txt', fname):
+                img_ann_list = trainval_list
+            elif re.match(r'[a-z]+_test\.txt', fname):
+                img_ann_list = test_list
+            else:
+                continue
+            # join with the walked directory so nested lists open; `with` closes
+            with open(osp.join(root, fname)) as flist:
+                for line in flist:
+                    fields = line.split()
+                    if not fields or fields[0] in added:  # blank line / repeat
+                        continue
+                    added.add(fields[0])
+                    img_ann_list.append((osp.join(img_rel, fields[0] + '.jpg'),
+                                         osp.join(ann_rel, fields[0] + '.xml')))
+
+    return trainval_list, test_list
diff --git a/rtdetr_paddle/ppdet/version.py b/rtdetr_paddle/ppdet/version.py
new file mode 100644
index 0000000..d4be0af
--- /dev/null
+++ b/rtdetr_paddle/ppdet/version.py
@@ -0,0 +1,4 @@
+# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+full_version    = '2.4.0'
+commit          = '87ed5ba91eaeb332e8e5c3f4e7d5b1d765c75644'
diff --git a/rtdetr_paddle/requirements.txt b/rtdetr_paddle/requirements.txt
new file mode 100644
index 0000000..867b90f
--- /dev/null
+++ b/rtdetr_paddle/requirements.txt
@@ -0,0 +1,12 @@
+paddlepaddle-gpu==2.4.2
+tqdm
+typeguard
+visualdl>=2.2.0
+opencv-python <= 4.6.0
+PyYAML
+shapely
+scipy
+terminaltables
+Cython
+pycocotools
+setuptools
diff --git a/rtdetr_paddle/tools/eval.py b/rtdetr_paddle/tools/eval.py
new file mode 100755
index 0000000..d390d70
--- /dev/null
+++ b/rtdetr_paddle/tools/eval.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# add python path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
+sys.path.insert(0, parent_path)
+
+# ignore warning log
+import warnings
+warnings.filterwarnings('ignore')
+
+import paddle
+
+from ppdet.core.workspace import create, load_config, merge_config
+from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config
+from ppdet.utils.cli import ArgsParser, merge_args
+from ppdet.engine import Trainer, init_parallel_env
+from ppdet.metrics.coco_utils import json_eval_results
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('eval')
+
+
+def parse_args():  # build the evaluation CLI on top of ArgsParser and parse it
+    parser = ArgsParser()
+    parser.add_argument(
+        "--output_eval",
+        default=None,
+        type=str,
+        help="Evaluation directory, default is current directory.")
+
+    parser.add_argument(
+        '--json_eval',
+        action='store_true',
+        default=False,
+        help='Whether to re eval with already exists bbox.json or mask.json')
+
+    parser.add_argument(
+        "--slim_config",
+        default=None,
+        type=str,
+        help="Configuration file of slim method.")
+
+    # TODO: bias should be unified
+    parser.add_argument(
+        "--bias",
+        action="store_true",
+        help="whether add bias or not while getting w and h")
+
+    parser.add_argument(
+        "--classwise",
+        action="store_true",
+        help="whether per-category AP and draw P-R Curve or not.")
+
+    parser.add_argument(
+        '--save_prediction_only',
+        action='store_true',
+        default=False,
+        help='Whether to save the evaluation results only')
+
+    parser.add_argument(
+        "--amp",
+        action='store_true',
+        default=False,
+        help="Enable auto mixed precision eval.")
+
+    # options below are only forwarded to evaluation when --slice_infer is set
+    parser.add_argument(
+        "--slice_infer",
+        action='store_true',
+        help="Whether to slice the image and merge the inference results for small object detection."
+    )
+    parser.add_argument(
+        '--slice_size',
+        nargs='+',
+        type=int,
+        default=[640, 640],
+        help="Height of the sliced image.")
+    parser.add_argument(
+        "--overlap_ratio",
+        nargs='+',
+        type=float,
+        default=[0.25, 0.25],
+        help="Overlap height ratio of the sliced image.")
+    parser.add_argument(
+        "--combine_method",
+        type=str,
+        default='nms',
+        help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']."
+    )
+    parser.add_argument(
+        "--match_threshold",
+        type=float,
+        default=0.6,
+        help="Combine method matching threshold.")
+    parser.add_argument(
+        "--match_metric",
+        type=str,
+        default='ios',
+        help="Combine method matching metric, choose in ['iou', 'ios'].")
+    args = parser.parse_args()
+    return args
+
+
+def run(FLAGS, cfg):  # re-evaluate saved json results, or load weights and evaluate
+    if FLAGS.json_eval:
+        logger.info(
+            "In json_eval mode, PaddleDetection will evaluate json files in "
+            "output_eval directly. And proposal.json, bbox.json and mask.json "
+            "will be detected by default.")
+        json_eval_results(
+            cfg.metric,
+            json_directory=FLAGS.output_eval,
+            dataset=create('EvalDataset')())
+        return
+
+    # init parallel environment if nranks > 1
+    init_parallel_env()
+
+    # build trainer
+    trainer = Trainer(cfg, mode='eval')
+    # load weights
+    trainer.load_weights(cfg.weights)
+
+    # evaluation (sliced variant when --slice_infer is given)
+    if FLAGS.slice_infer:
+        trainer.evaluate_slice(
+            slice_size=FLAGS.slice_size,
+            overlap_ratio=FLAGS.overlap_ratio,
+            combine_method=FLAGS.combine_method,
+            match_threshold=FLAGS.match_threshold,
+            match_metric=FLAGS.match_metric)
+    else:
+        trainer.evaluate()
+
+
+def main():  # parse the CLI, merge it into cfg, pick the device, validate, evaluate
+    FLAGS = parse_args()
+    cfg = load_config(FLAGS.config)
+    merge_args(cfg, FLAGS)
+    merge_config(FLAGS.opt)
+
+    # disable npu in config by default
+    if 'use_npu' not in cfg:
+        cfg.use_npu = False
+
+    # disable xpu in config by default
+    if 'use_xpu' not in cfg:
+        cfg.use_xpu = False
+
+    if 'use_gpu' not in cfg:
+        cfg.use_gpu = False
+
+    # disable mlu in config by default
+    if 'use_mlu' not in cfg:
+        cfg.use_mlu = False
+
+    if cfg.use_gpu:  # first enabled flag wins: gpu > npu > xpu > mlu > cpu
+        paddle.set_device('gpu')
+    elif cfg.use_npu:
+        paddle.set_device('npu')
+    elif cfg.use_xpu:
+        paddle.set_device('xpu')
+    elif cfg.use_mlu:
+        paddle.set_device('mlu')
+    else:
+        paddle.set_device('cpu')
+
+    check_config(cfg)
+    check_gpu(cfg.use_gpu)
+    check_npu(cfg.use_npu)
+    check_xpu(cfg.use_xpu)
+    check_mlu(cfg.use_mlu)
+    check_version()
+
+    run(FLAGS, cfg)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/rtdetr_paddle/tools/export_model.py b/rtdetr_paddle/tools/export_model.py
new file mode 100644
index 0000000..621678c
--- /dev/null
+++ b/rtdetr_paddle/tools/export_model.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# add python path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
+sys.path.insert(0, parent_path)
+
+# ignore warning log
+import warnings
+warnings.filterwarnings('ignore')
+
+import paddle
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.utils.check import check_gpu, check_version, check_config
+from ppdet.utils.cli import ArgsParser
+from ppdet.engine import Trainer
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('export_model')
+
+
+def parse_args():  # build and parse the export CLI on top of ArgsParser
+    parser = ArgsParser()
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output_inference",
+        help="Directory for storing the output model files.")
+    parser.add_argument(
+        "--export_serving_model",
+        type=lambda v: v.lower() in ('true', '1', 'yes', 'y', 't'),  # type=bool parsed "False" as True
+        default=False,
+        help="Whether to export serving model or not.")
+    parser.add_argument(
+        "--slim_config",
+        default=None,
+        type=str,
+        help="Configuration file of slim method.")
+    args = parser.parse_args()
+    return args
+
+
+def run(FLAGS, cfg):  # export the inference model and, on request, a serving model
+    trainer = Trainer(cfg, mode='test')
+    # load weights
+    trainer.load_weights(cfg.weights)
+
+    # export model into FLAGS.output_dir
+    trainer.export(FLAGS.output_dir)
+
+    if FLAGS.export_serving_model:  # paddle_serving_client is imported only on this path
+        from paddle_serving_client.io import inference_model_to_serving
+        model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0]
+
+        inference_model_to_serving(
+            dirname="{}/{}".format(FLAGS.output_dir, model_name),
+            serving_server="{}/{}/serving_server".format(FLAGS.output_dir,
+                                                         model_name),
+            serving_client="{}/{}/serving_client".format(FLAGS.output_dir,
+                                                         model_name),
+            model_filename="model.pdmodel",
+            params_filename="model.pdiparams")
+
+
+def main():  # load the config, apply the FLAGS.opt overrides, run checks, export
+    paddle.set_device("cpu")  # the device is set to CPU unconditionally here
+    FLAGS = parse_args()
+    cfg = load_config(FLAGS.config)
+    merge_config(FLAGS.opt)
+
+    # FIXME: Temporarily solve the priority problem of FLAGS.opt
+    merge_config(FLAGS.opt)  # NOTE(review): second identical merge in a row; looks redundant here, confirm
+    check_config(cfg)
+    if 'use_gpu' not in cfg:
+        cfg.use_gpu = False
+    check_gpu(cfg.use_gpu)
+    check_version()
+
+    run(FLAGS, cfg)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/rtdetr_paddle/tools/infer.py b/rtdetr_paddle/tools/infer.py
new file mode 100755
index 0000000..485c6fa
--- /dev/null
+++ b/rtdetr_paddle/tools/infer.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# add python path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
+sys.path.insert(0, parent_path)
+
+# ignore warning log
+import warnings
+warnings.filterwarnings('ignore')
+import glob
+import ast
+
+import paddle
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.engine import Trainer
+from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config
+from ppdet.utils.cli import ArgsParser, merge_args
+
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('train')
+
+
+def parse_args():  # build and parse the inference CLI; bool options parse their text because type=bool maps "False" to True
+    parser = ArgsParser()
+    parser.add_argument(
+        "--infer_dir",
+        type=str,
+        default=None,
+        help="Directory for images to perform inference on.")
+    parser.add_argument(
+        "--infer_img",
+        type=str,
+        default=None,
+        help="Image path, has higher priority over --infer_dir")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output",
+        help="Directory for storing the output visualization files.")
+    parser.add_argument(
+        "--draw_threshold",
+        type=float,
+        default=0.5,
+        help="Threshold to reserve the result for visualization.")
+    parser.add_argument(
+        "--slim_config",
+        default=None,
+        type=str,
+        help="Configuration file of slim method.")
+    parser.add_argument(
+        "--use_vdl",
+        type=lambda v: v.lower() in ('true', '1', 'yes', 'y', 't'),
+        default=False,
+        help="Whether to record the data to VisualDL.")
+    parser.add_argument(
+        '--vdl_log_dir',
+        type=str,
+        default="vdl_log_dir/image",
+        help='VisualDL logging directory for image.')
+    parser.add_argument(
+        "--save_results",
+        type=lambda v: v.lower() in ('true', '1', 'yes', 'y', 't'),
+        default=False,
+        help="Whether to save inference results to output_dir.")
+    parser.add_argument(
+        "--slice_infer",
+        action='store_true',
+        help="Whether to slice the image and merge the inference results for small object detection."
+    )
+    parser.add_argument(
+        '--slice_size',
+        nargs='+',
+        type=int,
+        default=[640, 640],
+        help="Height of the sliced image.")
+    parser.add_argument(
+        "--overlap_ratio",
+        nargs='+',
+        type=float,
+        default=[0.25, 0.25],
+        help="Overlap height ratio of the sliced image.")
+    parser.add_argument(
+        "--combine_method",
+        type=str,
+        default='nms',
+        help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']."
+    )
+    parser.add_argument(
+        "--match_threshold",
+        type=float,
+        default=0.6,
+        help="Combine method matching threshold.")
+    parser.add_argument(
+        "--match_metric",
+        type=str,
+        default='ios',
+        help="Combine method matching metric, choose in ['iou', 'ios'].")
+    parser.add_argument(
+        "--visualize",
+        type=ast.literal_eval,
+        default=True,
+        help="Whether to save visualize results to output_dir.")
+    args = parser.parse_args()
+    return args
+
+
+def get_test_images(infer_dir, infer_img):
+    """
+    Get image path list in TEST mode: [infer_img] when it is given, else the sorted images found in infer_dir
+    """
+    assert infer_img is not None or infer_dir is not None, \
+        "--infer_img or --infer_dir should be set"
+    assert infer_img is None or os.path.isfile(infer_img), \
+            "{} is not a file".format(infer_img)
+    assert infer_dir is None or os.path.isdir(infer_dir), \
+            "{} is not a directory".format(infer_dir)
+
+    # infer_img has a higher priority
+    if infer_img and os.path.isfile(infer_img):
+        return [infer_img]
+
+    images = set()  # a set, so a file matched by two patterns is listed once
+    infer_dir = os.path.abspath(infer_dir)
+    assert os.path.isdir(infer_dir), \
+        "infer_dir {} is not a directory".format(infer_dir)
+    exts = ['jpg', 'jpeg', 'png', 'bmp']
+    exts += [ext.upper() for ext in exts]
+    for ext in exts:
+        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
+    images = sorted(images)  # sorted so the order is the same on every run
+
+    assert len(images) > 0, "no image found in {}".format(infer_dir)
+    logger.info("Found {} inference images in total.".format(len(images)))
+
+    return images
+
+
+def run(FLAGS, cfg):  # load weights, collect the input images, run (sliced) prediction
+    trainer = Trainer(cfg, mode='test')
+    trainer.load_weights(cfg.weights)
+    # get inference images
+    images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img)
+
+    # inference (sliced variant when --slice_infer is given)
+    if FLAGS.slice_infer:
+        trainer.slice_predict(
+            images,
+            slice_size=FLAGS.slice_size,
+            overlap_ratio=FLAGS.overlap_ratio,
+            combine_method=FLAGS.combine_method,
+            match_threshold=FLAGS.match_threshold,
+            match_metric=FLAGS.match_metric,
+            draw_threshold=FLAGS.draw_threshold,
+            output_dir=FLAGS.output_dir,
+            save_results=FLAGS.save_results,
+            visualize=FLAGS.visualize)
+    else:
+        trainer.predict(
+            images,
+            draw_threshold=FLAGS.draw_threshold,
+            output_dir=FLAGS.output_dir,
+            save_results=FLAGS.save_results,
+            visualize=FLAGS.visualize)
+
+
+def main():  # parse the CLI, merge it into cfg, pick the device, validate, infer
+    FLAGS = parse_args()
+    cfg = load_config(FLAGS.config)
+    merge_args(cfg, FLAGS)
+    merge_config(FLAGS.opt)
+
+    # disable npu in config by default
+    if 'use_npu' not in cfg:
+        cfg.use_npu = False
+
+    # disable xpu in config by default
+    if 'use_xpu' not in cfg:
+        cfg.use_xpu = False
+
+    if 'use_gpu' not in cfg:
+        cfg.use_gpu = False
+
+    # disable mlu in config by default
+    if 'use_mlu' not in cfg:
+        cfg.use_mlu = False
+
+    if cfg.use_gpu:  # first enabled flag wins: gpu > npu > xpu > mlu > cpu
+        paddle.set_device('gpu')
+    elif cfg.use_npu:
+        paddle.set_device('npu')
+    elif cfg.use_xpu:
+        paddle.set_device('xpu')
+    elif cfg.use_mlu:
+        paddle.set_device('mlu')
+    else:
+        paddle.set_device('cpu')
+
+    check_config(cfg)
+    check_gpu(cfg.use_gpu)
+    check_npu(cfg.use_npu)
+    check_xpu(cfg.use_xpu)
+    check_mlu(cfg.use_mlu)
+    check_version()
+
+    run(FLAGS, cfg)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/rtdetr_paddle/tools/slice_image.py b/rtdetr_paddle/tools/slice_image.py
new file mode 100644
index 0000000..f739d74
--- /dev/null
+++ b/rtdetr_paddle/tools/slice_image.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from tqdm import tqdm
+
+
+def slice_data(image_dir, dataset_json_path, output_dir, slice_size,
+               overlap_ratio):  # slice a COCO-format dataset via sahi's slice_coco script
+    try:  # imported here so that a failing import turns into an install hint
+        from sahi.scripts.slice_coco import slice
+    except Exception as e:
+        raise RuntimeError(
+            'Unable to use sahi to slice images, please install sahi, for example: `pip install sahi`, see https://github.com/obss/sahi'
+        ) from e
+    tqdm.write(
+        f" slicing for slice_size={slice_size}, overlap_ratio={overlap_ratio}")
+    slice(
+        image_dir=image_dir,
+        dataset_json_path=dataset_json_path,
+        output_dir=output_dir,
+        slice_size=slice_size,
+        overlap_ratio=overlap_ratio, )
+
+
+def main():  # parse the slicing options and forward them to slice_data
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--image_dir', type=str, default=None, help="The image folder path.")
+    parser.add_argument(
+        '--json_path', type=str, default=None, help="Dataset json path.")
+    parser.add_argument(
+        '--output_dir', type=str, default=None, help="Output dir.")
+    parser.add_argument(
+        '--slice_size', type=int, default=500, help="slice_size")
+    parser.add_argument(
+        '--overlap_ratio', type=float, default=0.25, help="overlap_ratio")
+    args = parser.parse_args()
+
+    slice_data(args.image_dir, args.json_path, args.output_dir, args.slice_size,
+               args.overlap_ratio)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rtdetr_paddle/tools/train.py b/rtdetr_paddle/tools/train.py
new file mode 100755
index 0000000..954b4ec
--- /dev/null
+++ b/rtdetr_paddle/tools/train.py
@@ -0,0 +1,183 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# add python path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2)))
+sys.path.insert(0, parent_path)
+
+# ignore warning log
+import warnings
+warnings.filterwarnings('ignore')
+
+import paddle
+
+from ppdet.core.workspace import load_config, merge_config
+from ppdet.engine import Trainer, init_parallel_env, set_random_seed, init_fleet_env
+from ppdet.utils.cli import ArgsParser, merge_args
+import ppdet.utils.check as check
+from ppdet.utils.logger import setup_logger
+logger = setup_logger('train')
+
+
+def parse_args():  # build and parse the training CLI; bool options parse their text because type=bool maps "False" to True
+    parser = ArgsParser()
+    parser.add_argument(
+        "--eval",
+        action='store_true',
+        default=False,
+        help="Whether to perform evaluation in train")
+    parser.add_argument(
+        "-r", "--resume", default=None, help="weights path for resume")
+    parser.add_argument(
+        "--slim_config",
+        default=None,
+        type=str,
+        help="Configuration file of slim method.")
+    parser.add_argument(
+        "--enable_ce",
+        type=lambda v: v.lower() in ('true', '1', 'yes', 'y', 't'),
+        default=False,
+        help="If set True, enable continuous evaluation job."
+        "This flag is only used for internal test.")
+    parser.add_argument(
+        "--amp",
+        action='store_true',
+        default=False,
+        help="Enable auto mixed precision training.")
+    parser.add_argument(
+        "--fleet", action='store_true', default=False, help="Use fleet or not")
+    parser.add_argument(
+        "--use_vdl",
+        type=lambda v: v.lower() in ('true', '1', 'yes', 'y', 't'),
+        default=False,
+        help="whether to record the data to VisualDL.")
+    parser.add_argument(
+        '--vdl_log_dir',
+        type=str,
+        default="vdl_log_dir/scalar",
+        help='VisualDL logging directory for scalar.')
+    parser.add_argument(
+        "--use_wandb",
+        type=lambda v: v.lower() in ('true', '1', 'yes', 'y', 't'),
+        default=False,
+        help="whether to record the data to wandb.")
+    parser.add_argument(
+        '--save_prediction_only',
+        action='store_true',
+        default=False,
+        help='Whether to save the evaluation results only')
+    parser.add_argument(
+        '--profiler_options',
+        type=str,
+        default=None,
+        help="The option of profiler, which should be in "
+        "format \"key1=value1;key2=value2;key3=value3\"."
+        "please see ppdet/utils/profiler.py for detail.")
+    parser.add_argument(
+        '--save_proposals',
+        action='store_true',
+        default=False,
+        help='Whether to save the train proposals')
+    parser.add_argument(
+        '--proposals_path',
+        type=str,
+        default="sniper/proposals.json",
+        help='Train proposals directory')
+    parser.add_argument(
+        "--to_static",
+        action='store_true',
+        default=False,
+        help="Enable dy2st to train.")
+
+    args = parser.parse_args()
+    return args
+
+
+def run(FLAGS, cfg):  # set up the distributed env, build the Trainer, load weights, train
+    # init fleet environment
+    if cfg.fleet:
+        init_fleet_env(cfg.get('find_unused_parameters', False))
+    else:
+        # init parallel environment if nranks > 1
+        init_parallel_env()
+
+    if FLAGS.enable_ce:  # the random seed is fixed to 0 in this mode
+        set_random_seed(0)
+
+    # build trainer
+    trainer = Trainer(cfg, mode='train')
+
+    # load weights: --resume takes precedence over cfg.pretrain_weights
+    if FLAGS.resume is not None:
+        trainer.resume_weights(FLAGS.resume)
+    elif 'pretrain_weights' in cfg and cfg.pretrain_weights:
+        trainer.load_weights(cfg.pretrain_weights)
+
+    # training (FLAGS.eval is passed through to Trainer.train)
+    trainer.train(FLAGS.eval)
+
+
+def main():  # parse the CLI, merge it into cfg, pick the device, validate, train
+    FLAGS = parse_args()
+    cfg = load_config(FLAGS.config)
+    merge_args(cfg, FLAGS)
+    merge_config(FLAGS.opt)
+
+    # disable npu in config by default
+    if 'use_npu' not in cfg:
+        cfg.use_npu = False
+
+    # disable xpu in config by default
+    if 'use_xpu' not in cfg:
+        cfg.use_xpu = False
+
+    if 'use_gpu' not in cfg:
+        cfg.use_gpu = False
+
+    # disable mlu in config by default
+    if 'use_mlu' not in cfg:
+        cfg.use_mlu = False
+
+    if cfg.use_gpu:  # first enabled flag wins: gpu > npu > xpu > mlu > cpu
+        paddle.set_device('gpu')
+    elif cfg.use_npu:
+        paddle.set_device('npu')
+    elif cfg.use_xpu:
+        paddle.set_device('xpu')
+    elif cfg.use_mlu:
+        paddle.set_device('mlu')
+    else:
+        paddle.set_device('cpu')
+
+    # FIXME: Temporarily solve the priority problem of FLAGS.opt
+    merge_config(FLAGS.opt)
+    check.check_config(cfg)
+    check.check_gpu(cfg.use_gpu)
+    check.check_npu(cfg.use_npu)
+    check.check_xpu(cfg.use_xpu)
+    check.check_mlu(cfg.use_mlu)
+    check.check_version()
+
+    run(FLAGS, cfg)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rtdetr_paddle/tools/x2coco.py b/rtdetr_paddle/tools/x2coco.py
new file mode 100644
index 0000000..78e8619
--- /dev/null
+++ b/rtdetr_paddle/tools/x2coco.py
@@ -0,0 +1,542 @@
+#!/usr/bin/env python
+# coding: utf-8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import glob
+import json
+import os
+import os.path as osp
+import shutil
+import xml.etree.ElementTree as ET
+
+import numpy as np
+import PIL.ImageDraw
+from tqdm import tqdm
+import cv2
+
+label_to_num = {}
+categories_list = []
+labels_list = []
+
+
+class MyEncoder(json.JSONEncoder):
+    """JSON encoder that also serializes numpy integers, floats and arrays."""
+
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        for np_kind, cast in ((np.integer, int), (np.floating, float)):
+            if isinstance(obj, np_kind):
+                return cast(obj)
+        return super(MyEncoder, self).default(obj)
+
+
+def images_labelme(data, num):
+    """Build a COCO image entry from a labelme JSON dict; the id is num + 1."""
+    entry = {
+        'height': data['imageHeight'],
+        'width': data['imageWidth'],
+        'id': num + 1,
+    }
+    raw_path = data['imagePath']
+    entry['file_name'] = raw_path.split('\\' if '\\' in raw_path else '/')[-1]
+    return entry
+
+
+def images_cityscape(data, num, img_file):
+    """Build a COCO image entry from cityscape-style JSON; the id is num + 1."""
+    return {
+        'height': data['imgHeight'],
+        'width': data['imgWidth'],
+        'id': num + 1,
+        'file_name': img_file}
+
+
+def categories(label, labels_list):
+    """Build a COCO category entry whose id is len(labels_list) + 1."""
+    return {
+        'supercategory': 'component',
+        'id': len(labels_list) + 1,
+        'name': label}
+
+
+def annotations_rectangle(points, label, image_num, object_num, label_to_num):  # COCO annotation for a rectangle
+    annotation = {}
+    seg_points = np.asarray(points).copy()
+    seg_points[1, :] = np.asarray(points)[2, :]  # rows 1 and 2 are swapped for the segmentation
+    seg_points[2, :] = np.asarray(points)[1, :]
+    annotation['segmentation'] = [list(seg_points.flatten())]
+    annotation['iscrowd'] = 0
+    annotation['image_id'] = image_num + 1  # ids are 1-based, the counters passed in are 0-based
+    annotation['bbox'] = list(  # [x, y, w, h] derived from points[0] and points[1] only
+        map(float, [
+            points[0][0], points[0][1], points[1][0] - points[0][0], points[1][
+                1] - points[0][1]
+        ]))
+    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]  # bbox width * height
+    annotation['category_id'] = label_to_num[label]
+    annotation['id'] = object_num + 1
+    return annotation
+
+
+def annotations_polygon(height, width, points, label, image_num, object_num,
+                        label_to_num):  # COCO annotation for a polygon shape
+    annotation = {}
+    annotation['segmentation'] = [list(np.asarray(points).flatten())]  # flattened point coordinates
+    annotation['iscrowd'] = 0
+    annotation['image_id'] = image_num + 1  # ids are 1-based, the counters passed in are 0-based
+    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))
+    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]  # bbox area, not the polygon area
+    annotation['category_id'] = label_to_num[label]
+    annotation['id'] = object_num + 1
+    return annotation
+
+
+def get_bbox(height, width, points):  # bounding box of a polygon drawn onto a height x width mask
+    polygons = points
+    mask = np.zeros([height, width], dtype=np.uint8)
+    mask = PIL.Image.fromarray(mask)
+    xy = list(map(tuple, polygons))
+    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)  # rasterize outline and interior
+    mask = np.array(mask, dtype=bool)
+    index = np.argwhere(mask == 1)  # (row, col) indices of the drawn pixels
+    rows = index[:, 0]
+    clos = index[:, 1]  # column indices
+    left_top_r = np.min(rows)
+    left_top_c = np.min(clos)
+    right_bottom_r = np.max(rows)
+    right_bottom_c = np.max(clos)
+    return [  # [min col, min row, col extent, row extent]; extents are max - min
+        left_top_c, left_top_r, right_bottom_c - left_top_c,
+        right_bottom_r - left_top_r
+    ]
+
+
+def deal_json(ds_type, img_path, json_path):  # build a COCO dict from labelme / cityscape json files
+    data_coco = {}
+    images_list = []
+    annotations_list = []
+    image_num = -1  # 0-based counters; the helpers add 1 to form the COCO ids
+    object_num = -1
+    for img_file in os.listdir(img_path):
+        img_label = os.path.splitext(img_file)[0]
+        if img_file.split('.')[
+                -1] not in ['bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG']:
+            continue
+        label_file = osp.join(json_path, img_label + '.json')  # json file named after the image
+        print('Generating dataset from:', label_file)
+        image_num = image_num + 1
+        with open(label_file) as f:
+            data = json.load(f)
+            if ds_type == 'labelme':
+                images_list.append(images_labelme(data, image_num))
+            elif ds_type == 'cityscape':
+                images_list.append(images_cityscape(data, image_num, img_file))
+            if ds_type == 'labelme':
+                for shapes in data['shapes']:
+                    object_num = object_num + 1
+                    label = shapes['label']
+                    if label not in labels_list:  # first time this label is seen: register a category
+                        categories_list.append(categories(label, labels_list))
+                        labels_list.append(label)
+                        label_to_num[label] = len(labels_list)
+                    p_type = shapes['shape_type']  # only 'polygon' and 'rectangle' produce annotations
+                    if p_type == 'polygon':
+                        points = shapes['points']
+                        annotations_list.append(
+                            annotations_polygon(data['imageHeight'], data[
+                                'imageWidth'], points, label, image_num,
+                                                object_num, label_to_num))
+
+                    if p_type == 'rectangle':
+                        (x1, y1), (x2, y2) = shapes['points']
+                        x1, x2 = sorted([x1, x2])  # normalize so (x1, y1) is the min corner
+                        y1, y2 = sorted([y1, y2])
+                        points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]]
+                        annotations_list.append(
+                            annotations_rectangle(points, label, image_num,
+                                                  object_num, label_to_num))
+            elif ds_type == 'cityscape':
+                for shapes in data['objects']:
+                    object_num = object_num + 1
+                    label = shapes['label']
+                    if label not in labels_list:  # first time this label is seen: register a category
+                        categories_list.append(categories(label, labels_list))
+                        labels_list.append(label)
+                        label_to_num[label] = len(labels_list)
+                    points = shapes['polygon']
+                    annotations_list.append(
+                        annotations_polygon(data['imgHeight'], data[
+                            'imgWidth'], points, label, image_num, object_num,
+                                            label_to_num))
+    data_coco['images'] = images_list
+    data_coco['categories'] = categories_list  # module-level list, shared across calls
+    data_coco['annotations'] = annotations_list
+    return data_coco
+
+
+def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path):
+    """Read the label list and the annotation id list of a VOC-style dataset.
+
+    Returns a (label -> 1-based id) dict and the list of annotation xml paths.
+    """
+    with open(labels_path, 'r') as label_file:
+        label_names = label_file.read().split()
+    label2id = {name: idx for idx, name in enumerate(label_names, start=1)}
+
+    with open(ann_ids_path, 'r') as ids_file:
+        ann_ids = [row.strip().split(' ')[-1] for row in ids_file]
+
+    ann_paths = [
+        os.path.join(ann_dir_path, aid if aid.endswith('xml') else aid + '.xml')
+        for aid in ann_ids
+    ]
+    return label2id, ann_paths
+
+
+def voc_get_image_info(annotation_root, im_id):
+    """Build the COCO `images` entry for one parsed VOC annotation.
+
+    `annotation_root` is the XML root element; its <filename> text is stored
+    unchanged as `file_name` and <size>/<width>, <size>/<height> are read as
+    floats. Raises AssertionError when <filename> is missing.
+    """
+    filename = annotation_root.findtext('filename')
+    assert filename is not None
+    size = annotation_root.find('size')
+    return {
+        'file_name': filename,
+        'height': float(size.findtext('height')),
+        'width': float(size.findtext('width')),
+        'id': im_id
+    }
+
+
+def voc_get_coco_annotation(obj, label2id):
+    """Convert one VOC <object> element into a COCO annotation dict.
+
+    The box becomes [xmin, ymin, width, height]; the caller adds `image_id`
+    and `id`. Asserts a known label and a positive-size box.
+    """
+    label = obj.findtext('name')
+    assert label in label2id, "label is not in label2id."
+    bndbox = obj.find('bndbox')
+    xmin, ymin, xmax, ymax = (
+        float(bndbox.findtext(tag)) for tag in ('xmin', 'ymin', 'xmax', 'ymax'))
+    assert xmax > xmin and ymax > ymin, "Box size error."
+    box_w, box_h = xmax - xmin, ymax - ymin
+    return {
+        'area': box_w * box_h,
+        'iscrowd': 0,
+        'bbox': [xmin, ymin, box_w, box_h],
+        'category_id': label2id[label],
+        'ignore': 0,
+    }
+
+
+def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file):
+    """Convert VOC xml annotations to one COCO json file.
+
+    Images get 0-based ids in the order of `annotation_paths`; boxes get
+    1-based ids running across the whole dataset. The result is written to
+    `output_dir`/`output_file`.
+    """
+    images, annotations = [], []
+    next_box_id = 1  # bounding box start id
+    print('Start converting !')
+    for image_id, xml_path in enumerate(tqdm(annotation_paths)):
+        # Read annotation xml
+        root = ET.parse(xml_path).getroot()
+        images.append(voc_get_image_info(root, image_id))
+
+        for obj in root.findall('object'):
+            entry = voc_get_coco_annotation(obj=obj, label2id=label2id)
+            entry.update({'image_id': image_id, 'id': next_box_id})
+            annotations.append(entry)
+            next_box_id += 1
+
+    coco = {
+        "images": images,
+        "type": "instances",
+        "annotations": annotations,
+        "categories": [{'supercategory': 'none', 'id': label_id, 'name': label}
+                       for label, label_id in label2id.items()],
+    }
+    target = os.path.join(output_dir, output_file)
+    with open(target, 'w') as f:
+        payload = json.dumps(coco)
+        f.write(payload)
+
+
+def widerface_to_cocojson(root_path):
+    """Convert the WIDER FACE train and val splits under `root_path` to COCO json.
+
+    Raises FileNotFoundError if a ground-truth file or image directory is missing."""
+    train_gt_txt = os.path.join(root_path, "wider_face_split", "wider_face_train_bbx_gt.txt")
+    val_gt_txt = os.path.join(root_path, "wider_face_split", "wider_face_val_bbx_gt.txt")
+    train_img_dir = os.path.join(root_path, "WIDER_train", "images")
+    val_img_dir = os.path.join(root_path, "WIDER_val", "images")
+    for required in (train_gt_txt, val_gt_txt, train_img_dir, val_img_dir):
+        if not os.path.exists(required):  # asserting a non-empty string never fails
+            raise FileNotFoundError(required)
+    for split, gt_txt, img_dir in (("train", train_gt_txt, train_img_dir), ("val", val_gt_txt, val_img_dir)):
+        save_path = os.path.join(root_path, "widerface_{}.json".format(split))
+        widerface_convert(gt_txt, img_dir, save_path)
+        print("Wider Face {} dataset converts sucess, the json path: {}".format(split, save_path))
+
+
+def widerface_convert(gt_txt, img_dir, save_path):
+    """Convert one WIDER FACE ground-truth txt into a COCO json at `save_path`.
+
+    Each record is an image path line, a box-count line, then the box lines
+    (one line is consumed even when the count is 0). Images missing under
+    `img_dir` are reported and omitted but still consume an image id.
+    """
+    images, annotations = [], []
+    bnd_id, im_id = 1, 0  # box ids start at 1, image ids at 0
+    print('Start converting !')
+    with open(gt_txt) as fd:
+        # Blank lines (e.g. at end of file) would break the record parsing below.
+        lines = [line for line in fd if line.strip()]
+    i = 0
+    while i < len(lines):
+        image_name = lines[i].strip()
+        bbox_num = int(lines[i + 1].strip())
+        i += 2
+        img_info = get_widerface_image_info(img_dir, image_name, im_id)
+        if img_info:
+            images.append(img_info)
+            for j in range(i, i + bbox_num):
+                anno = get_widerface_ann_info(lines[j])
+                anno.update({'image_id': im_id, 'id': bnd_id})
+                annotations.append(anno)
+                bnd_id += 1
+        else:
+            print("The image dose not exist: {}".format(os.path.join(img_dir, image_name)))
+        i += bbox_num or 1  # a zero-count record still occupies one box line
+        im_id += 1
+    categories = [{'supercategory': 'none', 'id': 0, 'name': "human_face"}]
+    with open(save_path, 'w') as f:
+        f.write(json.dumps({"images": images, "type": "instances",
+                            "annotations": annotations, "categories": categories}))
+
+
+def get_widerface_image_info(img_root, img_relative_path, img_id):
+    """Return the COCO image entry for a WIDER FACE image, or {} if unusable.
+
+    `file_name` keeps the last two components of `img_root` as its prefix.
+    """
+    image_path = os.path.join(img_root, img_relative_path)
+    img = cv2.imread(image_path) if os.path.exists(image_path) else None
+    if img is None:  # missing on disk, or cv2.imread could not decode it
+        return {}
+    prefix = os.path.join(os.path.basename(os.path.dirname(img_root)), os.path.basename(img_root))
+    return {"file_name": os.path.join(prefix, img_relative_path),
+            "height": img.shape[0], "width": img.shape[1], "id": img_id}
+
+
+def get_widerface_ann_info(info):
+    """Parse one WIDER FACE box line into a COCO-style annotation dict.
+
+    Integers in order: 4 bbox values, then the six attributes named below.
+    """
+    fields = list(map(int, info.split()))
+    attr_names = ('blur', 'expression', 'illumination', 'invalid', 'occlusion', 'pose')
+    anno = {
+        'area': fields[2] * fields[3],
+        'iscrowd': 0,
+        'bbox': fields[:4],
+        'category_id': 0,
+        'ignore': 0,
+    }
+    anno.update((name, fields[pos]) for pos, name in enumerate(attr_names, 4))
+    return anno
+
+
+def _copy_images_to_splits(image_dir, output_dir, train_num, val_num):
+    """Copy the images of `image_dir` into the split folders of `output_dir`.
+
+    Files are taken in `os.listdir` order: the first `train_num` go to
+    `train`, the next `val_num` to `val` and all remaining ones to `test`.
+    An image whose split folder does not exist is not copied.
+
+    Args:
+        image_dir: directory holding the source images.
+        output_dir: directory containing the train/val/test folders.
+        train_num: number of images assigned to the train split.
+        val_num: number of images assigned to the val split.
+    """
+    for count, img_name in enumerate(os.listdir(image_dir), 1):
+        if count <= train_num:
+            split = 'train'
+        elif count <= train_num + val_num:
+            split = 'val'
+        else:
+            split = 'test'
+        split_dir = output_dir + '/' + split + '/'
+        if osp.exists(split_dir):
+            shutil.copyfile(
+                osp.join(image_dir, img_name), osp.join(split_dir, img_name))
+
+
+def _dump_split_annotations(args, split):
+    """Write `<output_dir>/annotations/instance_<split>.json` for one split.
+
+    The content is the dict returned by `deal_json` for the split's image
+    folder, serialized with `MyEncoder`. The file is opened in a `with`
+    block so that its handle is closed even if serialization fails.
+
+    Args:
+        args: parsed command-line namespace of `main`.
+        split: 'train', 'val' or 'test'.
+    """
+    data_coco = deal_json(args.dataset_type,
+                          args.output_dir + '/' + split,
+                          args.json_input_dir)
+    json_path = osp.join(args.output_dir + '/annotations',
+                         'instance_{}.json'.format(split))
+    with open(json_path, 'w') as f:
+        json.dump(data_coco, f, indent=4, cls=MyEncoder)
+
+
+def main():
+    """Command-line entry point: convert a dataset to COCO json.
+
+    What happens depends on --dataset_type:
+
+    * `voc`: needs --voc_anno_dir, --voc_anno_list and --voc_label_list and
+      writes --voc_out_name into --output_dir.
+    * `widerface`: needs --widerface_root_dir; the converter is run on that
+      directory.
+    * `labelme` / `cityscape`: need --json_input_dir and --image_input_dir.
+      Images are copied into train/val/test folders under --output_dir
+      following the three proportions (which must sum to 1), then one
+      annotation json is written per split whose proportion is non-zero.
+
+    Invalid arguments are reported through `parser.error`, which prints the
+    message to stderr and exits with status 2.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        '--dataset_type',
+        help='the type of dataset, can be `voc`, `widerface`, `labelme` or `cityscape`')
+    parser.add_argument('--json_input_dir', help='input annotated directory')
+    parser.add_argument('--image_input_dir', help='image directory')
+    parser.add_argument(
+        '--output_dir', help='output dataset directory', default='./')
+    parser.add_argument(
+        '--train_proportion',
+        help='the proportion of train dataset',
+        type=float,
+        default=1.0)
+    parser.add_argument(
+        '--val_proportion',
+        help='the proportion of validation dataset',
+        type=float,
+        default=0.0)
+    parser.add_argument(
+        '--test_proportion',
+        help='the proportion of test dataset',
+        type=float,
+        default=0.0)
+    parser.add_argument(
+        '--voc_anno_dir',
+        help='In Voc format dataset, path to annotation files directory.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_anno_list',
+        help='In Voc format dataset, path to annotation files ids list.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_label_list',
+        help='In Voc format dataset, path to label list. The content of each line is a category.',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--voc_out_name',
+        type=str,
+        default='voc.json',
+        help='In Voc format dataset, path to output json file')
+    parser.add_argument(
+        '--widerface_root_dir',
+        help='The root_path for wider face dataset, which contains `wider_face_split`, `WIDER_train` and `WIDER_val`.And the json file will save in this path',
+        type=str,
+        default=None)
+    args = parser.parse_args()
+    supported = ['voc', 'labelme', 'cityscape', 'widerface']
+    if args.dataset_type not in supported:
+        # parser.error exits with a non-zero status, so calling scripts can
+        # detect the failure.
+        parser.error('--dataset_type must be one of: ' + ', '.join(supported))
+
+    if args.dataset_type == 'voc':
+        if not (args.voc_anno_dir and args.voc_anno_list and args.voc_label_list):
+            parser.error('voc needs --voc_anno_dir, --voc_anno_list and --voc_label_list')
+        label2id, ann_paths = voc_get_label_anno(
+            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)
+        voc_xmls_to_cocojson(
+            annotation_paths=ann_paths,
+            label2id=label2id,
+            output_dir=args.output_dir,
+            output_file=args.voc_out_name)
+    elif args.dataset_type == "widerface":
+        if not args.widerface_root_dir:
+            parser.error('widerface needs --widerface_root_dir')
+        widerface_to_cocojson(args.widerface_root_dir)
+    else:
+        # An omitted option is None; guard it before calling os.path.exists.
+        if not (args.json_input_dir and os.path.exists(args.json_input_dir)):
+            parser.error('The json folder does not exist!')
+        if not (args.image_input_dir and os.path.exists(args.image_input_dir)):
+            parser.error('The image folder does not exist!')
+        splits = (
+            ('train', args.train_proportion),
+            ('val', args.val_proportion),
+            ('test', args.test_proportion),
+        )
+        # `not ... <` (rather than `>=`) also rejects NaN proportions.
+        if not abs(sum(share for _, share in splits) - 1.0) < 1e-5:
+            parser.error('The sum of the train, validation and test '
+                         'proportions must be 1!')
+
+        # Allocate the dataset.
+        total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))
+        # int() truncates; any image beyond train_num + val_num is routed to
+        # the test split by the copy step.
+        train_num = int(total_num * args.train_proportion)
+        val_num = int(total_num * args.val_proportion)
+        # Only splits with a non-zero proportion get a folder.
+        for split, proportion in splits:
+            split_dir = args.output_dir + '/' + split
+            if proportion != 0 and not os.path.exists(split_dir):
+                os.makedirs(split_dir)
+        _copy_images_to_splits(
+            args.image_input_dir, args.output_dir, train_num, val_num)
+
+        # Deal with the json files.
+        if not os.path.exists(args.output_dir + '/annotations'):
+            os.makedirs(args.output_dir + '/annotations')
+        # One annotation file per split with a non-zero proportion.
+        for split, proportion in splits:
+            if proportion != 0:
+                _dump_split_annotations(args, split)
+
+
+if __name__ == '__main__':  # run the conversion only when executed as a script
+    main()
diff --git a/rtdetr_pytorch/README.md b/rtdetr_pytorch/README.md
new file mode 100644
index 0000000..e16c898
--- /dev/null
+++ b/rtdetr_pytorch/README.md
@@ -0,0 +1,111 @@
+## TODO
+<details>
+<summary> see details </summary>
+
+- [x] Training
+- [x] Evaluation
+- [x] Export onnx
+- [x] Upload source code
+- [x] Upload weights converted from paddle, see [*links*](https://github.com/lyuwenyu/RT-DETR/issues/42)
+- [x] Align training details with the [*paddle version*](../rtdetr_paddle/)
+- [x] Tuning rtdetr based on [*pretrained weights*](https://github.com/lyuwenyu/RT-DETR/issues/42)
+
+</details>
+
+
+## Model Zoo
+
+| Model | Dataset | Input Size | AP<sup>val</sup> | AP<sub>50</sub><sup>val</sup> | #Params(M) | FPS |  checkpoint |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+rtdetr_r18vd | COCO | 640 | 46.4 | 63.7 | 20 | 217 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth)
+rtdetr_r34vd | COCO | 640 | 48.9 | 66.8 | 31 | 161 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth)
+rtdetr_r50vd_m | COCO | 640 | 51.3 | 69.5 | 36 | 145 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth)
+rtdetr_r50vd | COCO | 640 | 53.1 | 71.2| 42 | 108 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth)
+rtdetr_r101vd | COCO | 640 | 54.3 | 72.8 | 76 | 74 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth)
+rtdetr_r18vd | COCO+Objects365 | 640 | 49.0 | 66.5 | 20 | 217 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth)
+rtdetr_r50vd | COCO+Objects365 | 640 | 55.2 | 73.4 | 42 | 108 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_2x_coco_objects365_from_paddle.pth)
+rtdetr_r101vd | COCO+Objects365 | 640 | 56.2 | 74.5 | 76 | 74 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_2x_coco_objects365_from_paddle.pth)
+rtdetr_regnet | COCO | 640 | 51.6 | 69.6 | 38 | 67 | [url<sup>*</sup>](https://drive.google.com/file/d/1K2EXJgnaEUJcZCLULHrZ492EF4PdgVp9/view?usp=sharing)
+rtdetr_dla34 | COCO | 640 | 49.6 | 67.4  | 34 | 83 | [url<sup>*</sup>](https://drive.google.com/file/d/1_rVpl-jIelwy2LDT3E4vdM4KCLBcOtzZ/view?usp=sharing)
+
+Notes
+- `COCO + Objects365` in the table means the model was finetuned on `COCO` starting from weights pretrained on `Objects365`.
+- `url`<sup>`*`</sup> is the URL of pretrained weights converted from the paddle model to save energy. *There may be slight differences between this table and the paper.*
+<!-- - `FPS` is evaluated on a single T4 GPU with $batch\\_size = 1$ and $tensorrt\\_fp16$ mode -->
+
+## Quick start
+
+<details>
+<summary>Install</summary>
+
+```bash
+pip install -r requirements.txt
+```
+
+</details>
+
+
+<details>
+<summary>Data</summary>
+
+- Download and extract COCO 2017 train and val images.
+```
+path/to/coco/
+  annotations/  # annotation json files
+  train2017/    # train images
+  val2017/      # val images
+```
+- Modify config [`img_folder`, `ann_file`](configs/dataset/coco_detection.yml)
+</details>
+
+
+
+<details>
+<summary>Training & Evaluation</summary>
+
+- Training on a Single GPU:
+
+```shell
+# training on single-gpu
+export CUDA_VISIBLE_DEVICES=0
+python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml
+```
+
+- Training on Multiple GPUs:
+
+```shell
+# train on multi-gpu
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+torchrun --nproc_per_node=4 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml
+```
+
+- Evaluation on Multiple GPUs:
+
+```shell
+# val on multi-gpu
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+torchrun --nproc_per_node=4 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml -r path/to/checkpoint --test-only
+```
+
+</details>
+
+
+
+<details>
+<summary>Export</summary>
+
+```shell
+python tools/export_onnx.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -r path/to/checkpoint --check
+```
+</details>
+
+
+
+
+<details open>
+<summary>Train custom data</summary>
+
+1. Set `remap_mscoco_category: False`. This variable only applies to the MS-COCO dataset. If you want to use the `remap_mscoco_category` logic on your own dataset, please modify the variable [`mscoco_category2name`](https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetr_pytorch/src/data/coco/coco_dataset.py#L154) to match your dataset.
+
+2. Add `-t path/to/checkpoint` (optional) to fine-tune rtdetr from a pretrained checkpoint. See [training script details](./tools/README.md).
+</details>
diff --git a/rtdetr_pytorch/configs/dataset/coco_detection.yml b/rtdetr_pytorch/configs/dataset/coco_detection.yml
new file mode 100644
index 0000000..f71a4ef
--- /dev/null
+++ b/rtdetr_pytorch/configs/dataset/coco_detection.yml
@@ -0,0 +1,34 @@
+task: detection
+
+num_classes: 80
+remap_mscoco_category: True
+
+train_dataloader: 
+  type: DataLoader
+  dataset: 
+    type: CocoDetection
+    img_folder: ./dataset/coco/train2017/
+    ann_file: ./dataset/coco/annotations/instances_train2017.json
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  batch_size: 8
+  num_workers: 4
+  drop_last: True 
+
+
+val_dataloader:
+  type: DataLoader
+  dataset: 
+    type: CocoDetection
+    img_folder: ./dataset/coco/val2017/
+    ann_file: ./dataset/coco/annotations/instances_val2017.json
+    transforms:
+      type: Compose
+      ops: ~ 
+
+  shuffle: False
+  batch_size: 8
+  num_workers: 4
+  drop_last: False
\ No newline at end of file
diff --git a/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml
new file mode 100644
index 0000000..e3e6bc1
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml
@@ -0,0 +1,39 @@
+# num_classes: 91
+# remap_mscoco_category: True
+
+train_dataloader: 
+  dataset: 
+    return_masks: False
+    transforms:
+      ops:
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBox, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        # - {type: Resize, size: 639, max_size: 640}
+        # - {type: PadToSize, spatial_size: 640}
+        - {type: ToImageTensor}
+        - {type: ConvertDtype}
+        - {type: SanitizeBoundingBox, min_size: 1}
+        - {type: ConvertBox, out_fmt: 'cxcywh', normalize: True}
+  shuffle: True
+  batch_size: 4
+  num_workers: 4
+  collate_fn: default_collate_fn
+
+
+val_dataloader:
+  dataset: 
+    transforms:
+      ops: 
+        # - {type: Resize, size: 639, max_size: 640}
+        # - {type: PadToSize, spatial_size: 640}
+        - {type: Resize, size: [640, 640]}
+        - {type: ToImageTensor}
+        - {type: ConvertDtype}
+  shuffle: False
+  batch_size: 8
+  num_workers: 4
+  collate_fn: default_collate_fn
diff --git a/rtdetr_pytorch/configs/rtdetr/include/dataloader_regnet.yml b/rtdetr_pytorch/configs/rtdetr/include/dataloader_regnet.yml
new file mode 100644
index 0000000..ba0607b
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/dataloader_regnet.yml
@@ -0,0 +1,39 @@
+# num_classes: 91
+# remap_mscoco_category: True
+
+train_dataloader: 
+  dataset: 
+    return_masks: False
+    transforms:
+      ops:
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBox, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        # - {type: Resize, size: 639, max_size: 640}
+        # - {type: PadToSize, spatial_size: 640}
+        - {type: ToImageTensor}
+        - {type: ConvertDtype}
+        - {type: SanitizeBoundingBox, min_size: 1}
+        - {type: ConvertBox, out_fmt: 'cxcywh', normalize: True}
+  shuffle: True
+  batch_size: 8
+  num_workers: 2
+  collate_fn: default_collate_fn
+
+
+val_dataloader:
+  dataset: 
+    transforms:
+      ops: 
+        # - {type: Resize, size: 639, max_size: 640}
+        # - {type: PadToSize, spatial_size: 640}
+        - {type: Resize, size: [640, 640]}
+        - {type: ToImageTensor}
+        - {type: ConvertDtype}
+  shuffle: False
+  batch_size: 8
+  num_workers: 2
+  collate_fn: default_collate_fn
diff --git a/rtdetr_pytorch/configs/rtdetr/include/optimizer.yml b/rtdetr_pytorch/configs/rtdetr/include/optimizer.yml
new file mode 100644
index 0000000..af2ad65
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/optimizer.yml
@@ -0,0 +1,36 @@
+
+use_ema: True 
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+
+
+find_unused_parameters: True 
+
+epoches: 72
+clip_max_norm: 0.1
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: 'backbone'
+      lr: 0.00001
+    - 
+      params: '^(?=.*encoder(?=.*bias|.*norm.*weight)).*$'
+      weight_decay: 0.
+    -
+      params: '^(?=.*decoder(?=.*bias|.*norm.*weight)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [1000]
+  gamma: 0.1
+
diff --git a/rtdetr_pytorch/configs/rtdetr/include/optimizer_regnet.yml b/rtdetr_pytorch/configs/rtdetr/include/optimizer_regnet.yml
new file mode 100644
index 0000000..52bd7a3
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/optimizer_regnet.yml
@@ -0,0 +1,33 @@
+
+use_ema: True 
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+
+
+find_unused_parameters: True 
+
+epoches: 72
+clip_max_norm: 0.1
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*encoder(?=.*bias|.*norm.*weight)).*$'
+      weight_decay: 0.
+    -
+      params: '^(?=.*decoder(?=.*bias|.*norm.*weight)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [1000]
+  gamma: 0.1
+
diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_dla34.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_dla34.yml
new file mode 100644
index 0000000..209d344
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_dla34.yml
@@ -0,0 +1,78 @@
+task: detection
+
+model: RTDETR
+criterion: SetCriterion
+postprocessor: RTDETRPostProcessor
+
+
+RTDETR: 
+  backbone: DLANet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformer
+  multi_scale: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
+
+DLANet:
+  dla: dla34
+  pretrained: True
+  return_idx: [1, 2, 3]
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+  pe_temperature: 10000
+  
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+  # eval
+  eval_spatial_size: [640, 640]
+
+
+RTDETRTransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_queries: 300
+
+  num_decoder_layers: 6
+  num_denoising: 100
+  
+  eval_idx: -1
+  eval_spatial_size: [640, 640]
+
+
+use_focal_loss: True
+
+RTDETRPostProcessor:
+  num_top_queries: 300
+
+
+SetCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    # use_focal_loss: True 
+    alpha: 0.25
+    gamma: 2.0
+
+
+
diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml
new file mode 100644
index 0000000..7f2e1f3
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml
@@ -0,0 +1,81 @@
+task: detection
+
+model: RTDETR
+criterion: SetCriterion
+postprocessor: RTDETRPostProcessor
+
+
+RTDETR: 
+  backbone: PResNet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformer
+  multi_scale: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
+
+PResNet:
+  depth: 50
+  variant: d
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  num_stages: 4
+  freeze_norm: True
+  pretrained: True 
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+  pe_temperature: 10000
+  
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+  # eval
+  eval_spatial_size: [640, 640]
+
+
+RTDETRTransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_queries: 300
+
+  num_decoder_layers: 6
+  num_denoising: 100
+  
+  eval_idx: -1
+  eval_spatial_size: [640, 640]
+
+
+use_focal_loss: True
+
+RTDETRPostProcessor:
+  num_top_queries: 300
+
+
+SetCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    # use_focal_loss: True 
+    alpha: 0.25
+    gamma: 2.0
+
+
+
diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_regnet.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_regnet.yml
new file mode 100644
index 0000000..0bc8cce
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_regnet.yml
@@ -0,0 +1,77 @@
+task: detection
+
+model: RTDETR
+criterion: SetCriterion
+postprocessor: RTDETRPostProcessor
+
+
+RTDETR: 
+  backbone: RegNet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformer
+  multi_scale: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
+
+
+RegNet:
+  return_idx: [1, 2, 3]
+  configuration: RegNetConfig()
+
+HybridEncoder:
+  in_channels: [192, 512, 1088]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+  pe_temperature: 10000
+  
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+  # eval
+  eval_spatial_size: [640, 640]
+
+
+RTDETRTransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_queries: 300
+
+  num_decoder_layers: 6
+  num_denoising: 100
+  
+  eval_idx: -1
+  eval_spatial_size: [640, 640]
+
+
+use_focal_loss: True
+
+RTDETRPostProcessor:
+  num_top_queries: 300
+
+
+SetCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    # use_focal_loss: True 
+    alpha: 0.25
+    gamma: 2.0
+
+
+
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_dla34_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_dla34_6x_coco.yml
new file mode 100644
index 0000000..81d8339
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_dla34_6x_coco.yml
@@ -0,0 +1,9 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_dla34.yml',
+]
+
+output_dir: ./output/rtdetr_dla34_6x_coco
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
new file mode 100644
index 0000000..c6be6e0
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
@@ -0,0 +1,28 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+PResNet:
+  depth: 101
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+
+RTDETRTransformer:
+  feat_channels: [384, 384, 384]
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: 'backbone'
+      lr: 0.000001
\ No newline at end of file
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
new file mode 100644
index 0000000..791dd4b
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
@@ -0,0 +1,49 @@
+
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetr_r18vd_6x_coco
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformer:
+  eval_idx: -1
+  num_decoder_layers: 3
+  num_denoising: 100
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?=.*norm).*$'
+      lr: 0.00001
+      weight_decay: 0.
+    - 
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
new file mode 100644
index 0000000..e7779a3
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
@@ -0,0 +1,48 @@
+
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetr_r34vd_6x_coco
+
+
+PResNet:
+  depth: 34
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformer:
+  num_decoder_layers: 4
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      weight_decay: 0.
+      lr: 0.00001
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
new file mode 100644
index 0000000..1294971
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
@@ -0,0 +1,9 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+output_dir: ./output/rtdetr_r50vd_6x_coco
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
new file mode 100644
index 0000000..6e61823
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
@@ -0,0 +1,16 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+output_dir: ./output/rtdetr_r50vd_m_6x_coco
+
+
+HybridEncoder:
+  expansion: 0.5
+
+RTDETRTransformer:
+  eval_idx: 2 # use the 3rd decoder layer for eval
\ No newline at end of file
diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_regnet_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_regnet_6x_coco.yml
new file mode 100644
index 0000000..a5d8672
--- /dev/null
+++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_regnet_6x_coco.yml
@@ -0,0 +1,9 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader_regnet.yml',
+  './include/optimizer_regnet.yml',
+  './include/rtdetr_regnet.yml',
+]
+
+output_dir: ./output/rtdetr_regnet_6x_coco
diff --git a/rtdetr_pytorch/configs/runtime.yml b/rtdetr_pytorch/configs/runtime.yml
new file mode 100644
index 0000000..f08620f
--- /dev/null
+++ b/rtdetr_pytorch/configs/runtime.yml
@@ -0,0 +1,17 @@
+sync_bn: True
+find_unused_parameters: False
+
+
+use_amp: False
+
+scaler:
+  type: GradScaler
+  enabled: True
+
+
+use_ema: False
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+
diff --git a/rtdetr_pytorch/requirements.txt b/rtdetr_pytorch/requirements.txt
new file mode 100644
index 0000000..eb0fb1c
--- /dev/null
+++ b/rtdetr_pytorch/requirements.txt
@@ -0,0 +1,8 @@
+torch==2.0.1
+torchvision==0.15.2
+onnx==1.14.0
+onnxruntime==1.15.1
+pycocotools
+PyYAML
+scipy
+transformers
diff --git a/rtdetr_pytorch/src/__init__.py b/rtdetr_pytorch/src/__init__.py
new file mode 100644
index 0000000..6cb1033
--- /dev/null
+++ b/rtdetr_pytorch/src/__init__.py
@@ -0,0 +1,5 @@
+
+from . import data 
+from . import nn
+from . import optim
+from . import zoo
diff --git a/rtdetr_pytorch/src/core/__init__.py b/rtdetr_pytorch/src/core/__init__.py
new file mode 100644
index 0000000..35c455c
--- /dev/null
+++ b/rtdetr_pytorch/src/core/__init__.py
@@ -0,0 +1,7 @@
+"""by lyuwenyu
+"""
+
+# from .yaml_utils import register, create, load_config, merge_config, merge_dict
+from .yaml_utils import *
+from .config import BaseConfig
+from .yaml_config import YAMLConfig
diff --git a/rtdetr_pytorch/src/core/config.py b/rtdetr_pytorch/src/core/config.py
new file mode 100644
index 0000000..cf803ef
--- /dev/null
+++ b/rtdetr_pytorch/src/core/config.py
@@ -0,0 +1,264 @@
+"""by lyuwenyu
+"""
+
+from pprint import pprint
+import torch 
+import torch.nn as nn 
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LRScheduler
+from torch.cuda.amp.grad_scaler import GradScaler
+
+from typing import Callable, List, Dict
+
+
+__all__ = ['BaseConfig', ]
+
+
+
+class BaseConfig(object):
+    # TODO property
+
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.task :str = None 
+        
+        self._model :nn.Module = None 
+        self._postprocessor :nn.Module = None 
+        self._criterion :nn.Module = None 
+        self._optimizer :Optimizer = None 
+        self._lr_scheduler :LRScheduler = None 
+        self._train_dataloader :DataLoader = None 
+        self._val_dataloader :DataLoader = None 
+        self._ema :nn.Module = None 
+        self._scaler :GradScaler = None 
+
+        self.train_dataset :Dataset = None
+        self.val_dataset :Dataset = None
+        self.num_workers :int = 0
+        self.collate_fn :Callable = None
+
+        self.batch_size :int = None
+        self._train_batch_size :int = None
+        self._val_batch_size :int = None
+        self._train_shuffle: bool = None  
+        self._val_shuffle: bool = None 
+
+        self.evaluator :Callable[[nn.Module, DataLoader, str], ] = None
+
+        # runtime
+        self.resume :str = None
+        self.tuning :str = None
+
+        self.epoches :int = None
+        self.last_epoch :int = -1
+        self.end_epoch :int = None
+
+        self.use_amp :bool = False 
+        self.use_ema :bool = False 
+        self.sync_bn :bool = False 
+        self.clip_max_norm : float = None
+        self.find_unused_parameters :bool = None
+        # self.ema_decay: float = 0.9999
+        # self.grad_clip_: Callable = None
+
+        self.log_dir :str = './logs/'
+        self.log_step :int = 10
+        self._output_dir :str = None
+        self._print_freq :int = None 
+        self.checkpoint_step :int = 1
+
+        # self.device :str = torch.device('cpu')
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = torch.device(device)
+
+
+    @property
+    def model(self, ) -> nn.Module:
+        return self._model 
+    
+    @model.setter
+    def model(self, m):
+        assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class'
+        self._model = m 
+
+    @property
+    def postprocessor(self, ) -> nn.Module:
+        return self._postprocessor
+    
+    @postprocessor.setter
+    def postprocessor(self, m):
+        assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class'
+        self._postprocessor = m 
+
+    @property
+    def criterion(self, ) -> nn.Module:
+        return self._criterion
+    
+    @criterion.setter
+    def criterion(self, m):
+        assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class'
+        self._criterion = m 
+
+    @property
+    def optimizer(self, ) -> Optimizer:
+        return self._optimizer
+    
+    @optimizer.setter
+    def optimizer(self, m):
+        assert isinstance(m, Optimizer), f'{type(m)} != optim.Optimizer, please check your model class'
+        self._optimizer = m 
+
+    @property
+    def lr_scheduler(self, ) -> LRScheduler:
+        return self._lr_scheduler
+    
+    @lr_scheduler.setter
+    def lr_scheduler(self, m):
+        assert isinstance(m, LRScheduler), f'{type(m)} != LRScheduler, please check your model class'
+        self._lr_scheduler = m 
+
+
+    @property
+    def train_dataloader(self):
+        if self._train_dataloader is None and self.train_dataset is not None:
+            loader = DataLoader(self.train_dataset, 
+                                batch_size=self.train_batch_size, 
+                                num_workers=self.num_workers, 
+                                collate_fn=self.collate_fn,
+                                shuffle=self.train_shuffle, )
+            loader.shuffle = self.train_shuffle
+            self._train_dataloader = loader
+
+        return self._train_dataloader
+
+    @train_dataloader.setter
+    def train_dataloader(self, loader):
+        self._train_dataloader = loader 
+
+    @property
+    def val_dataloader(self):
+        if self._val_dataloader is None and self.val_dataset is not None:
+            loader = DataLoader(self.val_dataset, 
+                                batch_size=self.val_batch_size, 
+                                num_workers=self.num_workers, 
+                                drop_last=False,
+                                collate_fn=self.collate_fn, 
+                                shuffle=self.val_shuffle)
+            loader.shuffle = self.val_shuffle
+            self._val_dataloader = loader
+
+        return self._val_dataloader
+    
+    @val_dataloader.setter
+    def val_dataloader(self, loader):
+        self._val_dataloader = loader 
+
+
+    # TODO method
+    # @property
+    # def ema(self, ) -> nn.Module:
+    #     if self._ema is None and self.use_ema and self.model is not None:
+    #         self._ema = ModelEMA(self.model, self.ema_decay)
+    #     return self._ema
+
+    @property
+    def ema(self, ) -> nn.Module:
+        return self._ema 
+
+    @ema.setter
+    def ema(self, obj):
+        self._ema = obj
+    
+
+    @property
+    def scaler(self) -> GradScaler: 
+        if self._scaler is None and self.use_amp and torch.cuda.is_available():
+            self._scaler = GradScaler()
+        return self._scaler
+    
+    @scaler.setter
+    def scaler(self, obj: GradScaler):
+        self._scaler = obj
+
+
+    @property
+    def val_shuffle(self):
+        if self._val_shuffle is None:
+            print('warning: set default val_shuffle=False')
+            return False
+        return self._val_shuffle
+
+    @val_shuffle.setter
+    def val_shuffle(self, shuffle):
+        assert isinstance(shuffle, bool), 'shuffle must be bool'
+        self._val_shuffle = shuffle
+
+    @property
+    def train_shuffle(self):
+        if self._train_shuffle is None:
+            print('warning: set default train_shuffle=True')
+            return True
+        return self._train_shuffle
+
+    @train_shuffle.setter
+    def train_shuffle(self, shuffle):
+        assert isinstance(shuffle, bool), 'shuffle must be bool'
+        self._train_shuffle = shuffle
+
+
+    @property
+    def train_batch_size(self):
+        if self._train_batch_size is None and isinstance(self.batch_size, int):
+            print(f'warning: set train_batch_size=batch_size={self.batch_size}')
+            return self.batch_size
+        return self._train_batch_size
+
+    @train_batch_size.setter
+    def train_batch_size(self, batch_size):
+        assert isinstance(batch_size, int), 'batch_size must be int'
+        self._train_batch_size = batch_size
+
+    @property
+    def val_batch_size(self):
+        if self._val_batch_size is None:
+            print(f'warning: set val_batch_size=batch_size={self.batch_size}')
+            return self.batch_size
+        return self._val_batch_size
+
+    @val_batch_size.setter
+    def val_batch_size(self, batch_size):
+        assert isinstance(batch_size, int), 'batch_size must be int'
+        self._val_batch_size = batch_size
+
+
+    @property
+    def output_dir(self):
+        if self._output_dir is None:
+            return self.log_dir
+        return self._output_dir
+
+    @output_dir.setter
+    def output_dir(self, root):
+        self._output_dir = root
+
+    @property
+    def print_freq(self):
+        if self._print_freq is None:
+            # self._print_freq = self.log_step
+            return self.log_step
+        return self._print_freq
+
+    @print_freq.setter
+    def print_freq(self, n):
+        assert isinstance(n, int), 'print_freq must be int'
+        self._print_freq = n
+
+
+    # def __repr__(self) -> str:
+    #     pass 
+
+
+
diff --git a/rtdetr_pytorch/src/core/yaml_config.py b/rtdetr_pytorch/src/core/yaml_config.py
new file mode 100644
index 0000000..6f8f7ef
--- /dev/null
+++ b/rtdetr_pytorch/src/core/yaml_config.py
@@ -0,0 +1,152 @@
+"""by lyuwenyu
+"""
+
+import torch 
+import torch.nn as nn
+
+import re 
+import copy
+
+from .config import BaseConfig
+from .yaml_utils import load_config, merge_config, create, merge_dict
+
+
+class YAMLConfig(BaseConfig):
+    def __init__(self, cfg_path: str, **kwargs) -> None:
+        super().__init__()
+
+        cfg = load_config(cfg_path)
+        merge_dict(cfg, kwargs)
+
+        # pprint(cfg)
+
+        self.yaml_cfg = cfg 
+
+        self.log_step = cfg.get('log_step', 100)
+        self.checkpoint_step = cfg.get('checkpoint_step', 1)
+        self.epoches = cfg.get('epoches', -1)
+        self.resume = cfg.get('resume', '')
+        self.tuning = cfg.get('tuning', '')
+        self.sync_bn = cfg.get('sync_bn', False)
+        self.output_dir = cfg.get('output_dir', None)
+        
+        self.use_ema = cfg.get('use_ema', False)
+        self.use_amp = cfg.get('use_amp', False)
+        self.autocast = cfg.get('autocast', dict())
+        self.find_unused_parameters = cfg.get('find_unused_parameters', None)
+        self.clip_max_norm = cfg.get('clip_max_norm', 0.)
+
+
+    @property
+    def model(self, ) -> torch.nn.Module:
+        if self._model is None and 'model' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            self._model = create(self.yaml_cfg['model'])
+        return self._model 
+
+    @property
+    def postprocessor(self, ) -> torch.nn.Module:
+        if self._postprocessor is None and 'postprocessor' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            self._postprocessor = create(self.yaml_cfg['postprocessor'])
+        return self._postprocessor
+
+    @property
+    def criterion(self, ):
+        if self._criterion is None and 'criterion' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            self._criterion = create(self.yaml_cfg['criterion'])
+        return self._criterion
+
+    
+    @property
+    def optimizer(self, ):
+        if self._optimizer is None and 'optimizer' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            params = self.get_optim_params(self.yaml_cfg['optimizer'], self.model)
+            self._optimizer = create('optimizer', params=params)
+
+        return self._optimizer
+    
+    @property
+    def lr_scheduler(self, ):
+        if self._lr_scheduler is None and 'lr_scheduler' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            self._lr_scheduler = create('lr_scheduler', optimizer=self.optimizer)
+            print('Initial lr: ', self._lr_scheduler.get_last_lr())
+
+        return self._lr_scheduler
+    
+    @property
+    def train_dataloader(self, ):
+        if self._train_dataloader is None and 'train_dataloader' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            self._train_dataloader = create('train_dataloader')
+            self._train_dataloader.shuffle = self.yaml_cfg['train_dataloader'].get('shuffle', False)
+
+        return self._train_dataloader
+    
+    @property
+    def val_dataloader(self, ):
+        if self._val_dataloader is None and 'val_dataloader' in self.yaml_cfg:
+            merge_config(self.yaml_cfg)
+            self._val_dataloader = create('val_dataloader')
+            self._val_dataloader.shuffle = self.yaml_cfg['val_dataloader'].get('shuffle', False)
+
+        return self._val_dataloader
+    
+    
+    @property
+    def ema(self, ):
+        if self._ema is None and self.yaml_cfg.get('use_ema', False):
+            merge_config(self.yaml_cfg)
+            self._ema = create('ema', model=self.model)
+            
+        return self._ema
+    
+
+    @property
+    def scaler(self, ):
+        if self._scaler is None and self.yaml_cfg.get('use_amp', False):
+            merge_config(self.yaml_cfg)
+            self._scaler = create('scaler')
+
+        return self._scaler
+
+ 
+    @staticmethod
+    def get_optim_params(cfg: dict, model: nn.Module):
+        '''
+        E.g.:
+            ^(?=.*a)(?=.*b).*$         means including a and b
+            ^((?!b.)*a((?!b).)*$       means including a but not b
+            ^((?!b|c).)*a((?!b|c).)*$  means including a but not (b | c)
+        '''
+        assert 'type' in cfg, ''
+        cfg = copy.deepcopy(cfg)
+
+        if 'params' not in cfg:
+            return model.parameters() 
+
+        assert isinstance(cfg['params'], list), ''
+
+        param_groups = []
+        visited = []
+        for pg in cfg['params']:
+            pattern = pg['params']
+            params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0}
+            pg['params'] = params.values()
+            param_groups.append(pg)
+            visited.extend(list(params.keys()))
+
+        names = [k for k, v in model.named_parameters() if v.requires_grad]
+
+        if len(visited) < len(names):
+            unseen = set(names) - set(visited)
+            params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen}
+            param_groups.append({'params': params.values()})
+            visited.extend(list(params.keys()))
+
+        assert len(visited) == len(names), ''
+
+        return param_groups
diff --git a/rtdetr_pytorch/src/core/yaml_utils.py b/rtdetr_pytorch/src/core/yaml_utils.py
new file mode 100644
index 0000000..c9ed259
--- /dev/null
+++ b/rtdetr_pytorch/src/core/yaml_utils.py
@@ -0,0 +1,208 @@
+"""by lyuwenyu
+"""
+
+import os
+import yaml 
+import inspect
+import importlib
+
+__all__ = ['GLOBAL_CONFIG', 'register', 'create', 'load_config', 'merge_config', 'merge_dict']
+
+
+GLOBAL_CONFIG = dict()
+INCLUDE_KEY = '__include__'
+
+
+def register(cls: type):
+    '''
+    Args:
+        cls (type): Module class to be registered.
+    '''
+    if cls.__name__ in GLOBAL_CONFIG:
+        raise ValueError('{} already registered'.format(cls.__name__))
+
+    if inspect.isfunction(cls):
+        GLOBAL_CONFIG[cls.__name__] = cls
+    
+    elif inspect.isclass(cls):
+        GLOBAL_CONFIG[cls.__name__] = extract_schema(cls)
+
+    else:
+        raise ValueError(f'register {cls}')
+
+    return cls 
+
+
+def extract_schema(cls: type):
+    '''
+    Args:
+        cls (type),
+    Return:
+        Dict, 
+    '''
+    argspec = inspect.getfullargspec(cls.__init__)
+    arg_names = [arg for arg in argspec.args if arg != 'self']
+    num_defualts = len(argspec.defaults) if argspec.defaults is not None else 0
+    num_requires = len(arg_names) - num_defualts
+
+    schame = dict()
+    schame['_name'] = cls.__name__
+    schame['_pymodule'] = importlib.import_module(cls.__module__)
+    schame['_inject'] = getattr(cls, '__inject__', [])
+    schame['_share'] = getattr(cls, '__share__', [])
+
+    for i, name in enumerate(arg_names):
+        if name in schame['_share']:
+            assert i >= num_requires, 'share config must have default value.'
+            value = argspec.defaults[i - num_requires]
+        
+        elif i >= num_requires:
+            value = argspec.defaults[i - num_requires]
+
+        else:
+            value = None 
+
+        schame[name] = value
+        
+    return schame
+
+
+
+def create(type_or_name, **kwargs):
+    '''Instantiate the object registered in ``GLOBAL_CONFIG`` for a class or name.
+
+    A config entry carrying a ``type`` key is merged (together with ``kwargs``)
+    into the schema of that type and the type is then created; ``kwargs`` are
+    not used on any other path. NOTE: schemas are updated in place in
+    ``GLOBAL_CONFIG``, so overrides persist for later ``create`` calls.
+    '''
+    assert type(type_or_name) in (type, str), 'create should be class or name.'
+    name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__
+
+    if name in GLOBAL_CONFIG:
+        # entries with a __dict__ (not plain schema dicts) are returned as-is
+        if hasattr(GLOBAL_CONFIG[name], '__dict__'):
+            return GLOBAL_CONFIG[name]
+    else:
+        raise ValueError('The module {} is not registered'.format(name))
+
+    cfg = GLOBAL_CONFIG[name]
+
+    if isinstance(cfg, dict) and 'type' in cfg:
+        _cfg: dict = GLOBAL_CONFIG[cfg['type']]
+        _cfg.update(cfg) # update global cls default args
+        _cfg.update(kwargs) # TODO
+        name = _cfg.pop('type')
+        return create(name)
+
+    cls = getattr(cfg['_pymodule'], name)
+    argspec = inspect.getfullargspec(cls.__init__)
+    arg_names = [arg for arg in argspec.args if arg != 'self']
+
+    cls_kwargs = {}
+    cls_kwargs.update(cfg)
+
+    # shared var: a top-level GLOBAL_CONFIG value wins over the schema default
+    for k in cfg['_share']:
+        if k in GLOBAL_CONFIG:
+            cls_kwargs[k] = GLOBAL_CONFIG[k]
+        else:
+            cls_kwargs[k] = cfg[k]
+
+    # inject: replace a reference (registered name or `type` dict) by its object
+    for k in cfg['_inject']:
+        _k = cfg[k]
+        if _k is None:
+            continue
+
+        if isinstance(_k, str):
+            if _k not in GLOBAL_CONFIG:
+                raise ValueError(f'Missing inject config of {_k}.')
+
+            _cfg = GLOBAL_CONFIG[_k]
+            if isinstance(_cfg, dict):
+                cls_kwargs[k] = create(_cfg['_name'])
+            else:
+                cls_kwargs[k] = _cfg
+
+        elif isinstance(_k, dict):
+            if 'type' not in _k.keys():
+                raise ValueError(f'Missing inject for `type` style.')
+
+            _type = str(_k['type'])
+            if _type not in GLOBAL_CONFIG:
+                raise ValueError(f'Missing {_type} in inspect stage.')
+
+            # TODO: the schema is modified in place, which may give wrong
+            # results when the same type is injected more than once
+            _cfg: dict = GLOBAL_CONFIG[_type]
+            _cfg.update(_k) # update
+            cls_kwargs[k] = create(_type)
+
+        else:
+            raise ValueError(f'Inject does not support {_k}')
+
+    # pass only the arguments that __init__ declares
+    cls_kwargs = {n: cls_kwargs[n] for n in arg_names}
+
+    return cls(**cls_kwargs)
+
+
+
+def load_config(file_path, cfg=dict()):
+    '''load config
+    '''
+    _, ext = os.path.splitext(file_path)
+    assert ext in ['.yml', '.yaml'], "only support yaml files for now"
+
+    with open(file_path) as f:
+        file_cfg = yaml.load(f, Loader=yaml.Loader)
+        if file_cfg is None:
+            return {}
+
+    if INCLUDE_KEY in file_cfg:
+        base_yamls = list(file_cfg[INCLUDE_KEY])
+        for base_yaml in base_yamls:
+            if base_yaml.startswith('~'):
+                base_yaml = os.path.expanduser(base_yaml)
+
+            if not base_yaml.startswith('/'):
+                base_yaml = os.path.join(os.path.dirname(file_path), base_yaml)
+
+            with open(base_yaml) as f:
+                base_cfg = load_config(base_yaml, cfg)
+                merge_config(base_cfg, cfg)
+
+    return merge_config(file_cfg, cfg)
+
+
+
+def merge_dict(dct, another_dct):
+    '''merge another_dct into dct
+    '''
+    for k in another_dct:
+        if (k in dct and isinstance(dct[k], dict) and isinstance(another_dct[k], dict)):
+            merge_dict(dct[k], another_dct[k])
+        else:
+            dct[k] = another_dct[k]
+
+    return dct
+
+
+
+def merge_config(config, another_cfg=None):
+    """
+    Merge config into global config or another_cfg.
+
+    Args:
+        config (dict): Config to be merged.
+
+    Returns: global config
+    """
+    global GLOBAL_CONFIG
+    dct = GLOBAL_CONFIG if another_cfg is None else another_cfg
+    
+    return merge_dict(dct, config)
+
+
+
diff --git a/rtdetr_pytorch/src/data/__init__.py b/rtdetr_pytorch/src/data/__init__.py
new file mode 100644
index 0000000..95715f8
--- /dev/null
+++ b/rtdetr_pytorch/src/data/__init__.py
@@ -0,0 +1,7 @@
+
+from .coco import *
+from .cifar10 import CIFAR10
+
+from .dataloader import *
+from .transforms import *
+
diff --git a/rtdetr_pytorch/src/data/cifar10/__init__.py b/rtdetr_pytorch/src/data/cifar10/__init__.py
new file mode 100644
index 0000000..e5267dc
--- /dev/null
+++ b/rtdetr_pytorch/src/data/cifar10/__init__.py
@@ -0,0 +1,14 @@
+
+import torchvision
+from typing import Optional, Callable
+
+from src.core import register
+
+
+@register
+class CIFAR10(torchvision.datasets.CIFAR10):
+    __inject__ = ['transform', 'target_transform']
+    
+    def __init__(self, root: str, train: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False) -> None:
+        super().__init__(root, train, transform, target_transform, download)
+
diff --git a/rtdetr_pytorch/src/data/coco/__init__.py b/rtdetr_pytorch/src/data/coco/__init__.py
new file mode 100644
index 0000000..c83b002
--- /dev/null
+++ b/rtdetr_pytorch/src/data/coco/__init__.py
@@ -0,0 +1,9 @@
+from .coco_dataset import (
+    CocoDetection, 
+    mscoco_category2label,
+    mscoco_label2category,
+    mscoco_category2name,
+)
+from .coco_eval import *
+
+from .coco_utils import get_coco_api_from_dataset
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/data/coco/coco_dataset.py b/rtdetr_pytorch/src/data/coco/coco_dataset.py
new file mode 100644
index 0000000..0ef7849
--- /dev/null
+++ b/rtdetr_pytorch/src/data/coco/coco_dataset.py
@@ -0,0 +1,238 @@
+"""
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+COCO dataset which returns image_id for evaluation.
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+
+import torch
+import torch.utils.data
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+
+from torchvision import datapoints
+
+from pycocotools import mask as coco_mask
+
+from src.core import register
+
+__all__ = ['CocoDetection']
+
+
+@register
+class CocoDetection(torchvision.datasets.CocoDetection):
+    __inject__ = ['transforms']
+    __share__ = ['remap_mscoco_category']
+    
+    def __init__(self, img_folder, ann_file, transforms, return_masks, remap_mscoco_category=False):
+        super(CocoDetection, self).__init__(img_folder, ann_file)
+        self._transforms = transforms
+        self.prepare = ConvertCocoPolysToMask(return_masks, remap_mscoco_category)
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self.return_masks = return_masks
+        self.remap_mscoco_category = remap_mscoco_category
+
+    def __getitem__(self, idx):
+        img, target = super(CocoDetection, self).__getitem__(idx)
+        image_id = self.ids[idx]
+        target = {'image_id': image_id, 'annotations': target}
+        img, target = self.prepare(img, target)
+
+        # ['boxes', 'masks', 'labels']:
+        if 'boxes' in target:
+            target['boxes'] = datapoints.BoundingBox(
+                target['boxes'], 
+                format=datapoints.BoundingBoxFormat.XYXY, 
+                spatial_size=img.size[::-1]) # h w
+
+        if 'masks' in target:
+            target['masks'] = datapoints.Mask(target['masks'])
+
+        if self._transforms is not None:
+            img, target = self._transforms(img, target)
+            
+        return img, target
+
+    def extra_repr(self) -> str:
+        s = f' img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n'
+        s += f' return_masks: {self.return_masks}\n'
+        if hasattr(self, '_transforms') and self._transforms is not None:
+            s += f' transforms:\n   {repr(self._transforms)}'
+
+        return s 
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = torch.as_tensor(mask, dtype=torch.uint8)
+        mask = mask.any(dim=2)
+        masks.append(mask)
+    if masks:
+        masks = torch.stack(masks, dim=0)
+    else:
+        masks = torch.zeros((0, height, width), dtype=torch.uint8)
+    return masks
+
+
+class ConvertCocoPolysToMask(object):
+    def __init__(self, return_masks=False, remap_mscoco_category=False):
+        self.return_masks = return_masks
+        self.remap_mscoco_category = remap_mscoco_category
+
+    def __call__(self, image, target):
+        w, h = image.size
+
+        image_id = target["image_id"]
+        image_id = torch.tensor([image_id])
+
+        anno = target["annotations"]
+
+        anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+
+        if self.remap_mscoco_category:
+            classes = [mscoco_category2label[obj["category_id"]] for obj in anno]
+        else:
+            classes = [obj["category_id"] for obj in anno]
+            
+        classes = torch.tensor(classes, dtype=torch.int64)
+
+        if self.return_masks:
+            segmentations = [obj["segmentation"] for obj in anno]
+            masks = convert_coco_poly_to_mask(segmentations, h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.view(num_keypoints, -1, 3)
+
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+        if self.return_masks:
+            masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+
+        target = {}
+        target["boxes"] = boxes
+        target["labels"] = classes
+        if self.return_masks:
+            target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+
+        target["orig_size"] = torch.as_tensor([int(w), int(h)])
+        target["size"] = torch.as_tensor([int(w), int(h)])
+    
+        return image, target
+
+
+mscoco_category2name = {
+    1: 'person',
+    2: 'bicycle',
+    3: 'car',
+    4: 'motorcycle',
+    5: 'airplane',
+    6: 'bus',
+    7: 'train',
+    8: 'truck',
+    9: 'boat',
+    10: 'traffic light',
+    11: 'fire hydrant',
+    13: 'stop sign',
+    14: 'parking meter',
+    15: 'bench',
+    16: 'bird',
+    17: 'cat',
+    18: 'dog',
+    19: 'horse',
+    20: 'sheep',
+    21: 'cow',
+    22: 'elephant',
+    23: 'bear',
+    24: 'zebra',
+    25: 'giraffe',
+    27: 'backpack',
+    28: 'umbrella',
+    31: 'handbag',
+    32: 'tie',
+    33: 'suitcase',
+    34: 'frisbee',
+    35: 'skis',
+    36: 'snowboard',
+    37: 'sports ball',
+    38: 'kite',
+    39: 'baseball bat',
+    40: 'baseball glove',
+    41: 'skateboard',
+    42: 'surfboard',
+    43: 'tennis racket',
+    44: 'bottle',
+    46: 'wine glass',
+    47: 'cup',
+    48: 'fork',
+    49: 'knife',
+    50: 'spoon',
+    51: 'bowl',
+    52: 'banana',
+    53: 'apple',
+    54: 'sandwich',
+    55: 'orange',
+    56: 'broccoli',
+    57: 'carrot',
+    58: 'hot dog',
+    59: 'pizza',
+    60: 'donut',
+    61: 'cake',
+    62: 'chair',
+    63: 'couch',
+    64: 'potted plant',
+    65: 'bed',
+    67: 'dining table',
+    70: 'toilet',
+    72: 'tv',
+    73: 'laptop',
+    74: 'mouse',
+    75: 'remote',
+    76: 'keyboard',
+    77: 'cell phone',
+    78: 'microwave',
+    79: 'oven',
+    80: 'toaster',
+    81: 'sink',
+    82: 'refrigerator',
+    84: 'book',
+    85: 'clock',
+    86: 'vase',
+    87: 'scissors',
+    88: 'teddy bear',
+    89: 'hair drier',
+    90: 'toothbrush'
+}
+
+mscoco_category2label = {k: i for i, k in enumerate(mscoco_category2name.keys())}
+mscoco_label2category = {v: k for k, v in mscoco_category2label.items()}
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/data/coco/coco_eval.py b/rtdetr_pytorch/src/data/coco/coco_eval.py
new file mode 100644
index 0000000..2d629f5
--- /dev/null
+++ b/rtdetr_pytorch/src/data/coco/coco_eval.py
@@ -0,0 +1,269 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO evaluator that works in distributed mode.
+
+Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
+The difference is that there is less copy-pasting from pycocotools
+in the end of the file, as python3 can suppress prints with contextlib
+"""
+import os
+import contextlib
+import copy
+import numpy as np
+import torch
+
+from pycocotools.cocoeval import COCOeval
+from pycocotools.coco import COCO
+import pycocotools.mask as mask_util
+
+from src.misc import dist
+
+
+__all__ = ['CocoEvaluator',]
+
+
+class CocoEvaluator(object):
+    """Run pycocotools COCOeval batch by batch and merge the results across processes."""
+
+    def __init__(self, coco_gt, iou_types):
+        assert isinstance(iou_types, (list, tuple))
+        # evaluate against a private deep copy of the ground-truth object
+        self.coco_gt = copy.deepcopy(coco_gt)
+        self.iou_types = iou_types
+        # one COCOeval instance per requested IoU type
+        self.coco_eval = {t: COCOeval(self.coco_gt, iouType=t) for t in iou_types}
+        self.img_ids = []
+        self.eval_imgs = {k: [] for k in iou_types}
+
+    def update(self, predictions):
+        """Evaluate one batch of predictions, given as a dict keyed by image id."""
+        batch_ids = list(np.unique(list(predictions.keys())))
+        self.img_ids.extend(batch_ids)
+
+        for iou_type in self.iou_types:
+            results = self.prepare(predictions, iou_type)
+            # suppress pycocotools prints
+            with open(os.devnull, 'w') as sink, contextlib.redirect_stdout(sink):
+                coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
+
+            evaluator = self.coco_eval[iou_type]
+            evaluator.cocoDt = coco_dt
+            evaluator.params.imgIds = list(batch_ids)
+            # evaluate() hands back the de-duplicated ids, reused for the next IoU type
+            batch_ids, eval_imgs = evaluate(evaluator)
+            self.eval_imgs[iou_type].append(eval_imgs)
+
+    def synchronize_between_processes(self):
+        """Concatenate the stored arrays along axis 2 and merge them across processes."""
+        for iou_type in self.iou_types:
+            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
+            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
+
+    def accumulate(self):
+        """Call ``accumulate()`` on every underlying COCOeval."""
+        for evaluator in self.coco_eval.values():
+            evaluator.accumulate()
+
+    def summarize(self):
+        """Print the pycocotools summary for every IoU type."""
+        for iou_type, evaluator in self.coco_eval.items():
+            print("IoU metric: {}".format(iou_type))
+            evaluator.summarize()
+
+    def prepare(self, predictions, iou_type):
+        """Convert predictions into COCO result dicts for ``iou_type``."""
+        if iou_type == "bbox":
+            return self.prepare_for_coco_detection(predictions)
+        if iou_type == "segm":
+            return self.prepare_for_coco_segmentation(predictions)
+        if iou_type == "keypoints":
+            return self.prepare_for_coco_keypoint(predictions)
+        raise ValueError("Unknown iou type {}".format(iou_type))
+
+    def prepare_for_coco_detection(self, predictions):
+        """Build 'bbox' results; boxes are passed through convert_to_xywh."""
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            boxes = convert_to_xywh(prediction["boxes"]).tolist()
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[idx],
+                        "bbox": box,
+                        "score": scores[idx],
+                    }
+                    for idx, box in enumerate(boxes)
+                ]
+            )
+        return coco_results
+
+    def prepare_for_coco_segmentation(self, predictions):
+        """Build 'segm' results: masks are thresholded at 0.5 and RLE-encoded."""
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            # threshold the predicted masks at 0.5
+            binary_masks = prediction["masks"] > 0.5
+
+            rles = []
+            for mask in binary_masks:
+                # Fortran-ordered uint8 array of shape (H, W, 1) handed to mask_util.encode
+                fortran_mask = np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F")
+                rle = mask_util.encode(fortran_mask)[0]
+                # store the counts as str (decoded as UTF-8)
+                rle["counts"] = rle["counts"].decode("utf-8")
+                rles.append(rle)
+
+            # one result dict per encoded mask
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[idx],
+                        "segmentation": rle,
+                        "score": scores[idx],
+                    }
+                    for idx, rle in enumerate(rles)
+                ]
+            )
+        return coco_results
+
+    def prepare_for_coco_keypoint(self, predictions):
+        """Build 'keypoints' results with each instance's keypoints flattened to one list."""
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            # NOTE: the boxes are converted here but not included in the results
+            boxes = convert_to_xywh(prediction["boxes"]).tolist()
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            keypoints = prediction["keypoints"].flatten(start_dim=1).tolist()
+
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[idx],
+                        'keypoints': keypoint,
+                        "score": scores[idx],
+                    }
+                    for idx, keypoint in enumerate(keypoints)
+                ]
+            )
+        return coco_results
+
+
+def convert_to_xywh(boxes):  # rows of (xmin, ymin, xmax, ymax) -> (xmin, ymin, width, height)
+    left, top, right, bottom = boxes.unbind(1)
+    return torch.stack((left, top, right - left, bottom - top), dim=1)
+
+
+def merge(img_ids, eval_imgs):
+    """Gather image ids and eval arrays from all ranks and drop duplicate images.
+
+    Returns the sorted unique image ids together with the eval array whose
+    last axis is restricted to the first occurrence of each id.
+    """
+    gathered_ids = dist.all_gather(img_ids)
+    gathered_evals = dist.all_gather(eval_imgs)
+
+    # flatten the per-rank id lists into a single array
+    flat_ids = np.array([img_id for rank_ids in gathered_ids for img_id in rank_ids])
+
+    # join the per-rank eval arrays along axis 2
+    stacked = np.concatenate(list(gathered_evals), 2)
+
+    # keep only unique (and in sorted order) images
+    unique_ids, first_idx = np.unique(flat_ids, return_index=True)
+    stacked = stacked[..., first_idx]
+
+    return unique_ids, stacked
+
+
+def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
+    """Store the cross-process merged ids and flattened eval results on ``coco_eval``."""
+    merged_ids, merged_evals = merge(img_ids, eval_imgs)
+
+    coco_eval.evalImgs = list(merged_evals.flatten())
+    coco_eval.params.imgIds = list(merged_ids)
+    # snapshot the params, as evaluate() also does after a run
+    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+
+
+#################################################################
+# From pycocotools, just removed the prints and fixed
+# a Python3 bug about unicode not defined
+#################################################################
+
+
+# import io
+# from contextlib import redirect_stdout
+# def evaluate(imgs):
+#     with redirect_stdout(io.StringIO()):
+#         imgs.evaluate()
+#     return imgs.params.imgIds, np.asarray(imgs.evalImgs).reshape(-1, len(imgs.params.areaRng), len(imgs.params.imgIds))
+
+
+def evaluate(self):
+    '''
+    Run per-image evaluation for the COCOeval instance passed as ``self`` and fill self.ious.
+    :return: (p.imgIds, evalImgs); evalImgs is reshaped to (len(catIds), len(p.areaRng), len(p.imgIds))
+    '''
+    # tic = time.time()
+    # print('Running per image evaluation...')
+    p = self.params
+    # add backward compatibility if useSegm is specified in params
+    if p.useSegm is not None:
+        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
+    # print('Evaluate annotation type *{}*'.format(p.iouType))
+    p.imgIds = list(np.unique(p.imgIds))
+    if p.useCats:
+        p.catIds = list(np.unique(p.catIds))
+    p.maxDets = sorted(p.maxDets)
+    self.params = p
+
+    self._prepare()
+    # loop through images, area range, max detection number
+    catIds = p.catIds if p.useCats else [-1]  # -1 is the single placeholder id when useCats is off
+
+    # NOTE(review): computeIoU stays unbound for any other iouType, so the next statement would fail
+    if p.iouType == 'segm' or p.iouType == 'bbox':
+        computeIoU = self.computeIoU
+    elif p.iouType == 'keypoints':
+        computeIoU = self.computeOks
+    self.ious = {
+        (imgId, catId): computeIoU(imgId, catId)
+        for imgId in p.imgIds
+        for catId in catIds}
+
+    evaluateImg = self.evaluateImg
+    maxDet = p.maxDets[-1]  # largest detection limit (maxDets was sorted above)
+    evalImgs = [
+        evaluateImg(imgId, catId, areaRng, maxDet)
+        for catId in catIds
+        for areaRng in p.areaRng
+        for imgId in p.imgIds
+    ]
+    # this is NOT in the pycocotools code, but could be done outside
+    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
+    self._paramsEval = copy.deepcopy(self.params)
+    # toc = time.time()
+    # print('DONE (t={:0.2f}s).'.format(toc-tic))
+    return p.imgIds, evalImgs
+
+#################################################################
+# end of straight copy from pycocotools, just removing the prints
+#################################################################
+
diff --git a/rtdetr_pytorch/src/data/coco/coco_utils.py b/rtdetr_pytorch/src/data/coco/coco_utils.py
new file mode 100644
index 0000000..48c0994
--- /dev/null
+++ b/rtdetr_pytorch/src/data/coco/coco_utils.py
@@ -0,0 +1,184 @@
+import os
+
+import torch
+import torch.utils.data
+import torchvision
+from pycocotools import mask as coco_mask
+from pycocotools.coco import COCO
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    """Rasterize each object's polygons into one mask and stack the masks per object."""
+    per_object = []
+    for polygons in segmentations:
+        decoded = coco_mask.decode(coco_mask.frPyObjects(polygons, height, width))
+        if decoded.ndim < 3:
+            decoded = decoded[..., None]
+        # collapse the last axis: a pixel is set if any of the decoded masks sets it
+        merged = torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2)
+        per_object.append(merged)
+
+    if not per_object:
+        # no objects: return an empty stack with the requested spatial size
+        return torch.zeros((0, height, width), dtype=torch.uint8)
+    return torch.stack(per_object, dim=0)
+
+
+class ConvertCocoPolysToMask:
+    """Convert raw COCO annotations into the tensor-based target dict."""
+
+    def __call__(self, image, target):
+        """Return ``(image, new_target)`` built from the non-crowd annotations.
+
+        Boxes are converted from xywh to xyxy and clamped to the image size;
+        objects whose clamped box has no positive width or height are dropped
+        from every per-object field (boxes, labels, masks, keypoints, area,
+        iscrowd) so that all of them stay index-aligned.
+        """
+        w, h = image.size
+        image_id = target["image_id"]
+        anno = [obj for obj in target["annotations"] if obj["iscrowd"] == 0]
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+
+        classes = torch.tensor([obj["category_id"] for obj in anno], dtype=torch.int64)
+        masks = convert_coco_poly_to_mask([obj["segmentation"] for obj in anno], h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.view(num_keypoints, -1, 3)
+
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj["iscrowd"] for obj in anno])
+
+        # keep only boxes with strictly positive width and height
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        target = {}
+        target["boxes"] = boxes[keep]
+        target["labels"] = classes[keep]
+        target["masks"] = masks[keep]
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints[keep]
+
+        # area/iscrowd are filtered with the same mask; leaving them unfiltered
+        # would misalign them with the boxes whenever a degenerate box is dropped
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+
+        return image, target
+
+
+def _coco_remove_images_without_annotations(dataset, cat_list=None):
+    """Return a Subset of ``dataset`` limited to images with at least one valid annotation.
+
+    When ``cat_list`` is truthy, only annotations of those categories are considered.
+    """
+    min_keypoints_per_image = 10
+
+    def _is_empty_box(obj):
+        # a box counts as empty when any of its last two bbox values is <= 1
+        return any(side <= 1 for side in obj["bbox"][2:])
+
+    def _visible_keypoints(anno):
+        # every third value (starting at index 2) is counted when it is > 0
+        return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
+
+    def _has_valid_annotation(anno):
+        # no annotation at all, or nothing but (near-)empty boxes
+        if len(anno) == 0 or all(_is_empty_box(obj) for obj in anno):
+            return False
+        # keypoints task have a slight different criteria for considering
+        # if an annotation is valid: the image must contain at least
+        # min_keypoints_per_image visible keypoints
+        if "keypoints" not in anno[0]:
+            return True
+        return _visible_keypoints(anno) >= min_keypoints_per_image
+
+    # positions within ``dataset`` (not image ids) of the samples to keep
+    kept = []
+    for ds_idx, img_id in enumerate(dataset.ids):
+        ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
+        anno = dataset.coco.loadAnns(ann_ids)
+        if cat_list:
+            anno = [obj for obj in anno if obj["category_id"] in cat_list]
+        if _has_valid_annotation(anno):
+            kept.append(ds_idx)
+
+    return torch.utils.data.Subset(dataset, kept)
+
+
+def convert_to_coco_api(ds):
+    """Build an in-memory pycocotools ``COCO`` object by iterating over every sample of ``ds``."""
+    # annotation IDs need to start at 1, not 0, see torchvision issue #1530
+    next_ann_id = 1
+    images, annotations, categories = [], [], set()
+    for sample_idx in range(len(ds)):
+        img, targets = ds[sample_idx]
+        image_id = targets["image_id"].item()
+        images.append({"id": image_id, "height": img.shape[-2], "width": img.shape[-1]})
+        # subtract the first two columns from the last two, on a clone of the boxes
+        bboxes = targets["boxes"].clone()
+        bboxes[:, 2:] -= bboxes[:, :2]
+        bboxes = bboxes.tolist()
+        labels = targets["labels"].tolist()
+        areas = targets["area"].tolist()
+        iscrowd = targets["iscrowd"].tolist()
+        has_masks = "masks" in targets
+        has_keypoints = "keypoints" in targets
+        if has_masks:
+            # make masks Fortran contiguous for coco_mask
+            masks = targets["masks"].permute(0, 2, 1).contiguous().permute(0, 2, 1)
+        if has_keypoints:
+            keypoints = targets["keypoints"]
+            keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()
+        for i, bbox in enumerate(bboxes):
+            ann = {
+                "image_id": image_id,
+                "bbox": bbox,
+                "category_id": labels[i],
+                "area": areas[i],
+                "iscrowd": iscrowd[i],
+                "id": next_ann_id,
+            }
+            categories.add(labels[i])
+            if has_masks:
+                ann["segmentation"] = coco_mask.encode(masks[i].numpy())
+            if has_keypoints:
+                ann["keypoints"] = keypoints[i]
+                ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3])
+            annotations.append(ann)
+            next_ann_id += 1
+
+    coco_ds = COCO()
+    coco_ds.dataset = {
+        "images": images,
+        "categories": [{"id": i} for i in sorted(categories)],
+        "annotations": annotations,
+    }
+    coco_ds.createIndex()
+    return coco_ds
+
+
+def get_coco_api_from_dataset(dataset):
+    """Return the COCO api behind ``dataset``, peeling off up to 10 Subset wrappers first."""
+    coco_cls = torchvision.datasets.CocoDetection
+    for _ in range(10):
+        if isinstance(dataset, coco_cls) or not isinstance(dataset, torch.utils.data.Subset):
+            break
+        dataset = dataset.dataset
+    if isinstance(dataset, coco_cls):
+        return dataset.coco
+    return convert_to_coco_api(dataset)
+
+
diff --git a/rtdetr_pytorch/src/data/dataloader.py b/rtdetr_pytorch/src/data/dataloader.py
new file mode 100644
index 0000000..4db7cad
--- /dev/null
+++ b/rtdetr_pytorch/src/data/dataloader.py
@@ -0,0 +1,28 @@
+import torch 
+import torch.utils.data as data
+
+from src.core import register
+
+
+__all__ = ['DataLoader']
+
+
+@register
+class DataLoader(data.DataLoader):
+    """``torch.utils.data.DataLoader`` subclass with a multi-line, config-style repr."""
+    # presumably consumed by src.core.register; its meaning is defined there, not here
+    __inject__ = ['dataset', 'collate_fn']
+
+    def __repr__(self) -> str:
+        shown = ['dataset', 'batch_size', 'num_workers', 'drop_last', 'collate_fn']
+        # one indented "name: value" row per attribute, wrapped in "ClassName(...)"
+        rows = ["\n" + "    {0}: {1}".format(n, getattr(self, n)) for n in shown]
+        return self.__class__.__name__ + "(" + "".join(rows) + "\n)"
+
+
+
+@register
+def default_collate_fn(items):
+    '''Batch (image, target) pairs: images joined along a new leading dim, targets kept as a list.'''
+    images = torch.cat([sample[0][None] for sample in items], dim=0)
+    return images, [sample[1] for sample in items]
diff --git a/rtdetr_pytorch/src/data/functional.py b/rtdetr_pytorch/src/data/functional.py
new file mode 100644
index 0000000..336baa2
--- /dev/null
+++ b/rtdetr_pytorch/src/data/functional.py
@@ -0,0 +1,169 @@
+import torch
+import torchvision.transforms.functional as F
+
+from packaging import version
+from typing import Optional, List
+from torch import Tensor
+
+# needed due to empty tensor bug in pytorch and torchvision 0.5
+import torchvision
+if version.parse(torchvision.__version__) < version.parse('0.7'):
+    from torchvision.ops import _new_empty_tensor
+    from torchvision.ops.misc import _output_size
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
+    """
+    Equivalent to nn.functional.interpolate, but with support for empty batch sizes
+    on torchvision < 0.7, where an empty output tensor of the right shape is built
+    by hand. On newer versions the call is forwarded to PyTorch unchanged.
+    """
+    if version.parse(torchvision.__version__) < version.parse('0.7'):
+        if input.numel() > 0:
+            return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners)
+
+        output_shape = _output_size(2, input, size, scale_factor)
+        output_shape = list(input.shape[:-2]) + list(output_shape)
+        return _new_empty_tensor(input, output_shape)
+    else:
+        # torchvision.ops.misc.interpolate was only a deprecated alias of the PyTorch function
+        # and no longer exists in recent torchvision releases, so call PyTorch directly
+        return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+
+def crop(image, target, region):
+    """Crop ``image`` to ``region`` = (top, left, height, width) and update ``target``.
+
+    Boxes are shifted and clipped to the crop, "area" is recomputed from them, masks
+    are sliced, and objects left with an empty box (or empty mask) are removed.
+    """
+    cropped_image = F.crop(image, *region)
+    top, left, crop_h, crop_w = region
+    target = target.copy()
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor([crop_h, crop_w])
+
+    has_boxes, has_masks = "boxes" in target, "masks" in target
+    fields = ["labels", "area", "iscrowd"]
+    if has_boxes:
+        limit = torch.as_tensor([crop_w, crop_h], dtype=torch.float32)
+        shifted = target["boxes"] - torch.as_tensor([left, top, left, top])
+        # view as (N, 2, 2) corner pairs and clip them to [0, crop size]
+        corners = torch.min(shifted.reshape(-1, 2, 2), limit).clamp(min=0)
+        target["boxes"] = corners.reshape(-1, 4)
+        target["area"] = (corners[:, 1, :] - corners[:, 0, :]).prod(dim=1)
+        fields.append("boxes")
+
+    if has_masks:
+        # FIXME should we update the area here if there are no boxes?
+        target['masks'] = target['masks'][:, top:top + crop_h, left:left + crop_w]
+        fields.append("masks")
+
+    # remove elements for which the boxes or masks that have zero area
+    if has_boxes or has_masks:
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        if has_boxes:
+            corners = target['boxes'].reshape(-1, 2, 2)
+            keep = torch.all(corners[:, 1, :] > corners[:, 0, :], dim=1)
+        else:
+            keep = target['masks'].flatten(1).any(1)
+        for field in fields:
+            target[field] = target[field][keep]
+
+    return cropped_image, target
+
+
+def hflip(image, target):
+    """Mirror ``image`` left-right and flip the boxes / masks in ``target`` to match."""
+    flipped_image = F.hflip(image)
+    width, _ = image.size
+
+    target = target.copy()
+    if "boxes" in target:
+        # columns 0 and 2 are swapped and mirrored as (width - value); columns 1 and 3 are kept
+        sign = torch.as_tensor([-1, 1, -1, 1])
+        offset = torch.as_tensor([width, 0, width, 0])
+        target["boxes"] = target["boxes"][:, [2, 1, 0, 3]] * sign + offset
+    if "masks" in target:
+        target['masks'] = target['masks'].flip(-1)
+
+    return flipped_image, target
+
+
+def resize(image, target, size, max_size=None):
+    """Resize ``image`` to ``size`` and rescale ``target`` (may be None) accordingly.
+
+    ``size`` is either a scalar length for the shorter side (aspect ratio kept, with
+    the longer side capped by ``max_size``) or an explicit ``(w, h)`` list/tuple.
+    Returns ``(rescaled_image, target)``.
+    """
+
+    def get_size_with_aspect_ratio(image_size, size, max_size=None):
+        # returns the output size as (height, width)
+        w, h = image_size
+        short_side, long_side = float(min((w, h))), float(max((w, h)))
+        # shrink the requested size when the longer side would exceed max_size
+        if max_size is not None and long_side / short_side * size > max_size:
+            size = int(round(max_size * short_side / long_side))
+
+        # the shorter side already has the requested length: keep the size as is
+        if (w <= h and w == size) or (h <= w and h == size):
+            return (h, w)
+
+        if w < h:
+            ow, oh = size, int(size * h / w)
+        else:
+            ow, oh = int(size * w / h), size
+        return (oh, ow)
+
+    def get_size(image_size, size, max_size=None):
+        # an explicit (w, h) is reversed to (h, w); a scalar goes through the aspect-ratio rule
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        return get_size_with_aspect_ratio(image_size, size, max_size)
+
+    size = get_size(image.size, size, max_size)
+    rescaled_image = F.resize(image, size)
+    if target is None:
+        return rescaled_image, None
+
+    # per-axis scale factors from the actual (width, height) after / before resizing
+    ratio_width, ratio_height = (
+        float(new) / float(old) for new, old in zip(rescaled_image.size, image.size)
+    )
+
+    target = target.copy()
+    if "boxes" in target:
+        # columns 0 and 2 are scaled by the width ratio, columns 1 and 3 by the height ratio
+        scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+        target["boxes"] = target["boxes"] * scale
+
+    if "area" in target:
+        # area scales with the product of both ratios
+        target["area"] = target["area"] * (ratio_width * ratio_height)
+
+    h, w = size
+    target["size"] = torch.tensor([h, w])
+
+    if "masks" in target:
+        # nearest-neighbour resize on a float copy, then threshold back to boolean
+        resized_masks = interpolate(target['masks'][:, None].float(), size, mode="nearest")
+        target['masks'] = resized_masks[:, 0] > 0.5
+
+    return rescaled_image, target
+
+
+def pad(image, target, padding):
+    """Pad image and masks on the bottom-right only: ``padding[0]`` in x, ``padding[1]`` in y."""
+    pad_right, pad_bottom = padding[0], padding[1]
+    padded_image = F.pad(image, (0, 0, pad_right, pad_bottom))
+    if target is None:
+        return padded_image, None
+    target = target.copy()
+    target["size"] = torch.tensor(padded_image.size[::-1])
+    if "masks" in target:
+        target['masks'] = torch.nn.functional.pad(target['masks'], (0, pad_right, 0, pad_bottom))
+    return padded_image, target
diff --git a/rtdetr_pytorch/src/data/transforms.py b/rtdetr_pytorch/src/data/transforms.py
new file mode 100644
index 0000000..13f469e
--- /dev/null
+++ b/rtdetr_pytorch/src/data/transforms.py
@@ -0,0 +1,150 @@
+""""by lyuwenyu
+"""
+
+
+import torch 
+import torch.nn as nn 
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+from torchvision import datapoints
+
+import torchvision.transforms.v2 as T
+import torchvision.transforms.v2.functional as F
+
+from PIL import Image 
+from typing import Any, Dict, List, Optional
+
+from src.core import register, GLOBAL_CONFIG
+
+
+__all__ = ['Compose', ]
+
+
+RandomPhotometricDistort = register(T.RandomPhotometricDistort)
+RandomZoomOut = register(T.RandomZoomOut)
+# T.RandomIoUCrop is not registered directly; this module registers its own RandomIoUCrop subclass
+RandomHorizontalFlip = register(T.RandomHorizontalFlip)
+Resize = register(T.Resize)
+ToImageTensor = register(T.ToImageTensor)
+ConvertDtype = register(T.ConvertDtype)
+SanitizeBoundingBox = register(T.SanitizeBoundingBox)
+RandomCrop = register(T.RandomCrop)
+Normalize = register(T.Normalize)
+
+
+
+@register
+class Compose(T.Compose):
+    """Compose built from config dicts ({'type': name, **kwargs}) and/or ready nn.Module ops."""
+    def __init__(self, ops) -> None:
+        if ops is None:
+            # no ops configured: fall back to a single pass-through transform
+            transforms = [EmptyTransform(), ]
+        else:
+            transforms = []
+            for op in ops:
+                if isinstance(op, dict):
+                    # pop from a copy so the caller's config keeps 'type' and stays reusable
+                    kwargs = dict(op)
+                    name = kwargs.pop('type')
+                    transforms.append(getattr(GLOBAL_CONFIG[name]['_pymodule'], name)(**kwargs))
+                elif isinstance(op, nn.Module):
+                    transforms.append(op)
+                else:
+                    raise ValueError(f'unsupported op {op!r}: expected a dict with a "type" key or an nn.Module')
+        super().__init__(transforms=transforms)
+
+
+@register
+class EmptyTransform(T.Transform):
+    """Pass-through transform: hands its inputs back unchanged."""
+    def __init__(self, ) -> None:
+        super().__init__()
+
+    def forward(self, *inputs):
+        return inputs if len(inputs) > 1 else inputs[0]  # a single input is returned unwrapped
+
+
+@register
+class PadToSize(T.Pad):  # pads inputs up to a fixed spatial_size; padding goes in slots 2 and 3 only
+    _transformed_types = (
+        Image.Image,
+        datapoints.Image,
+        datapoints.Video,
+        datapoints.Mask,
+        datapoints.BoundingBox,
+    )
+    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
+        sz = F.get_spatial_size(flat_inputs[0])  # size of the first flattened input
+        h, w = self.spatial_size[0] - sz[0], self.spatial_size[1] - sz[1]  # missing rows / columns
+        self.padding = [0, 0, w, h]  # remembered on the instance so __call__ can report it
+        return dict(padding=self.padding)
+
+    def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:  # alias of _get_params — presumably the newer torchvision hook name, confirm
+        return self._get_params(flat_inputs)
+
+    def __init__(self, spatial_size, fill=0, padding_mode='constant') -> None:
+        if isinstance(spatial_size, int):
+            spatial_size = (spatial_size, spatial_size)
+        # the base class gets padding=0; the real padding is computed per call in _get_params
+        self.spatial_size = spatial_size
+        super().__init__(0, fill, padding_mode)
+
+    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:        
+        fill = self._fill[type(inpt)]
+        padding = params['padding']
+        return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode)  # type: ignore[arg-type]
+
+    def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:  # alias of _transform — presumably the newer torchvision hook name, confirm
+        return self._transform(inpt, params)
+
+    def __call__(self, *inputs: Any) -> Any:
+        outputs = super().forward(*inputs)
+        if len(outputs) > 1 and isinstance(outputs[1], dict):  # when the second output is a dict, record the padding in it
+            outputs[1]['padding'] = torch.tensor(self.padding)
+        return outputs
+
+
+@register
+class RandomIoUCrop(T.RandomIoUCrop):
+    """T.RandomIoUCrop that is skipped (inputs returned unchanged) when a random draw is >= ``p``."""
+    def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, p: float = 1.0):
+        super().__init__(min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials)
+        self.p = p 
+
+    def __call__(self, *inputs: Any) -> Any:
+        if torch.rand(1) >= self.p:  # skip the crop
+            return inputs if len(inputs) > 1 else inputs[0]
+        return super().forward(*inputs)
+
+
+@register
+class ConvertBox(T.Transform):  # converts bounding boxes to out_fmt and optionally divides them by the image size
+    _transformed_types = (
+        datapoints.BoundingBox,
+    )
+    def __init__(self, out_fmt='', normalize=False) -> None:
+        super().__init__()
+        self.out_fmt = out_fmt
+        self.normalize = normalize
+        # out_fmt strings that can be mapped to a datapoints format; any other value raises KeyError in _transform
+        self.data_fmt = {
+            'xyxy': datapoints.BoundingBoxFormat.XYXY,
+            'cxcywh': datapoints.BoundingBoxFormat.CXCYWH
+        }
+
+    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:  
+        if self.out_fmt:  # an empty out_fmt leaves the box format untouched
+            spatial_size = inpt.spatial_size
+            in_fmt = inpt.format.value.lower()
+            inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.out_fmt)
+            inpt = datapoints.BoundingBox(inpt, format=self.data_fmt[self.out_fmt], spatial_size=spatial_size)
+        # divide by the reversed spatial_size repeated twice (assumes spatial_size is (h, w) — TODO confirm)
+        if self.normalize:
+            inpt = inpt / torch.tensor(inpt.spatial_size[::-1]).tile(2)[None]
+
+        return inpt
+
+    def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:  # alias of _transform — presumably the newer torchvision hook name, confirm
+        return self._transform(inpt, params)
diff --git a/rtdetr_pytorch/src/misc/__init__.py b/rtdetr_pytorch/src/misc/__init__.py
new file mode 100644
index 0000000..802b61e
--- /dev/null
+++ b/rtdetr_pytorch/src/misc/__init__.py
@@ -0,0 +1,3 @@
+
+from .logger import *
+from .visualizer import *
diff --git a/rtdetr_pytorch/src/misc/dist.py b/rtdetr_pytorch/src/misc/dist.py
new file mode 100644
index 0000000..4293de5
--- /dev/null
+++ b/rtdetr_pytorch/src/misc/dist.py
@@ -0,0 +1,189 @@
+"""
+reference
+- https://github.com/pytorch/vision/blob/main/references/detection/utils.py
+- https://github.com/facebookresearch/detr/blob/master/util/misc.py#L406
+
+by lyuwenyu
+"""
+
+import random
+import numpy as np 
+
+import torch
+import torch.nn as nn 
+import torch.distributed
+import torch.distributed as tdist
+
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+from torch.utils.data import DistributedSampler
+from torch.utils.data.dataloader import DataLoader
+
+
+def init_distributed():
+    '''
+    Best-effort distributed setup from the environment (init_method='env://').
+
+    Binds the process to its local GPU and silences print on non-main ranks.
+    Returns True on success and False when the setup failed.
+    '''
+    import os  # function-scope import: this module does not import os at the top
+    try:
+        # https://pytorch.org/docs/stable/elastic/run.html
+        tdist.init_process_group(init_method='env://', )
+
+        rank = get_rank()
+        # choose the GPU by node-local rank: the global rank is not a valid device
+        # index on multi-node jobs; fall back to it when LOCAL_RANK is not set
+        local_rank = int(os.environ.get('LOCAL_RANK', rank))
+        torch.cuda.set_device(torch.device(f'cuda:{local_rank}'))
+        # synchronize only once the device is bound
+        torch.distributed.barrier()
+
+        setup_print(rank == 0)
+        print('Initialized distributed mode...')
+        return True
+
+    except Exception:  # still best-effort, but KeyboardInterrupt/SystemExit now propagate
+        print('Not init distributed mode.')
+        return False
+
+
+def setup_print(is_main):
+    '''Replace builtins.print so that only the main process prints.
+
+    Other processes can still print by passing ``force=True``.
+    '''
+    import builtins
+    original_print = builtins.print
+
+    def print(*args, force=False, **kwargs):
+        if is_main or force:
+            original_print(*args, **kwargs)
+    builtins.print = print
+
+
+def is_dist_available_and_initialized():
+    """Return True only when torch.distributed is both available and initialized.
+
+    ``is_initialized()`` is only consulted when the package is available.
+    """
+    return tdist.is_available() and tdist.is_initialized()
+
+
+def get_rank():
+    """Return this process's rank, or 0 when torch.distributed is not initialized."""
+    initialized = is_dist_available_and_initialized()
+    return tdist.get_rank() if initialized else 0
+
+
+def get_world_size():
+    """Return the number of processes, or 1 when torch.distributed is not initialized."""
+    initialized = is_dist_available_and_initialized()
+    return tdist.get_world_size() if initialized else 1
+
+    
+def is_main_process():  # the main process is the one whose rank is 0
+    return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):  # forwards to torch.save, but only on the main process
+    if is_main_process():
+        torch.save(*args, **kwargs)
+
+
+
+def warp_model(model, find_unused_parameters=False, sync_bn=False,):  # returns model unchanged unless distributed is initialized
+    if is_dist_available_and_initialized():
+        rank = get_rank()  # NOTE(review): global rank is used as the CUDA device index — confirm for multi-node jobs
+        model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model 
+        model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters)
+    return model
+
+
+def warp_loader(loader, shuffle=False):
+    """Rebuild ``loader`` around a DistributedSampler when distributed is initialized."""
+    if not is_dist_available_and_initialized():
+        return loader
+
+    dataset = loader.dataset
+    # besides dataset and batch_size, only these settings are carried over to the new loader
+    carried = {name: getattr(loader, name)
+               for name in ('drop_last', 'collate_fn', 'pin_memory', 'num_workers')}
+    sampler = DistributedSampler(dataset, shuffle=shuffle)
+    return DataLoader(dataset, loader.batch_size, sampler=sampler, **carried)
+
+
+
+def is_parallel(model) -> bool:
+    """Return True if the type of ``model`` is exactly DataParallel or DistributedDataParallel."""
+    return type(model) in (nn.DataParallel, DDP)
+
+
+def de_parallel(model) -> nn.Module:
+    """Return ``model.module`` when is_parallel(model) is true, otherwise the model itself."""
+    return model.module if is_parallel(model) else model
+
+
+def reduce_dict(data, avg=True):
+    '''
+    Reduce the tensor values of a dict across all processes with all_reduce.
+
+    Args
+        data dict: input, {k: v, ...}; returned unchanged when world size < 2
+        avg bool: when True, the reduced values are divided by the world size
+    '''
+    world_size = get_world_size()
+    if world_size < 2:
+        return data
+
+    with torch.no_grad():
+        # sorted keys give every process the same stacking order
+        keys = sorted(data.keys())
+        # stack into one tensor so a single all_reduce call covers every entry
+        values = torch.stack([data[k] for k in keys], dim=0)
+        tdist.all_reduce(values)
+
+        if avg is True:
+            values /= world_size
+
+        reduced = dict(zip(keys, values))
+
+    return reduced
+
+
+
+def all_gather(data):
+    """
+    Gather an arbitrary picklable object from every rank.
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: one entry per rank; just ``[data]`` when the world size is 1
+    """
+    num_ranks = get_world_size()
+    if num_ranks == 1:
+        return [data]
+    gathered = [None for _ in range(num_ranks)]
+    tdist.all_gather_object(gathered, data)
+    return gathered
+
+    
+import time 
+def sync_time():
+    '''Return time.time(), after torch.cuda.synchronize() when CUDA is available.
+    '''
+    cuda = torch.cuda
+    if cuda.is_available():
+        cuda.synchronize()
+    return time.time()
+
+
+
+def set_seed(seed):
+    """Seed the torch, NumPy and Python ``random`` generators."""
+    # fix the seed for reproducibility
+    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
+        seed_fn(seed)
+
+
diff --git a/rtdetr_pytorch/src/misc/logger.py b/rtdetr_pytorch/src/misc/logger.py
new file mode 100644
index 0000000..6740530
--- /dev/null
+++ b/rtdetr_pytorch/src/misc/logger.py
@@ -0,0 +1,239 @@
+"""
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+https://github.com/facebookresearch/detr/blob/main/util/misc.py
+Mostly copy-paste from torchvision references.
+"""
+
+import time
+import pickle
+import datetime
+from collections import defaultdict, deque
+from typing import Dict
+
+import torch
+import torch.distributed as tdist
+
+from .dist import is_dist_available_and_initialized, get_world_size
+
+
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+
+    `window_size` bounds the deque behind `median`/`avg`/`max`/`value`;
+    `fmt` is the `str.format` template used by `__str__`.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        self.deque = deque(maxlen=window_size)
+        self.total = 0.0
+        self.count = 0
+        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
+
+    def update(self, value, n=1):
+        """Record `value` once in the window and with weight `n` in the running totals."""
+        self.deque.append(value)
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """Sum `count` and `total` over all ranks.
+
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_available_and_initialized():
+            return
+        # Was hard-coded to 'cuda', which fails on hosts without CUDA.
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device=device)
+        tdist.barrier()
+        tdist.all_reduce(t)
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):
+        return torch.tensor(list(self.deque)).median().item()
+
+    @property
+    def avg(self):
+        return torch.tensor(list(self.deque), dtype=torch.float32).mean().item()
+
+    @property
+    def global_avg(self):
+        # Raises ZeroDivisionError while `count` is still 0.
+        return self.total / self.count
+
+    @property
+    def max(self):
+        return max(self.deque)
+
+    @property
+    def value(self):
+        return self.deque[-1]
+
+    def __str__(self):
+        return self.fmt.format(median=self.median, avg=self.avg, global_avg=self.global_avg,
+                               max=self.max, value=self.value)
+
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors)
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return [data]
+    # NOTE: every tensor below is created on "cuda", so this path needs a CUDA device.
+    # serialize the object into a byte tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    # exchange the byte length of each rank's payload
+    local_size = torch.tensor([tensor.numel()], device="cuda")
+    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
+    tdist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
+    if local_size != max_size:  # local_size is a one-element tensor here, not an int
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
+        tensor = torch.cat((tensor, padding), dim=0)
+    tdist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]  # drop the padding bytes
+        data_list.append(pickle.loads(buffer))  # NOTE(review): unpickles peer data; only safe among trusted ranks
+
+    return data_list
+
+
+def reduce_dict(input_dict, average=True) -> Dict[str, torch.Tensor]:
+    """Combine the values of `input_dict` across every process.
+
+    Args:
+        input_dict (dict): values to reduce; they are stacked, so they must
+            be tensors sharing one shape
+        average (bool): truthy -> divide the sums by the world size,
+            falsy -> keep the plain sums
+
+    Returns `input_dict` untouched when fewer than two processes run,
+    otherwise a new dict with the same keys holding the reduced values.
+    """
+    num_procs = get_world_size()
+    if num_procs < 2:
+        return input_dict
+
+    with torch.no_grad():
+        # iterate keys in sorted order so every process stacks them identically
+        names = sorted(input_dict.keys())
+        stacked = torch.stack([input_dict[name] for name in names], dim=0)
+        tdist.all_reduce(stacked)
+        if average:
+            stacked /= num_procs
+        reduced_dict = dict(zip(names, stacked))
+    return reduced_dict
+
+
+class MetricLogger(object):
+    """Collect named `SmoothedValue` meters and print periodic progress lines.
+
+    Meters are created on first use by `update` and are also readable as
+    attributes (`logger.loss`). `delimiter` separates the fields of a line.
+    """
+
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        """Feed each keyword value (float, int or one-element tensor) to its meter."""
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):
+        # Only reached when normal lookup fails. Read `meters` through
+        # `__dict__`: `self.meters` would re-enter this method (and recurse
+        # forever) on instances built without `__init__`, e.g. by copy/pickle.
+        meters = self.__dict__.get('meters', {})
+        if attr in meters:
+            return meters[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def __str__(self):
+        return self.delimiter.join(
+            "{}: {}".format(name, str(meter)) for name, meter in self.meters.items())
+
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None):
+        """Yield the items of `iterable`, printing a progress line every `print_freq` items.
+
+        `iterable` must support `len()`. A line is printed for every
+        `print_freq`-th item and for the last one; a total-time summary is
+        printed once the iterable is exhausted.
+        """
+        i = 0
+        if not header:
+            header = ''
+        num_items = len(iterable)
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        space_fmt = ':' + str(len(str(num_items))) + 'd'
+        fields = [
+            header,
+            '[{0' + space_fmt + '}/{1}]',
+            'eta: {eta}',
+            '{meters}',
+            'time: {time}',
+            'data: {data}',
+        ]
+        if torch.cuda.is_available():
+            fields.append('max mem: {memory:.0f}')
+        log_msg = self.delimiter.join(fields)
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)
+            yield obj
+            iter_time.update(time.time() - end)
+            if i % print_freq == 0 or i == num_items - 1:
+                eta_seconds = iter_time.global_avg * (num_items - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                extra = {}
+                if torch.cuda.is_available():
+                    extra['memory'] = torch.cuda.max_memory_allocated() / MB
+                print(log_msg.format(
+                    i, num_items, eta=eta_string, meters=str(self),
+                    time=str(iter_time), data=str(data_time), **extra))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        # max(..., 1): an empty iterable used to end in ZeroDivisionError here.
+        print('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / max(num_items, 1)))
+
diff --git a/rtdetr_pytorch/src/misc/visualizer.py b/rtdetr_pytorch/src/misc/visualizer.py
new file mode 100644
index 0000000..843f8eb
--- /dev/null
+++ b/rtdetr_pytorch/src/misc/visualizer.py
@@ -0,0 +1,34 @@
+"""by lyuwenyu
+"""
+
+import torch
+import torch.utils.data
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+
+import PIL 
+
+__all__ = ['show_sample']
+
+def show_sample(sample):
+    """Plot one `(image, target)` sample (coco dataset/dataloader style) with
+    `target["boxes"]` drawn in yellow.
+    """
+    import matplotlib.pyplot as plt
+    from torchvision.transforms.v2 import functional as F
+    from torchvision.utils import draw_bounding_boxes
+
+    img, tgt = sample
+    # PIL input is turned into an image tensor before the uint8 conversion
+    tensor_img = F.to_image_tensor(img) if isinstance(img, PIL.Image.Image) else img
+    tensor_img = F.convert_dtype(tensor_img, torch.uint8)
+    boxed = draw_bounding_boxes(tensor_img, tgt["boxes"], colors="yellow", width=3)
+
+    figure, axes = plt.subplots()
+    axes.imshow(boxed.permute(1, 2, 0).numpy())  # move axis 0 to the end for imshow
+    axes.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
+    figure.tight_layout()
+    figure.show()
+    plt.show()
+
diff --git a/rtdetr_pytorch/src/nn/__init__.py b/rtdetr_pytorch/src/nn/__init__.py
new file mode 100644
index 0000000..7df8a1c
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/__init__.py
@@ -0,0 +1,7 @@
+
+from .arch import *
+from .criterion import *
+
+# 
+from .backbone import *
+
diff --git a/rtdetr_pytorch/src/nn/arch/__init__.py b/rtdetr_pytorch/src/nn/arch/__init__.py
new file mode 100644
index 0000000..070f19b
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/arch/__init__.py
@@ -0,0 +1 @@
+from .classification import *
diff --git a/rtdetr_pytorch/src/nn/arch/classification.py b/rtdetr_pytorch/src/nn/arch/classification.py
new file mode 100644
index 0000000..2f1fa56
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/arch/classification.py
@@ -0,0 +1,41 @@
+import torch 
+import torch.nn as nn 
+
+from src.core import register
+
+
+__all__ = ['Classification', 'ClassHead']
+
+
+@register
+class Classification(nn.Module):
+    """Backbone followed by an optional head."""
+    __inject__ = ['backbone', 'head']
+
+    def __init__(self, backbone: nn.Module, head: nn.Module=None):
+        super().__init__()
+        self.backbone = backbone
+        self.head = head
+
+    def forward(self, x):
+        """Run the backbone; apply the head to its output unless `head` is None."""
+        feats = self.backbone(x)
+
+        if self.head is None:
+            return feats
+        return self.head(feats)
+
+
+@register
+class ClassHead(nn.Module):
+    """Global average pooling followed by a linear projection to `num_classes` outputs."""
+    def __init__(self, hidden_dim, num_classes):
+        super().__init__()
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.proj = nn.Linear(hidden_dim, num_classes)
+
+    def forward(self, x):
+        # a list/tuple input contributes only its first element
+        feat = x[0] if isinstance(x, (list, tuple)) else x
+        pooled = self.pool(feat)
+        return self.proj(pooled.reshape(pooled.shape[0], -1))
diff --git a/rtdetr_pytorch/src/nn/backbone/__init__.py b/rtdetr_pytorch/src/nn/backbone/__init__.py
new file mode 100644
index 0000000..f8571dc
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/__init__.py
@@ -0,0 +1,6 @@
+
+from .presnet import *
+from .test_resnet import *
+from .regnet import *
+from .common import *
+from .dla import *
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/nn/backbone/common.py b/rtdetr_pytorch/src/nn/backbone/common.py
new file mode 100644
index 0000000..72e38d7
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/common.py
@@ -0,0 +1,102 @@
+'''by lyuwenyu
+'''
+
+import torch 
+import torch.nn as nn
+
+
+
+class ConvNormLayer(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation.
+    `padding=None` selects `(kernel_size - 1) // 2`; `act=None` selects
+    `nn.Identity`, any other value is resolved through `get_activation`.
+    """
+    def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
+        super().__init__()
+        pad = padding if padding is not None else (kernel_size-1)//2
+        self.conv = nn.Conv2d(ch_in, ch_out, kernel_size, stride, padding=pad, bias=bias)
+        self.norm = nn.BatchNorm2d(ch_out)
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        normed = self.norm(self.conv(x))
+        return self.act(normed)
+
+
+class FrozenBatchNorm2d(nn.Module):
+    """BatchNorm2d whose statistics and affine parameters are fixed buffers.
+
+    Copied and modified from
+    https://github.com/facebookresearch/detr/blob/master/models/backbone.py
+    (itself copy-pasted from torchvision.misc.ops) with eps added before the
+    rsqrt; without it, models other than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, num_features, eps=1e-5):
+        super().__init__()
+        # buffers start as weight/running_var = ones, bias/running_mean = zeros
+        for buf_name, init in (("weight", torch.ones), ("bias", torch.zeros),
+                               ("running_mean", torch.zeros), ("running_var", torch.ones)):
+            self.register_buffer(buf_name, init(num_features))
+        self.eps = eps
+        self.num_features = num_features
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        # this module has no `num_batches_tracked` buffer, so discard that
+        # entry from the incoming state dict before the default loading runs
+        state_dict.pop(prefix + 'num_batches_tracked', None)
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def forward(self, x):
+        # reshape first (fuser-friendly), then fold the statistics into one
+        # per-channel scale and shift
+        def as_4d(t):
+            return t.reshape(1, -1, 1, 1)
+
+        scale = as_4d(self.weight) * (as_4d(self.running_var) + self.eps).rsqrt()
+        shift = as_4d(self.bias) - as_4d(self.running_mean) * scale
+        return x * scale + shift
+
+    def extra_repr(self):
+        # the instance __dict__ supplies both `num_features` and `eps`
+        return "{num_features}, eps={eps}".format(**self.__dict__)
+
+
+def get_activation(act: str, inpace: bool=True):
+    '''Build an activation module from its name.
+
+    Args:
+        act: case-insensitive name ('silu', 'relu', 'leaky_relu', 'gelu'),
+            `None` (-> `nn.Identity`), or an `nn.Module`, which is used as is.
+        inpace: value assigned to the module's `inplace` attribute when it
+            has one (the misspelt name is kept for existing callers).
+
+    Raises:
+        RuntimeError: if `act` is a string that names no known activation.
+    '''
+    # None / nn.Module must be handled before `.lower()`: the original called
+    # `act.lower()` first, so both cases died with AttributeError.
+    if act is None:
+        m = nn.Identity()
+    elif isinstance(act, nn.Module):
+        m = act
+    else:
+        factories = {
+            'silu': nn.SiLU,
+            'relu': nn.ReLU,
+            'leaky_relu': nn.LeakyReLU,
+            'gelu': nn.GELU,
+        }
+        name = act.lower()
+        if name not in factories:
+            raise RuntimeError('unsupported activation: {!r}'.format(act))
+        m = factories[name]()
+
+    if hasattr(m, 'inplace'):
+        m.inplace = inpace
+    return m
diff --git a/rtdetr_pytorch/src/nn/backbone/dla.py b/rtdetr_pytorch/src/nn/backbone/dla.py
new file mode 100644
index 0000000..23c9cd6
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/dla.py
@@ -0,0 +1,452 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import logging
+from os.path import join
+
+import torch
+from torch import nn
+import torch.utils.model_zoo as model_zoo
+# from mmdet.models.builder import BACKBONES
+from src.core import register
+
+
+BN_MOMENTUM = 0.1  # momentum passed to the nn.BatchNorm2d layers built in this module
+logger = logging.getLogger(__name__)  # module-level logger
+
+
+def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
+    """Return the download URL `http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth` ('/'-joined on every OS)."""
+    return '/'.join(('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)))
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Bias-free 3x3 convolution with padding 1 and the given stride."""
+    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    # one pixel of padding on each side of the 3x3 window
+    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 convolutions, each followed by BatchNorm.
+
+    Args:
+        inplanes: input channels of the first convolution.
+        planes: output channels of both convolutions.
+        stride: stride of the first convolution (the second always uses 1).
+        dilation: dilation of both convolutions, also used as their padding.
+    """
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BasicBlock, self).__init__()
+        # keyword arguments shared by the two convolutions
+        common = dict(kernel_size=3, padding=dilation, bias=False, dilation=dilation)
+        self.conv1 = nn.Conv2d(inplanes, planes, stride=stride, **common)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, stride=1, **common)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Run both stages, add the skip tensor in place, then apply ReLU.
+
+        Args:
+            x: input tensor.
+            residual: tensor added before the last ReLU; `x` is used when
+                this is None.
+        """
+        skip = x if residual is None else residual
+
+        # stage 1: conv -> BN -> ReLU
+        y = self.relu(self.bn1(self.conv1(x)))
+
+        # stage 2: conv -> BN (ReLU comes after the addition)
+        y = self.bn2(self.conv2(y))
+
+        # in-place add, exactly as before, followed by the final ReLU
+        y += skip
+        y = self.relu(y)
+        return y
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 residual block; the inner width is `planes // expansion`.
+
+    Args:
+        inplanes: input channels of the first 1x1 convolution.
+        planes: output channels of the last 1x1 convolution.
+        stride: stride of the 3x3 convolution.
+        dilation: dilation (and padding) of the 3x3 convolution.
+    """
+
+    expansion = 2
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(Bottleneck, self).__init__()
+        width = planes // Bottleneck.expansion
+
+        # 1x1 into the inner width
+        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width, momentum=BN_MOMENTUM)
+        # 3x3 (the only strided / dilated convolution)
+        self.conv2 = nn.Conv2d(
+            width, width, kernel_size=3, stride=stride,
+            padding=dilation, bias=False, dilation=dilation,
+        )
+        self.bn2 = nn.BatchNorm2d(width, momentum=BN_MOMENTUM)
+        # 1x1 out to `planes`
+        self.conv3 = nn.Conv2d(width, planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Apply the three stages and add `residual` (`x` when None) before the last ReLU."""
+        skip = x if residual is None else residual
+
+        y = self.relu(self.bn1(self.conv1(x)))
+        y = self.relu(self.bn2(self.conv2(y)))
+        y = self.bn3(self.conv3(y))
+
+        # in-place residual addition, then the closing activation
+        y += skip
+        y = self.relu(y)
+        return y
+
+
+class BottleneckX(nn.Module):
+    """Bottleneck variant whose 3x3 convolution is grouped (`cardinality` groups).
+
+    The inner width is `planes * cardinality // 32`.
+
+    Args:
+        inplanes: input channels of the first 1x1 convolution.
+        planes: output channels of the last 1x1 convolution.
+        stride: stride of the grouped 3x3 convolution.
+        dilation: dilation (and padding) of the grouped 3x3 convolution.
+    """
+
+    expansion = 2
+    cardinality = 32
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BottleneckX, self).__init__()
+        groups = BottleneckX.cardinality
+        # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
+        # bottle_planes = dim * cardinality
+        width = planes * groups // 32
+
+        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(width, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(
+            width, width, kernel_size=3, stride=stride, padding=dilation,
+            bias=False, dilation=dilation, groups=groups,
+        )
+        self.bn2 = nn.BatchNorm2d(width, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(width, planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Apply the three stages and add `residual` (`x` when None) before the last ReLU."""
+        skip = x if residual is None else residual
+
+        # 1x1 -> grouped 3x3 -> 1x1, each followed by BatchNorm
+        y = self.relu(self.bn1(self.conv1(x)))
+        y = self.relu(self.bn2(self.conv2(y)))
+        y = self.bn3(self.conv3(y))
+
+        # in-place residual addition, then the closing activation
+        y += skip
+        y = self.relu(y)
+        return y
+
+
+class Root(nn.Module):
+    """Fuse several feature maps: concatenate on dim 1, 1x1 conv, BN, ReLU.
+
+    `kernel_size` only sets the padding, `(kernel_size - 1) // 2`; the
+    convolution kernel itself is always 1. With `residual` truthy the first
+    input is added to the normalized result before the ReLU.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, residual):
+        super(Root, self).__init__()
+        pad = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=1, bias=False, padding=pad)
+        self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.residual = residual
+
+    def forward(self, *x):
+        inputs = x
+        fused = self.bn(self.conv(torch.cat(inputs, 1)))
+        if self.residual:
+            # in-place add of the first input
+            fused += inputs[0]
+
+        return self.relu(fused)
+
+
+class Tree(nn.Module):
+    """Recursive node with two branches (`tree1`, `tree2`); at `levels == 1` the branches are `block`s fused by a `Root`."""
+    def __init__(
+        self,
+        levels,  # recursion depth; 1 builds the leaf form (two blocks + Root)
+        block,  # block class, called as block(in_ch, out_ch, stride, dilation=...)
+        in_channels,
+        out_channels,
+        stride=1,  # used by tree1 and, when > 1, by the max-pool in `downsample`
+        level_root=False,  # when True, the (downsampled) input is also passed to the root
+        root_dim=0,  # input channels of the Root; 0 means 2 * out_channels
+        root_kernel_size=1,
+        dilation=1,
+        root_residual=False,
+    ):
+        super(Tree, self).__init__()
+        if root_dim == 0:
+            root_dim = 2 * out_channels
+        if level_root:
+            root_dim += in_channels  # room for the extra `bottom` child
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride, dilation=dilation)
+            self.tree2 = block(out_channels, out_channels, 1, dilation=dilation)
+        else:
+            self.tree1 = Tree(
+                levels - 1,
+                block,
+                in_channels,
+                out_channels,
+                stride,
+                root_dim=0,
+                root_kernel_size=root_kernel_size,
+                dilation=dilation,
+                root_residual=root_residual,
+            )
+            self.tree2 = Tree(
+                levels - 1,
+                block,
+                out_channels,
+                out_channels,
+                root_dim=root_dim + out_channels,  # its root also receives tree1's output
+                root_kernel_size=root_kernel_size,
+                dilation=dilation,
+                root_residual=root_residual,
+            )
+        if levels == 1:  # `self.root` exists only on leaf-level trees
+            self.root = Root(root_dim, out_channels, root_kernel_size, root_residual)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.levels = levels
+        if stride > 1:
+            self.downsample = nn.MaxPool2d(stride, stride=stride)
+        if levels == 1 and in_channels != out_channels:
+            self.project = nn.Sequential(  # 1x1 conv + BN applied to the downsampled input
+                nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, bias=False
+                ),
+                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
+            )
+
+    def forward(self, x, residual=None, children=None):
+        children = [] if children is None else children  # a list passed by the caller is appended to in place
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom  # the incoming `residual` argument is overwritten here
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.levels == 1:
+            x2 = self.tree2(x1)
+            x = self.root(x2, x1, *children)  # fuse both block outputs with the collected children
+        else:
+            children.append(x1)
+            x = self.tree2(x1, children=children)  # the deeper tree's root does the fusing
+        return x
+
+class DLA(nn.Module):
+    """Backbone of a 7x7 stem, two plain conv levels and four `Tree` levels; `forward` returns the outputs of the levels in `out_indices`."""
+    def __init__(
+        self,
+        levels,  # indexed 0..5: conv count for level0/1, tree depth for level2..5
+        channels,  # indexed 0..5: output channels of each level
+        num_classes=1000,  # stored on the instance; not otherwise used in this class
+        block=BasicBlock,  # block class handed to every Tree
+        out_indices=(2, 3, 4, 5),  # level indices whose outputs `forward` collects
+        residual_root=False,  # forwarded to each Tree as `root_residual`
+        linear_root=False,  # accepted but unused in this class
+    ):
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.num_classes = num_classes
+        self.out_indices = out_indices
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3, bias=False),
+            nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+        )
+        self.level0 = self._make_conv_level(channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2
+        )
+        self.level2 = Tree(
+            levels[2],
+            block,
+            channels[1],
+            channels[2],
+            2,
+            level_root=False,
+            root_residual=residual_root,
+        )
+        self.level3 = Tree(
+            levels[3],
+            block,
+            channels[2],
+            channels[3],
+            2,
+            level_root=True,
+            root_residual=residual_root,
+        )
+        self.level4 = Tree(
+            levels[4],
+            block,
+            channels[3],
+            channels[4],
+            2,
+            level_root=True,
+            root_residual=residual_root,
+        )
+        self.level5 = Tree(
+            levels[5],
+            block,
+            channels[4],
+            channels[5],
+            2,
+            level_root=True,
+            root_residual=residual_root,
+        )
+
+        # for m in self.modules():
+        #     if isinstance(m, nn.Conv2d):
+        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        #         m.weight.data.normal_(0, math.sqrt(2. / n))
+        #     elif isinstance(m, nn.BatchNorm2d):
+        #         m.weight.data.fill_(1)
+        #         m.bias.data.zero_()
+
+    def _make_level(self, block, inplanes, planes, blocks, stride=1):  # NOTE(review): not called anywhere in this class
+        downsample = None
+        if stride != 1 or inplanes != planes:
+            downsample = nn.Sequential(
+                nn.MaxPool2d(stride, stride=stride),
+                nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample=downsample))  # NOTE(review): the block classes in this file take no `downsample` argument
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):  # `convs` x (3x3 conv, BN, ReLU)
+        modules = []
+        for i in range(convs):
+            modules.extend(
+                [
+                    nn.Conv2d(
+                        inplanes,
+                        planes,
+                        kernel_size=3,
+                        stride=stride if i == 0 else 1,  # only the first conv is strided
+                        padding=dilation,
+                        bias=False,
+                        dilation=dilation,
+                    ),
+                    nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+                    nn.ReLU(inplace=True),
+                ]
+            )
+            inplanes = planes  # later convs map planes -> planes
+        return nn.Sequential(*modules)
+
+    def forward(self, x):  # returns a list, one entry per level index found in `out_indices`
+        y = []
+        x = self.base_layer(x)
+        for i in range(6):
+            x = getattr(self, 'level{}'.format(i))(x)  # level0 .. level5 run in sequence
+            if i in self.out_indices:
+                y.append(x)
+        return y
+
+    def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):  # loads weights non-strictly
+        # fc = self.fc
+        if name.endswith('.pth'):
+            model_weights = torch.load(data + name)  # plain string concatenation: `data` acts as a path prefix
+        else:
+            model_url = get_model_url(data, name, hash)
+            model_weights = model_zoo.load_url(model_url)
+        self.load_state_dict(model_weights, strict=False)  # strict=False: key mismatches do not raise
+        # self.fc = fc
+
+def dla34(pretrained=True, levels=None, in_channels=None, **kwargs):  # DLA-34: BasicBlock DLA; ImageNet weights loaded when `pretrained` is truthy
+    net = DLA(block=BasicBlock, levels=levels, channels=in_channels, **kwargs)
+    if pretrained:
+        net.load_pretrained_model(name='dla34', hash='ba72cf86', data='imagenet')
+    return net
+
+@register
+class DLANet(nn.Module):
+    """Wrap a DLA factory and expose a contiguous slice of its feature maps."""
+    def __init__(
+        self,
+        dla='dla34',
+        pretrained=True,
+        levels=[1, 1, 1, 2, 2, 1],
+        in_channels=[16, 32, 64, 128, 256, 512],
+        return_index = [1, 2, 3],
+        cfg=None,
+    ):
+        super(DLANet, self).__init__()
+        self.cfg, self.in_channels = cfg, in_channels
+        # NOTE(review): `dla` is evaluated as a Python expression to find the
+        # factory (e.g. `dla34`); it must never come from untrusted input.
+        factory = eval(dla)
+        self.model = factory(pretrained=pretrained, levels=levels, in_channels=in_channels)
+        self.return_index = return_index
+
+    def forward(self, x):
+        feats = self.model(x)
+        hi, lo = max(self.return_index), min(self.return_index)  # inclusive slice bounds
+        return feats[lo:hi + 1]
+
+
+class Identity(nn.Module):
+    """Module that returns its input unchanged."""
+
+    def forward(self, x):
+        # no parameters, no computation
+        return x
+
+
+def fill_fc_weights(layers):
+    """Zero the bias of every nn.Conv2d found in `layers.modules()` that has one."""
+    for module in layers.modules():
+        if isinstance(module, nn.Conv2d) and module.bias is not None:
+            nn.init.constant_(module.bias, 0)
+
+
+def fill_up_weights(up):
+    """Overwrite `up.weight` in place: build one 2D kernel in slot [0, 0] and copy it to [k, 0] for every other k."""
+    w = up.weight.data
+    half = math.ceil(w.size(2) / 2)
+    center = (2 * half - 1 - half % 2) / (2.0 * half)
+    for i, j in ((row, col) for row in range(w.size(2)) for col in range(w.size(3))):
+        w[0, 0, i, j] = (1 - math.fabs(i / half - center)) * (1 - math.fabs(j / half - center))
+    for k in range(1, w.size(0)):
+        w[k, 0, :, :] = w[0, 0, :, :]
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/nn/backbone/presnet.py b/rtdetr_pytorch/src/nn/backbone/presnet.py
new file mode 100644
index 0000000..2a6b4ba
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/presnet.py
@@ -0,0 +1,225 @@
+'''by lyuwenyu
+'''
+import torch
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+from collections import OrderedDict
+
+from .common import get_activation, ConvNormLayer, FrozenBatchNorm2d
+
+from src.core import register
+
+
+__all__ = ['PResNet']
+
+
+ResNet_cfg = {  # depth -> block count for each of the four stages (read by PResNet as ResNet_cfg[depth][i])
+    18: [2, 2, 2, 2],
+    34: [3, 4, 6, 3],
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    # 152: [3, 8, 36, 3],
+}
+
+
+donwload_url = {  # depth -> URL of the pretrained state_dict; the misspelled name is kept because PResNet references it
+    18: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet18_vd_pretrained_from_paddle.pth',
+    34: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet34_vd_pretrained_from_paddle.pth',
+    50: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet50_vd_ssld_v2_pretrained_from_paddle.pth',
+    101: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet101_vd_ssld_pretrained_from_paddle.pth',
+}
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 ``ConvNormLayer``s.
+
+    With a falsy ``shortcut`` the skip path goes through ``self.short``: an
+    average pool followed by a 1x1 layer for variant 'd' at stride 2,
+    otherwise a single 1x1 layer carrying the stride. With a truthy
+    ``shortcut`` the input is added back unchanged.
+    """
+    expansion = 1
+
+    def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'):
+        super().__init__()
+        self.shortcut = shortcut
+
+        # Skip-path projection; created first so the submodule order is short, branch2a, branch2b.
+        if not shortcut:
+            pooled = variant == 'd' and stride == 2
+            self.short = nn.Sequential(OrderedDict([
+                ('pool', nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
+                ('conv', ConvNormLayer(ch_in, ch_out, 1, 1))
+            ])) if pooled else ConvNormLayer(ch_in, ch_out, 1, stride)
+
+        # Main branch: only the first layer receives the stride and the activation.
+        self.branch2a = ConvNormLayer(ch_in, ch_out, 3, stride, act=act)
+        self.branch2b = ConvNormLayer(ch_out, ch_out, 3, 1, act=None)
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        # Main branch, then the skip path.
+        main = self.branch2b(self.branch2a(x))
+        skip = x if self.shortcut else self.short(x)
+
+        # Sum the two paths and apply the block activation.
+        return self.act(main + skip)
+
+
+class BottleNeck(nn.Module):
+    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 ``ConvNormLayer``s, output width ``ch_out * expansion``.
+
+    Variant 'a' puts the stride on the first 1x1 layer; every other variant
+    puts it on the 3x3 layer. With a falsy ``shortcut`` the skip path is
+    projected by ``self.short`` (average pool + 1x1 layer for variant 'd' at
+    stride 2, otherwise a strided 1x1 layer); with a truthy ``shortcut`` the
+    input is added back unchanged.
+    """
+    expansion = 4
+
+    def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'):
+        super().__init__()
+
+        # (stride of branch2a, stride of branch2b)
+        strides = (stride, 1) if variant == 'a' else (1, stride)
+        mid = ch_out
+        wide = ch_out * self.expansion
+
+        # Main branch; the last layer has no activation of its own.
+        self.branch2a = ConvNormLayer(ch_in, mid, 1, strides[0], act=act)
+        self.branch2b = ConvNormLayer(mid, mid, 3, strides[1], act=act)
+        self.branch2c = ConvNormLayer(mid, wide, 1, 1)
+
+        # Skip path: a projection is only built when ``shortcut`` is falsy.
+        self.shortcut = shortcut
+        if not shortcut:
+            pooled = variant == 'd' and stride == 2
+            self.short = nn.Sequential(OrderedDict([
+                ('pool', nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
+                ('conv', ConvNormLayer(ch_in, wide, 1, 1))
+            ])) if pooled else ConvNormLayer(ch_in, wide, 1, stride)
+
+        # Activation applied after the residual sum.
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        # Main branch, then the skip path.
+        main = self.branch2c(self.branch2b(self.branch2a(x)))
+        skip = x if self.shortcut else self.short(x)
+
+        # Sum the two paths and apply the block activation.
+        return self.act(main + skip)
+
+
+class Blocks(nn.Module):
+    """A stage of ``count`` ``block`` instances applied one after another.
+
+    The first block is built with ``shortcut=False`` and stride 2 (stride 1
+    when ``stage_num`` is 2); the rest use stride 1, ``shortcut=True`` and
+    ``ch_out * block.expansion`` input channels.
+    """
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act='relu', variant='b'):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        for idx in range(count):
+            is_first = idx == 0
+            self.blocks.append(block(
+                ch_in, ch_out,
+                stride=2 if is_first and stage_num != 2 else 1,
+                shortcut=not is_first,
+                variant=variant, act=act))
+            if is_first:
+                ch_in = ch_out * block.expansion
+
+    def forward(self, x):
+        for layer in self.blocks:
+            x = layer(x)
+        return x
+
+
+@register
+class PResNet(nn.Module):
+    """ResNet-style backbone returning the feature maps of the stages listed in ``return_idx``.
+
+    ``depth`` picks the per-stage block counts from ``ResNet_cfg``; ``freeze_at >= 0`` freezes the stem and the
+    first ``freeze_at`` stages; ``freeze_norm`` replaces every ``nn.BatchNorm2d`` with a new ``FrozenBatchNorm2d``;
+    ``pretrained`` downloads the state_dict registered for ``depth`` and loads it (deserialized on the CPU).
+    """
+    def __init__(
+        self,
+        depth,
+        variant='d',
+        num_stages=4,
+        return_idx=[0, 1, 2, 3],
+        act='relu',
+        freeze_at=-1,
+        freeze_norm=True,
+        pretrained=False):
+        super().__init__()
+        block_nums = ResNet_cfg[depth]
+        ch_in = 64
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, "conv1_1"]]
+        self.conv1 = nn.Sequential(OrderedDict([
+            (_name, ConvNormLayer(c_in, c_out, k, s, act=act)) for c_in, c_out, k, s, _name in conv_def
+        ]))
+
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+        _out_channels = [block.expansion * v for v in ch_out_list]
+        _out_strides = [4, 8, 16, 32]
+
+        self.res_layers = nn.ModuleList()
+        for i in range(num_stages):
+            # Stages are numbered from 2; Blocks keeps stride 1 for stage 2.
+            self.res_layers.append(
+                Blocks(block, ch_in, ch_out_list[i], block_nums[i], i + 2, act=act, variant=variant))
+            ch_in = _out_channels[i]
+        self.return_idx = return_idx
+        self.out_channels = [_out_channels[_i] for _i in return_idx]
+        self.out_strides = [_out_strides[_i] for _i in return_idx]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            for i in range(min(freeze_at, num_stages)):
+                self._freeze_parameters(self.res_layers[i])
+        if freeze_norm:
+            self._freeze_norm(self)
+
+        if pretrained:
+            # Deserialize on the CPU so the checkpoint loads regardless of the device it was saved from.
+            state = torch.hub.load_state_dict_from_url(donwload_url[depth], map_location='cpu')
+            self.load_state_dict(state)
+            print(f'Load PResNet{depth} state_dict')
+
+    def _freeze_parameters(self, m: nn.Module):
+        for p in m.parameters():
+            p.requires_grad = False
+
+    def _freeze_norm(self, m: nn.Module):
+        """Recursively swap nn.BatchNorm2d children for new FrozenBatchNorm2d modules; returns the resulting module."""
+        if isinstance(m, nn.BatchNorm2d):
+            m = FrozenBatchNorm2d(m.num_features)
+        else:
+            for name, child in m.named_children():
+                _child = self._freeze_norm(child)
+                if _child is not child:
+                    setattr(m, name, _child)
+        return m
+
+    def forward(self, x):
+        x = F.max_pool2d(self.conv1(x), kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
+
+
diff --git a/rtdetr_pytorch/src/nn/backbone/regnet.py b/rtdetr_pytorch/src/nn/backbone/regnet.py
new file mode 100644
index 0000000..2282910
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/regnet.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn 
+from transformers import RegNetModel
+
+
+from src.core import register
+
+__all__ = ['RegNet']
+
+@register
+class RegNet(nn.Module):
+    """Backbone wrapping the Hugging Face ``facebook/regnet-y-040`` RegNetModel.
+
+    ``configuration`` is accepted but not used here, and ``return_idx`` is only
+    stored: forward always returns ``hidden_states[2:5]``.
+    """
+    def __init__(self, configuration, return_idx=[0, 1, 2, 3]):
+        super().__init__()
+        self.model = RegNetModel.from_pretrained("facebook/regnet-y-040")
+        self.return_idx = return_idx
+
+    def forward(self, x):
+        return self.model(x, output_hidden_states=True).hidden_states[2:5]
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/nn/backbone/test_resnet.py b/rtdetr_pytorch/src/nn/backbone/test_resnet.py
new file mode 100644
index 0000000..6639d79
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/test_resnet.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+from collections import OrderedDict
+
+
+from src.core import register
+
+
+class BasicBlock(nn.Module):
+    """Residual block of two 3x3 conv + BatchNorm layers with ReLU; the skip path is a 1x1 conv + BatchNorm
+    projection when the stride or channel count changes, and an empty nn.Sequential otherwise."""
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super().__init__()
+        out_planes = self.expansion * planes
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        if stride == 1 and in_planes == out_planes:
+            self.shortcut = nn.Sequential()
+        else:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_planes),
+            )
+
+    def forward(self, x):
+        y = F.relu(self.bn1(self.conv1(x)))
+        y = self.bn2(self.conv2(y)) + self.shortcut(x)
+        return F.relu(y)
+
+
+
+class _ResNet(nn.Module):
+    """Small ResNet classifier: a stride-1 3x3 stem, four stages (64/128/256/512 planes, first-block
+    strides 1/2/2/2), a 4x4 average pool and a linear head with ``num_classes`` outputs."""
+    def __init__(self, block, num_blocks, num_classes=10):
+        super().__init__()
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+
+        # Stages are exposed as the attributes layer1..layer4, in that order.
+        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
+        for idx, (planes, stride) in enumerate(stage_specs):
+            setattr(self, f'layer{idx + 1}', self._make_layer(block, planes, num_blocks[idx], stride))
+
+        self.linear = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        """Stack ``num_blocks`` blocks; only the first uses ``stride``. Advances ``self.in_planes``."""
+        stage = []
+        for block_stride in [stride] + [1] * (num_blocks - 1):
+            stage.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*stage)
+
+    def forward(self, x):
+        feat = F.relu(self.bn1(self.conv1(x)))
+        for name in ('layer1', 'layer2', 'layer3', 'layer4'):
+            feat = getattr(self, name)(feat)
+        feat = F.avg_pool2d(feat, 4)
+        # Flatten everything after the batch dimension before the linear head.
+        feat = feat.view(feat.size(0), -1)
+        return self.linear(feat)
+        
+
+@register
+class MResNet(nn.Module):
+    """Registered wrapper that builds a ``_ResNet`` of ``BasicBlock``s and forwards inputs to it."""
+    def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None:
+        super().__init__()
+        self.model = _ResNet(BasicBlock, num_blocks, num_classes)
+    def forward(self, x):
+        return self.model(x)
+
diff --git a/rtdetr_pytorch/src/nn/backbone/utils.py b/rtdetr_pytorch/src/nn/backbone/utils.py
new file mode 100644
index 0000000..ee250b1
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/backbone/utils.py
@@ -0,0 +1,58 @@
+"""
+https://github.com/pytorch/vision/blob/main/torchvision/models/_utils.py
+
+by lyuwenyu
+"""
+
+from collections import OrderedDict
+from typing import Dict, List
+
+
+import torch.nn as nn 
+
+
+class IntermediateLayerGetter(nn.ModuleDict):
+    """
+    Run a model's direct children in registration order and collect the
+    outputs of the requested ones.
+
+    Children are kept until every requested name has been seen; later ones
+    are dropped. This relies on the children having been registered in the
+    order they are used, so a module must **not** be reused inside the
+    wrapped model's forward.
+
+    Only direct children can be requested: `model.feature1` works,
+    `model.feature1.layer2` does not. `forward` returns a list of outputs in
+    execution order.
+    """
+
+    _version = 3
+
+    def __init__(self, model: nn.Module, return_layers: List[str]) -> None:
+        # Every requested name must be a direct child of the model.
+        child_names = [name for name, _ in model.named_children()]
+        if not set(return_layers).issubset(child_names):
+            raise ValueError("return_layers are not present in model. {}".format(child_names))
+
+        # Keep children until all requested names have been encountered.
+        pending = {str(k) for k in return_layers}
+        kept = OrderedDict()
+        for name, module in model.named_children():
+            kept[name] = module
+            pending.discard(name)
+            if not pending:
+                break
+
+        super().__init__(kept)
+        self.return_layers = return_layers
+
+    def forward(self, x):
+        # Feed x through the kept children, recording the requested outputs.
+        outputs = []
+        for name, module in self.items():
+            x = module(x)
+            if name in self.return_layers:
+                outputs.append(x)
+
+        return outputs
+
diff --git a/rtdetr_pytorch/src/nn/criterion/__init__.py b/rtdetr_pytorch/src/nn/criterion/__init__.py
new file mode 100644
index 0000000..9804569
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/criterion/__init__.py
@@ -0,0 +1,6 @@
+
+import torch.nn as nn 
+from src.core import register
+
+CrossEntropyLoss = register(nn.CrossEntropyLoss)  # torch's CrossEntropyLoss passed through src.core.register
+
diff --git a/rtdetr_pytorch/src/nn/criterion/utils.py b/rtdetr_pytorch/src/nn/criterion/utils.py
new file mode 100644
index 0000000..7d8833e
--- /dev/null
+++ b/rtdetr_pytorch/src/nn/criterion/utils.py
@@ -0,0 +1,20 @@
+import torch 
+import torchvision
+
+
+
+def format_target(targets):
+    '''Flatten per-image targets into one tensor (boxes converted xyxy -> cxcywh).
+
+    Args:
+        targets (List[Dict]): per-image dicts with 'boxes' and 'labels'.
+    Return:
+        tensor (Tensor): rows of [im_id, label, bbox]; im_id is the dict's index in ``targets``.
+    '''
+    chunks = []
+    for image_idx, target in enumerate(targets):
+        cxcywh = torchvision.ops.box_convert(target['boxes'], in_fmt='xyxy', out_fmt='cxcywh')
+        labels = target['labels'].reshape(-1, 1)
+        chunks.append(torch.cat([torch.ones_like(labels) * image_idx, labels, cxcywh], dim=1))
+
+    return torch.cat(chunks, dim=0)
diff --git a/rtdetr_pytorch/src/optim/__init__.py b/rtdetr_pytorch/src/optim/__init__.py
new file mode 100644
index 0000000..1bd7c81
--- /dev/null
+++ b/rtdetr_pytorch/src/optim/__init__.py
@@ -0,0 +1,4 @@
+
+from .ema import *
+from .optim import *
+from .amp import *
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/optim/amp.py b/rtdetr_pytorch/src/optim/amp.py
new file mode 100644
index 0000000..e43d021
--- /dev/null
+++ b/rtdetr_pytorch/src/optim/amp.py
@@ -0,0 +1,12 @@
+import torch
+import torch.nn as nn 
+import torch.cuda.amp as amp
+
+
+from src.core import register
+import src.misc.dist as dist 
+
+
+__all__ = ['GradScaler']
+
+GradScaler = register(amp.grad_scaler.GradScaler)  # torch.cuda.amp's GradScaler passed through src.core.register
diff --git a/rtdetr_pytorch/src/optim/ema.py b/rtdetr_pytorch/src/optim/ema.py
new file mode 100644
index 0000000..bf962b3
--- /dev/null
+++ b/rtdetr_pytorch/src/optim/ema.py
@@ -0,0 +1,115 @@
+"""
+reference: 
+https://github.com/ultralytics/yolov5/blob/master/utils/torch_utils.py#L404
+
+by lyuwenyu
+"""
+
+import torch
+import torch.nn as nn 
+
+import math
+from copy import deepcopy
+
+
+
+from src.core import register
+import src.misc.dist as dist 
+
+
+__all__ = ['ModelEMA']
+
+
+
+@register
+class ModelEMA(object):
+    """Exponential moving average of a model's state_dict (floating-point entries only).
+
+    Adapted from https://github.com/rwightman/pytorch-image-models, in the spirit of
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+    The averaged copy lives in ``self.module`` (eval mode, gradients disabled). The effective
+    decay ramps up as ``decay * (1 - exp(-updates / warmups))``. Where this object is created
+    relative to model init, device placement and distributed wrapping matters.
+    """
+    def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=2000):
+        super().__init__()
+
+        # Deep copy of dist.de_parallel(model), switched to eval mode (FP32 EMA).
+        self.module = deepcopy(dist.de_parallel(model)).eval()
+        for param in self.module.parameters():
+            param.requires_grad_(False)
+
+        self.decay = decay
+        self.warmups = warmups
+        self.updates = 0  # number of EMA updates
+        # Exponential ramp of the decay (to help early epochs).
+        self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups))
+
+    def update(self, model: nn.Module):
+        """Blend ``model``'s current state into the average, in place."""
+        with torch.no_grad():
+            self.updates += 1
+            d = self.decay_fn(self.updates)
+            source = dist.de_parallel(model).state_dict()
+            for key, avg in self.module.state_dict().items():
+                if not avg.dtype.is_floating_point:
+                    continue  # non-floating-point entries are left untouched
+                avg *= d
+                avg += (1 - d) * source[key].detach()
+
+    def to(self, *args, **kwargs):
+        """Forward the arguments to ``self.module.to`` and return ``self`` so calls can be chained."""
+        self.module = self.module.to(*args, **kwargs)
+        return self
+
+    def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
+        """Copy attributes of ``model`` onto the averaged module via ``copy_attr``."""
+        self.copy_attr(self.module, model, include, exclude)
+
+    @staticmethod
+    def copy_attr(a, b, include=(), exclude=()):
+        """Copy attributes from b to a, options to only include [...] and to exclude [...]."""
+        for name, value in b.__dict__.items():
+            skipped = (len(include) and name not in include) or name.startswith('_') or name in exclude
+            if not skipped:
+                setattr(a, name, value)
+
+    def state_dict(self, ):
+        """Return the averaged weights together with the ``updates`` and ``warmups`` values."""
+        return dict(module=self.module.state_dict(), updates=self.updates, warmups=self.warmups)
+
+    def load_state_dict(self, state):
+        """Restore the averaged weights and, when present, the ``updates`` counter."""
+        self.module.load_state_dict(state['module'])
+        if 'updates' in state:
+            self.updates = state['updates']
+
+    def forwad(self, ):
+        # Always raises; the method name is kept exactly as spelled for compatibility.
+        raise RuntimeError('ema...')
+
+    def extra_repr(self) -> str:
+        """Return the decay and warmup settings as a string."""
+        return f'decay={self.decay}, warmups={self.warmups}'
+
+
+
+
+class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
+    """EMA of model parameters built on ``torch.optim.swa_utils.AveragedModel``.
+
+    ``ema_avg = d * avg_model_param + (1 - d) * model_param`` where the effective decay is
+    ``d = decay * (1 - exp(-num_averaged / 2000))``.
+    """
+    def __init__(self, model, decay, device="cpu", use_buffers=True):
+        def ramp(x):
+            return decay * (1 - math.exp(-x / 2000))
+        self.decay_fn = ramp
+
+        def ema_avg(avg_model_param, model_param, num_averaged):
+            weight = self.decay_fn(num_averaged)
+            return weight * avg_model_param + (1 - weight) * model_param
+        super().__init__(model, device, ema_avg, use_buffers=use_buffers)
+
+
+
diff --git a/rtdetr_pytorch/src/optim/optim.py b/rtdetr_pytorch/src/optim/optim.py
new file mode 100644
index 0000000..b10bd82
--- /dev/null
+++ b/rtdetr_pytorch/src/optim/optim.py
@@ -0,0 +1,22 @@
+
+import torch 
+import torch.nn as nn 
+import torch.optim as optim
+import torch.optim.lr_scheduler as lr_scheduler
+
+from src.core import register
+
+
+__all__ = ['AdamW', 'SGD', 'Adam', 'MultiStepLR', 'CosineAnnealingLR', 'OneCycleLR', 'LambdaLR']
+
+
+
+SGD = register(optim.SGD)  # torch.optim optimizers passed through src.core.register
+Adam = register(optim.Adam)
+AdamW = register(optim.AdamW)
+
+# torch.optim.lr_scheduler classes, registered the same way
+MultiStepLR = register(lr_scheduler.MultiStepLR)
+CosineAnnealingLR = register(lr_scheduler.CosineAnnealingLR)
+OneCycleLR = register(lr_scheduler.OneCycleLR)
+LambdaLR = register(lr_scheduler.LambdaLR)
diff --git a/rtdetr_pytorch/src/solver/__init__.py b/rtdetr_pytorch/src/solver/__init__.py
new file mode 100644
index 0000000..eddab7b
--- /dev/null
+++ b/rtdetr_pytorch/src/solver/__init__.py
@@ -0,0 +1,12 @@
+"""by lyuwenyu
+"""
+
+from .solver import BaseSolver
+from .det_solver import DetSolver
+
+
+from typing import Dict 
+
+TASKS :Dict[str, BaseSolver] = {  # task name -> solver class (the values are classes, not instances)
+    'detection': DetSolver,
+}
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/solver/det_engine.py b/rtdetr_pytorch/src/solver/det_engine.py
new file mode 100644
index 0000000..fbca083
--- /dev/null
+++ b/rtdetr_pytorch/src/solver/det_engine.py
@@ -0,0 +1,190 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+https://github.com/facebookresearch/detr/blob/main/engine.py
+
+by lyuwenyu
+"""
+
+import math
+import os
+import sys
+import pathlib
+from typing import Iterable
+
+import torch
+import torch.amp 
+
+from src.data import CocoEvaluator
+from src.misc import (MetricLogger, SmoothedValue, reduce_dict)
+
+
+def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
+                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
+                    device: torch.device, epoch: int, max_norm: float = 0, **kwargs):
+    """Train for one pass over ``data_loader`` and return ``{meter name: global average}``.
+
+    kwargs: ``print_freq`` (default 10), ``ema`` (updated after each step) and ``scaler`` (enables the
+    autocast path). Gradients are clipped when ``max_norm > 0``; a non-finite reduced loss exits with status 1.
+    """
+    model.train()
+    criterion.train()
+    metric_logger = MetricLogger(delimiter="  ")
+    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    header = 'Epoch: [{}]'.format(epoch)
+    print_freq = kwargs.get('print_freq', 10)
+
+    ema = kwargs.get('ema', None)
+    scaler = kwargs.get('scaler', None)
+    # autocast expects a bare device type ('cuda', 'cpu'); str(device) can be e.g. 'cuda:0', which it rejects.
+    device_type = torch.device(device).type
+
+    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
+        samples = samples.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+
+        if scaler is not None:
+            with torch.autocast(device_type=device_type, cache_enabled=True):
+                outputs = model(samples, targets)
+            with torch.autocast(device_type=device_type, enabled=False):
+                loss_dict = criterion(outputs, targets)
+
+            loss = sum(loss_dict.values())
+            scaler.scale(loss).backward()
+            if max_norm > 0:
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+
+        else:
+            outputs = model(samples, targets)
+            loss_dict = criterion(outputs, targets)
+            loss = sum(loss_dict.values())
+            optimizer.zero_grad()
+            loss.backward()
+            if max_norm > 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+            optimizer.step()
+
+        # ema
+        if ema is not None:
+            ema.update(model)
+
+        loss_dict_reduced = reduce_dict(loss_dict)
+        loss_value = sum(loss_dict_reduced.values())
+
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            print(loss_dict_reduced)
+            sys.exit(1)
+
+        metric_logger.update(loss=loss_value, **loss_dict_reduced)
+        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+
+@torch.no_grad()
+def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, data_loader, base_ds, device, output_dir):
+    """Run ``model`` over ``data_loader`` in eval mode and score the post-processed outputs with CocoEvaluator.
+
+    Returns ``(stats, coco_evaluator)``; ``stats`` holds 'coco_eval_bbox' / 'coco_eval_masks' lists for the
+    IoU types listed in ``postprocessors.iou_types``. ``criterion`` is only switched to eval mode and
+    ``output_dir`` is not used by the active code."""
+    model.eval()
+    criterion.eval()
+    metric_logger = MetricLogger(delimiter="  ")
+    header = 'Test:'
+
+    iou_types = postprocessors.iou_types
+    coco_evaluator = CocoEvaluator(base_ds, iou_types)
+
+    panoptic_evaluator = None  # panoptic evaluation is disabled; the checks on this variable below never fire
+    # if 'panoptic' in postprocessors.keys():
+    #     panoptic_evaluator = PanopticEvaluator(
+    #         data_loader.dataset.ann_file,
+    #         data_loader.dataset.ann_folder,
+    #         output_dir=os.path.join(output_dir, "panoptic_eval"),
+    #     )
+
+    for samples, targets in metric_logger.log_every(data_loader, 10, header):
+        samples = samples.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+
+        # with torch.autocast(device_type=str(device)):
+        #     outputs = model(samples)
+
+        outputs = model(samples)
+
+        # loss_dict = criterion(outputs, targets)
+        # weight_dict = criterion.weight_dict
+        # # reduce losses over all GPUs for logging purposes
+        # loss_dict_reduced = reduce_dict(loss_dict)
+        # loss_dict_reduced_scaled = {k: v * weight_dict[k]
+        #                             for k, v in loss_dict_reduced.items() if k in weight_dict}
+        # loss_dict_reduced_unscaled = {f'{k}_unscaled': v
+        #                               for k, v in loss_dict_reduced.items()}
+        # metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()),
+        #                      **loss_dict_reduced_scaled,
+        #                      **loss_dict_reduced_unscaled)
+        # metric_logger.update(class_error=loss_dict_reduced['class_error'])
+
+        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)        
+        results = postprocessors(outputs, orig_target_sizes)
+
+        # if 'segm' in postprocessors.keys():
+        #     target_sizes = torch.stack([t["size"] for t in targets], dim=0)
+        #     results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes)
+
+        res = {target['image_id'].item(): output for target, output in zip(targets, results)}
+        if coco_evaluator is not None:
+            coco_evaluator.update(res)
+
+        # if panoptic_evaluator is not None:
+        #     res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes)
+        #     for i, target in enumerate(targets):
+        #         image_id = target["image_id"].item()
+        #         file_name = f"{image_id:012d}.png"
+        #         res_pano[i]["image_id"] = image_id
+        #         res_pano[i]["file_name"] = file_name
+        #     panoptic_evaluator.update(res_pano)
+
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    if coco_evaluator is not None:
+        coco_evaluator.synchronize_between_processes()
+    if panoptic_evaluator is not None:
+        panoptic_evaluator.synchronize_between_processes()
+
+    # accumulate predictions from all images
+    if coco_evaluator is not None:
+        coco_evaluator.accumulate()
+        coco_evaluator.summarize()
+
+    # panoptic_res = None
+    # if panoptic_evaluator is not None:
+    #     panoptic_res = panoptic_evaluator.summarize()
+    
+    stats = {}
+    # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+    if coco_evaluator is not None:
+        if 'bbox' in iou_types:
+            stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()
+        if 'segm' in iou_types:
+            stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist()
+            
+    # if panoptic_res is not None:
+    #     stats['PQ_all'] = panoptic_res["All"]
+    #     stats['PQ_th'] = panoptic_res["Things"]
+    #     stats['PQ_st'] = panoptic_res["Stuff"]
+
+    return stats, coco_evaluator
+
+
+
diff --git a/rtdetr_pytorch/src/solver/det_solver.py b/rtdetr_pytorch/src/solver/det_solver.py
new file mode 100644
index 0000000..d0a0a84
--- /dev/null
+++ b/rtdetr_pytorch/src/solver/det_solver.py
@@ -0,0 +1,104 @@
+'''
+by lyuwenyu
+'''
+import time 
+import json
+import datetime
+
+import torch 
+
+from src.misc import dist
+from src.data import get_coco_api_from_dataset
+
+from .solver import BaseSolver
+from .det_engine import train_one_epoch, evaluate
+
+
+class DetSolver(BaseSolver):
+    """Detection solver: ``fit`` trains and evaluates every epoch, ``val`` runs a single evaluation."""
+
+    def fit(self, ):
+        """Train from ``last_epoch + 1`` up to ``cfg.epoches``; checkpoint, evaluate and log after each epoch."""
+        print("Start training")
+        self.train()
+
+        args = self.cfg
+        n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+        print('number of params:', n_parameters)
+
+        base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset)
+        best_stat = {'epoch': -1, }
+
+        start_time = time.time()
+        for epoch in range(self.last_epoch + 1, args.epoches):
+            if dist.is_dist_available_and_initialized():
+                self.train_dataloader.sampler.set_epoch(epoch)
+
+            train_stats = train_one_epoch(
+                self.model, self.criterion, self.train_dataloader, self.optimizer, self.device, epoch,
+                args.clip_max_norm, print_freq=args.log_step, ema=self.ema, scaler=self.scaler)
+            self.lr_scheduler.step()
+
+            if self.output_dir:
+                # Rolling checkpoint, plus a numbered one every ``checkpoint_step`` epochs.
+                checkpoint_paths = [self.output_dir / 'checkpoint.pth']
+                if (epoch + 1) % args.checkpoint_step == 0:
+                    checkpoint_paths.append(self.output_dir / f'checkpoint{epoch:04}.pth')
+                for checkpoint_path in checkpoint_paths:
+                    dist.save_on_master(self.state_dict(epoch), checkpoint_path)
+
+            # Evaluate the EMA module when an EMA is configured, the raw model otherwise.
+            module = self.ema.module if self.ema else self.model
+            test_stats, coco_evaluator = evaluate(
+                module, self.criterion, self.postprocessor, self.val_dataloader, base_ds, self.device, self.output_dir
+            )
+
+            # Track the best first entry of every reported metric and the epoch it was reached.
+            for k, values in test_stats.items():
+                score = values[0]
+                if k not in best_stat:
+                    best_stat['epoch'] = epoch
+                    best_stat[k] = score
+                    continue
+                if score > best_stat[k]:
+                    best_stat['epoch'] = epoch
+                best_stat[k] = max(best_stat[k], score)
+            print('best_stat: ', best_stat)
+
+            log_stats = {f'train_{k}': v for k, v in train_stats.items()}
+            log_stats.update({f'test_{k}': v for k, v in test_stats.items()})
+            log_stats.update(epoch=epoch, n_parameters=n_parameters)
+
+            if self.output_dir and dist.is_main_process():
+                # Append one JSON line per epoch to log.txt.
+                with (self.output_dir / "log.txt").open("a") as f:
+                    f.write(json.dumps(log_stats) + "\n")
+
+                # for evaluation logs
+                if coco_evaluator is not None:
+                    eval_dir = self.output_dir / 'eval'
+                    eval_dir.mkdir(exist_ok=True)
+                    if "bbox" in coco_evaluator.coco_eval:
+                        # Keep the latest bbox evaluation, plus a numbered copy when epoch % 50 == 0.
+                        filenames = ['latest.pth', f'{epoch:03}.pth'] if epoch % 50 == 0 else ['latest.pth']
+                        for name in filenames:
+                            torch.save(coco_evaluator.coco_eval["bbox"].eval, eval_dir / name)
+
+        elapsed = time.time() - start_time
+        elapsed_str = str(datetime.timedelta(seconds=int(elapsed)))
+        print('Training time {}'.format(elapsed_str))
+
+
+    def val(self, ):
+        """Evaluate once on the validation loader and, when ``output_dir`` is set, save the bbox eval to eval.pth."""
+        self.eval()
+
+        base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset)
+        module = self.ema.module if self.ema else self.model
+        test_stats, coco_evaluator = evaluate(module, self.criterion, self.postprocessor,
+                self.val_dataloader, base_ds, self.device, self.output_dir)
+
+        if self.output_dir:
+            dist.save_on_master(coco_evaluator.coco_eval["bbox"].eval, self.output_dir / "eval.pth")
+
+        return
diff --git a/rtdetr_pytorch/src/solver/solver.py b/rtdetr_pytorch/src/solver/solver.py
new file mode 100644
index 0000000..55452f2
--- /dev/null
+++ b/rtdetr_pytorch/src/solver/solver.py
@@ -0,0 +1,182 @@
+"""by lyuwenyu
+"""
+
+import torch 
+import torch.nn as nn 
+
+from datetime import datetime
+from pathlib import Path 
+from typing import Dict
+
+from src.misc import dist
+from src.core import BaseConfig
+
+
+class BaseSolver(object):
+    def __init__(self, cfg: BaseConfig) -> None:
+        # Only the config is stored here; everything else is built in setup().
+        self.cfg = cfg
+
+    def setup(self, ):
+        '''Build model/criterion/postprocessor/scaler/EMA here (not in __init__) to avoid unneeded instantiation.
+        '''
+        cfg = self.cfg
+        device = cfg.device
+        self.device = device
+        self.last_epoch = cfg.last_epoch
+
+        self.model = dist.warp_model(cfg.model.to(device), cfg.find_unused_parameters, cfg.sync_bn)
+        self.criterion = cfg.criterion.to(device)
+        self.postprocessor = cfg.postprocessor
+
+        # NOTE (lvwenyu): load_tuning_state must run before the EMA instance is built
+        if self.cfg.tuning:
+            print(f'Tuning checkpoint from {self.cfg.tuning}')
+            self.load_tuning_state(self.cfg.tuning)
+
+        self.scaler = cfg.scaler
+        self.ema = cfg.ema.to(device) if cfg.ema is not None else None
+
+        self.output_dir = Path(cfg.output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+
+    def train(self, ):
+        self.setup()
+        self.optimizer = self.cfg.optimizer
+        self.lr_scheduler = self.cfg.lr_scheduler
+
+        # NOTE: resume only after optimizer/lr_scheduler are set so their states get restored
+        if self.cfg.resume:
+            print(f'Resume checkpoint from {self.cfg.resume}')
+            self.resume(self.cfg.resume)
+
+        self.train_dataloader = dist.warp_loader(self.cfg.train_dataloader, \
+            shuffle=self.cfg.train_dataloader.shuffle)
+        self.val_dataloader = dist.warp_loader(self.cfg.val_dataloader, \
+            shuffle=self.cfg.val_dataloader.shuffle)
+
+
+    def eval(self, ):
+        self.setup()
+        self.val_dataloader = dist.warp_loader(self.cfg.val_dataloader, \
+            shuffle=self.cfg.val_dataloader.shuffle)
+
+        if self.cfg.resume:
+            print(f'resume from {self.cfg.resume}')
+            self.resume(self.cfg.resume)
+
+
+    def state_dict(self, last_epoch):
+        '''Return a checkpoint dict; optimizer/lr_scheduler/ema/scaler are included only when set.
+        '''
+        state = {}
+        state['model'] = dist.de_parallel(self.model).state_dict()
+        state['date'] = datetime.now().isoformat()
+
+        # TODO: last_epoch is passed in by the caller rather than read from lr_scheduler
+        state['last_epoch'] = last_epoch
+
+        # optimizer / lr_scheduler are only assigned by train(); tolerate the eval() path
+        if getattr(self, 'optimizer', None) is not None:
+            state['optimizer'] = self.optimizer.state_dict()
+
+        if getattr(self, 'lr_scheduler', None) is not None:
+            state['lr_scheduler'] = self.lr_scheduler.state_dict()
+
+        if self.ema is not None:
+            state['ema'] = self.ema.state_dict()
+
+        if self.scaler is not None:
+            state['scaler'] = self.scaler.state_dict()
+
+        return state
+
+
+    def load_state_dict(self, state):
+        '''Restore every component that both exists on this solver and has an entry in state.
+        '''
+        # `is not None` instead of truthiness: last_epoch == 0 or a zero-length module must still load
+        if getattr(self, 'last_epoch', None) is not None and 'last_epoch' in state:
+            self.last_epoch = state['last_epoch']
+            print('Loading last_epoch')
+
+        if getattr(self, 'model', None) is not None and 'model' in state:
+            if dist.is_parallel(self.model):
+                self.model.module.load_state_dict(state['model'])
+            else:
+                self.model.load_state_dict(state['model'])
+            print('Loading model.state_dict')
+
+        if getattr(self, 'ema', None) is not None and 'ema' in state:
+            self.ema.load_state_dict(state['ema'])
+            print('Loading ema.state_dict')
+
+        if getattr(self, 'optimizer', None) is not None and 'optimizer' in state:
+            self.optimizer.load_state_dict(state['optimizer'])
+            print('Loading optimizer.state_dict')
+
+        if getattr(self, 'lr_scheduler', None) is not None and 'lr_scheduler' in state:
+            self.lr_scheduler.load_state_dict(state['lr_scheduler'])
+            print('Loading lr_scheduler.state_dict')
+
+        if getattr(self, 'scaler', None) is not None and 'scaler' in state:
+            self.scaler.load_state_dict(state['scaler'])
+            print('Loading scaler.state_dict')
+
+
+    def save(self, path):
+        '''Save the state (tagged with self.last_epoch) to path via dist.save_on_master.
+        '''
+        state = self.state_dict(self.last_epoch)
+        dist.save_on_master(state, path)
+
+
+    def resume(self, path):
+        '''Load a full checkpoint from path and restore it with load_state_dict.
+        '''
+        # map to CPU first so the checkpoint is not materialized on cuda:0
+        state = torch.load(path, map_location='cpu')
+        self.load_state_dict(state)
+
+    def load_tuning_state(self, path,):
+        """Load only the model weights for tuning, skipping missing / shape-mismatched keys.
+        """
+        if str(path).startswith(('http://', 'https://')):
+            state = torch.hub.load_state_dict_from_url(path, map_location='cpu')
+        else:
+            state = torch.load(path, map_location='cpu')
+
+        module = dist.de_parallel(self.model)
+
+        # TODO hard code: prefer the EMA weights stored under state['ema']['module'] when present
+        if 'ema' in state:
+            stat, infos = self._matched_state(module.state_dict(), state['ema']['module'])
+        else:
+            stat, infos = self._matched_state(module.state_dict(), state['model'])
+
+        module.load_state_dict(stat, strict=False)
+        print(f'Load model.state_dict, {infos}')
+
+    @staticmethod
+    def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tensor]):
+        missed_list = []      # keys of state that params does not contain
+        unmatched_list = []   # keys present in both but with different shapes
+        matched_state = {}    # params entries whose shape matches the entry in state
+        for k, v in state.items():
+            if k in params:
+                if v.shape == params[k].shape:
+                    matched_state[k] = params[k]
+                else:
+                    unmatched_list.append(k)
+            else:
+                missed_list.append(k)
+
+        return matched_state, {'missed': missed_list, 'unmatched': unmatched_list}
+
+
+    def fit(self, ):
+        raise NotImplementedError('')
+
+    def val(self, ):
+        raise NotImplementedError('')
diff --git a/rtdetr_pytorch/src/zoo/__init__.py b/rtdetr_pytorch/src/zoo/__init__.py
new file mode 100644
index 0000000..e6c56d9
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/__init__.py
@@ -0,0 +1,2 @@
+
+from .rtdetr import *
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/__init__.py b/rtdetr_pytorch/src/zoo/rtdetr/__init__.py
new file mode 100644
index 0000000..1b4583b
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/__init__.py
@@ -0,0 +1,12 @@
+"""by lyuwenyu
+"""
+
+
+from .rtdetr import *
+
+from .hybrid_encoder import *
+from .rtdetr_decoder import *
+from .rtdetr_postprocessor import *
+from .rtdetr_criterion import *
+
+from .matcher import *
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/box_ops.py b/rtdetr_pytorch/src/zoo/rtdetr/box_ops.py
new file mode 100644
index 0000000..5d65866
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/box_ops.py
@@ -0,0 +1,89 @@
+'''
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+https://github.com/facebookresearch/detr/blob/main/util/box_ops.py
+'''
+
+import torch
+from torchvision.ops.boxes import box_area
+
+
+def box_cxcywh_to_xyxy(x):
+    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) along the last dim."""
+    cx, cy, width, height = x.unbind(-1)
+    corners = (cx - 0.5 * width, cy - 0.5 * height, cx + 0.5 * width, cy + 0.5 * height)
+    return torch.stack(corners, dim=-1)
+
+
+def box_xyxy_to_cxcywh(x):
+    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) along the last dim."""
+    left, top, right, bottom = x.unbind(-1)
+    centre_and_size = ((left + right) / 2, (top + bottom) / 2, (right - left), (bottom - top))
+    return torch.stack(centre_and_size, dim=-1)
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1, boxes2):
+    """Return the pairwise (iou, union) matrices, each [N, M], for xyxy boxes [N, 4] and [M, 4]."""
+    areas_a, areas_b = box_area(boxes1), box_area(boxes2)
+
+    # corners of the pairwise intersection rectangles
+    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    # a negative extent means the pair does not overlap, so clamp it to zero
+    extent = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    inter = extent[:, :, 0] * extent[:, :, 1]  # [N,M]
+
+    union = areas_a[:, None] + areas_b - inter
+    return inter / union, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/
+
+    Both inputs must hold boxes in [x0, y0, x1, y1] format.
+
+    The result is the [N, M] pairwise GIoU matrix, with N = len(boxes1)
+    and M = len(boxes2).
+    """
+    # degenerate boxes (x1 < x0 or y1 < y0) give inf / nan results,
+    # so reject them up front
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    iou, union = box_iou(boxes1, boxes2)
+
+    # smallest axis-aligned box enclosing each pair
+    enclose_lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    enclose_rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+    enclose_wh = (enclose_rb - enclose_lt).clamp(min=0)  # [N,M,2]
+    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]
+
+    return iou - (enclose_area - union) / enclose_area
+
+
+def masks_to_boxes(masks):
+    """Compute the bounding boxes around the provided masks
+
+    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
+
+    Returns a [N, 4] tensor, with the boxes in xyxy format (min / max pixel indices of each mask)
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    h, w = masks.shape[-2:]
+    # build the coordinate grids on the masks' device so non-CPU masks do not hit a device mismatch
+    y = torch.arange(0, h, dtype=torch.float, device=masks.device)
+    x = torch.arange(0, w, dtype=torch.float, device=masks.device)
+    y, x = torch.meshgrid(y, x, indexing='ij')  # explicit 'ij' keeps the old default without the warning
+    # max: over mask * coord; min: after filling masked-out pixels with a large sentinel
+    x_mask = (masks * x.unsqueeze(0))
+    x_max = x_mask.flatten(1).max(-1)[0]
+    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    y_mask = (masks * y.unsqueeze(0))
+    y_max = y_mask.flatten(1).max(-1)[0]
+    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
\ No newline at end of file
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/denoising.py b/rtdetr_pytorch/src/zoo/rtdetr/denoising.py
new file mode 100644
index 0000000..6830752
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/denoising.py
@@ -0,0 +1,125 @@
+"""by lyuwenyu
+"""
+
+import torch 
+
+from .utils import inverse_sigmoid
+from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
+
+
+
+def get_contrastive_denoising_training_group(targets,
+                                             num_classes,
+                                             num_queries,
+                                             class_embed,
+                                             num_denoising=100,
+                                             label_noise_ratio=0.5,
+                                             box_noise_scale=1.0,):
+    """Build contrastive denoising (CDN) training queries from the GT targets; returns four Nones when disabled or no GT."""
+    if num_denoising <= 0:
+        return None, None, None, None
+
+    num_gts = [len(t['labels']) for t in targets]
+    device = targets[0]['labels'].device
+    # the largest per-image GT count decides the padded length and the number of groups
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_group = num_denoising // max_gt_num
+    num_group = 1 if num_group == 0 else num_group  # more GTs than num_denoising: still keep one group
+    # pad gt to max_num of a batch
+    bs = len(num_gts)
+    # class slots default to index num_classes (padding), boxes to zeros; pad_gt_mask marks real GTs
+    input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device)
+    input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device)
+    pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device)
+
+    for i in range(bs):
+        num_gt = num_gts[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets[i]['labels']
+            input_query_bbox[i, :num_gt] = targets[i]['boxes']
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_group])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
+    # positive and negative mask
+    negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device)
+    negative_gt_mask[:, max_gt_num:] = 1
+    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
+    dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(max_gt_num * 2 * num_group)
+
+    if label_noise_ratio > 0:
+        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
+        # draw a random replacement label for every slot; only masked real-GT slots use it
+        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
+
+    # if label_noise_ratio > 0:
+    #     input_query_class = input_query_class.flatten()
+    #     pad_gt_mask = pad_gt_mask.flatten()
+    #     # half of bbox prob
+    #     # mask = torch.rand(input_query_class.shape, device=device) < (label_noise_ratio * 0.5)
+    #     mask = torch.rand_like(input_query_class) < (label_noise_ratio * 0.5)
+    #     chosen_idx = torch.nonzero(mask * pad_gt_mask).squeeze(-1)
+    #     # randomly put a new one here
+    #     new_label = torch.randint_like(chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
+    #     # input_query_class.scatter_(dim=0, index=chosen_idx, value=new_label)
+    #     input_query_class[chosen_idx] = new_label
+    #     input_query_class = input_query_class.reshape(bs, num_denoising)
+    #     pad_gt_mask = pad_gt_mask.reshape(bs, num_denoising)
+
+    if box_noise_scale > 0:
+        known_bbox = box_cxcywh_to_xyxy(input_query_bbox)
+        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale  # half (w, h) per corner coord
+        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0  # -1 or +1
+        rand_part = torch.rand_like(input_query_bbox)  # uniform in [0, 1)
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)  # negatives: [1, 2); positives: [0, 1)
+        rand_part *= rand_sign
+        known_bbox += rand_part * diff
+        known_bbox.clip_(min=0.0, max=1.0)
+        input_query_bbox = box_xyxy_to_cxcywh(known_bbox)
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    # class_embed = torch.concat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=device)])
+    # input_query_class = torch.gather(
+    #     class_embed, input_query_class.flatten(),
+    #     axis=0).reshape(bs, num_denoising, -1)
+    # input_query_class = class_embed(input_query_class.flatten()).reshape(bs, num_denoising, -1)
+    input_query_class = class_embed(input_query_class)
+
+    tgt_size = num_denoising + num_queries
+    # attn_mask = torch.ones([tgt_size, tgt_size], device=device) < 0
+    attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device)
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    
+    # reconstruct cannot see each other
+    for i in range(num_group):
+        if i == 0:
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
+        if i == num_group - 1:  # NOTE(review): plain `if`, not `elif`: for i == 0 with num_group > 1 the else branch also runs; it only re-sets the same region
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True
+        else:
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True
+            attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True
+        
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+
+    # print(input_query_class.shape) # torch.Size([4, 196, 256])
+    # print(input_query_bbox.shape) # torch.Size([4, 196, 4])
+    # print(attn_mask.shape) # torch.Size([496, 496])
+    
+    return input_query_class, input_query_bbox, attn_mask, dn_meta
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py b/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py
new file mode 100644
index 0000000..804db69
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py
@@ -0,0 +1,322 @@
+'''by lyuwenyu
+'''
+
+import copy
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+from .utils import get_activation
+
+from src.core import register
+
+
+__all__ = ['HybridEncoder']
+
+
+
+class ConvNormLayer(nn.Module):
+    """Conv2d -> BatchNorm2d -> optional activation."""
+
+    def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
+        super().__init__()
+        # when no padding is given, derive it from the kernel size
+        if padding is None:
+            padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(ch_in, ch_out, kernel_size, stride, padding=padding, bias=bias)
+        self.norm = nn.BatchNorm2d(ch_out)
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        normed = self.norm(self.conv(x))
+        return self.act(normed)
+
+
+class RepVggBlock(nn.Module):
+    def __init__(self, ch_in, ch_out, act='relu'):
+        super().__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None)  # 3x3 conv + BN branch
+        self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None)  # 1x1 conv + BN branch
+        self.act = nn.Identity() if act is None else get_activation(act) 
+
+    def forward(self, x):
+        if hasattr(self, 'conv'):  # the fused conv only exists after convert_to_deploy()
+            y = self.conv(x)
+        else:
+            y = self.conv1(x) + self.conv2(x)
+
+        return self.act(y)
+
+    def convert_to_deploy(self):
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
+        # overwrite the single 3x3 conv with the kernel / bias folded from both conv+BN branches
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.conv.weight.data = kernel
+        self.conv.bias.data = bias 
+        # NOTE: conv1 / conv2 stay registered after fusion; their removal is disabled:
+        # self.__delattr__('conv1'); self.__delattr__('conv2')
+
+    def get_equivalent_kernel_bias(self):
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        # the 1x1 kernel is zero-padded to 3x3 so both branches can be summed into one kernel
+        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        if kernel1x1 is None:
+            return 0
+        else:
+            return F.pad(kernel1x1, [1, 1, 1, 1])  # one zero on each side of the last two dims
+
+    def _fuse_bn_tensor(self, branch: ConvNormLayer):
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.norm.running_mean
+        running_var = branch.norm.running_var
+        gamma = branch.norm.weight
+        beta = branch.norm.bias
+        eps = branch.norm.eps
+        std = (running_var + eps).sqrt()
+        t = (gamma / std).reshape(-1, 1, 1, 1)  # per-output-channel scale, broadcast over the kernel
+        return kernel * t, beta - running_mean * gamma / std  # uses BN running stats; branch.conv.bias is not read
+
+
+class CSPRepLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=3,  # number of stacked RepVggBlock modules
+                 expansion=1.0,  # hidden_channels = int(out_channels * expansion)
+                 bias=None,
+                 act="silu"):
+        super(CSPRepLayer, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act)  # feeds the bottleneck stack
+        self.conv2 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act)  # shortcut branch
+        self.bottlenecks = nn.Sequential(*[
+            RepVggBlock(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks)
+        ])
+        if hidden_channels != out_channels:  # project back only when expansion changed the width
+            self.conv3 = ConvNormLayer(hidden_channels, out_channels, 1, 1, bias=bias, act=act)
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+        return self.conv3(x_1 + x_2)  # the two branches are summed (not concatenated)
+
+
+# transformer
+class TransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 normalize_before=False):  # True: norm before each sub-layer; False: norm after the residual add
+        super().__init__()
+        self.normalize_before = normalize_before
+        # batch_first=True: self_attn is configured for batch-first inputs
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True)
+        # feed-forward sub-layer: linear1 -> activation -> dropout -> linear2
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.activation = get_activation(activation) 
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        q = k = self.with_pos_embed(src, pos_embed)  # pos_embed is added to queries / keys only
+        src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask)
+        # residual connection around self-attention
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+        # residual connection around the feed-forward sub-layer
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+class TransformerEncoder(nn.Module):
+    """Stack of num_layers deep copies of encoder_layer, followed by an optional norm."""
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)])
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+        hidden = src
+        for encoder_layer in self.layers:
+            hidden = encoder_layer(hidden, src_mask=src_mask, pos_embed=pos_embed)
+        if self.norm is None:
+            return hidden
+        return self.norm(hidden)
+
+
+@register
+class HybridEncoder(nn.Module):
+    def __init__(self,
+                 in_channels=[512, 1024, 2048],  # channel count of each input feature level
+                 feat_strides=[8, 16, 32],  # stride of each input level (used to size the eval pos-embed)
+                 hidden_dim=256,  # channel width after input_proj and of every output level
+                 nhead=8,
+                 dim_feedforward = 1024,
+                 dropout=0.0,
+                 enc_act='gelu',
+                 use_encoder_idx=[2],  # indices of the levels passed through the transformer encoder
+                 num_encoder_layers=1,  # 0 skips the transformer encoder in forward()
+                 pe_temperature=10000,
+                 expansion=1.0,
+                 depth_mult=1.0,  # each CSPRepLayer gets round(3 * depth_mult) blocks
+                 act='silu',
+                 eval_spatial_size=None):  # if set, eval-time position embeddings are precomputed
+        super().__init__()
+        self.in_channels = in_channels
+        self.feat_strides = feat_strides
+        self.hidden_dim = hidden_dim
+        self.use_encoder_idx = use_encoder_idx
+        self.num_encoder_layers = num_encoder_layers
+        self.pe_temperature = pe_temperature
+        self.eval_spatial_size = eval_spatial_size
+
+        self.out_channels = [hidden_dim for _ in range(len(in_channels))]
+        self.out_strides = feat_strides
+        
+        # channel projection
+        self.input_proj = nn.ModuleList()
+        for in_channel in in_channels:
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
+                    nn.BatchNorm2d(hidden_dim)
+                )
+            )
+
+        # encoder transformer
+        encoder_layer = TransformerEncoderLayer(
+            hidden_dim, 
+            nhead=nhead,
+            dim_feedforward=dim_feedforward, 
+            dropout=dropout,
+            activation=enc_act)
+        # one independent TransformerEncoder per entry of use_encoder_idx
+        self.encoder = nn.ModuleList([
+            TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx))
+        ])
+
+        # top-down fpn
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_blocks = nn.ModuleList()
+        for _ in range(len(in_channels) - 1, 0, -1):
+            self.lateral_convs.append(ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act))
+            self.fpn_blocks.append(
+                CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
+            )
+
+        # bottom-up pan
+        self.downsample_convs = nn.ModuleList()
+        self.pan_blocks = nn.ModuleList()
+        for _ in range(len(in_channels) - 1):
+            self.downsample_convs.append(
+                ConvNormLayer(hidden_dim, hidden_dim, 3, 2, act=act)
+            )
+            self.pan_blocks.append(
+                CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
+            )
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        if self.eval_spatial_size:
+            for idx in self.use_encoder_idx:
+                stride = self.feat_strides[idx]
+                pos_embed = self.build_2d_sincos_position_embedding(  # presumably eval_spatial_size is (h, w): [1] is passed as w
+                    self.eval_spatial_size[1] // stride, self.eval_spatial_size[0] // stride,
+                    self.hidden_dim, self.pe_temperature)
+                setattr(self, f'pos_embed{idx}', pos_embed)  # plain attribute, not a registered buffer
+                # self.register_buffer(f'pos_embed{idx}', pos_embed)
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.):
+        '''Return a [1, w*h, embed_dim] sin-cos position embedding (embed_dim must be divisible by 4).
+        '''
+        grid_w = torch.arange(int(w), dtype=torch.float32)
+        grid_h = torch.arange(int(h), dtype=torch.float32)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')  # NOTE(review): grids are [w, h] and flatten w-major, while forward() flattens features h-major — confirm for non-square inputs
+        assert embed_dim % 4 == 0, \
+            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        pos_dim = embed_dim // 4
+        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+        omega = 1. / (temperature ** omega)
+        # outer product of positions and frequencies: [w*h, pos_dim] each
+        out_w = grid_w.flatten()[..., None] @ omega[None]
+        out_h = grid_h.flatten()[..., None] @ omega[None]
+
+        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]
+
+    def forward(self, feats):
+        assert len(feats) == len(self.in_channels)
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        
+        # encoder
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.use_encoder_idx):
+                h, w = proj_feats[enc_ind].shape[2:]
+                # flatten [B, C, H, W] to [B, HxW, C]
+                src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1)
+                if self.training or self.eval_spatial_size is None:  # rebuild for the actual feature size
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device)
+                else:  # eval with a fixed size: reuse the embedding cached by _reset_parameters()
+                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device)
+
+                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
+                proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous()
+                # print([x.is_contiguous() for x in proj_feats ])
+
+        # broadcasting and fusion
+        inner_outs = [proj_feats[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_high = inner_outs[0]
+            feat_low = proj_feats[idx - 1]
+            feat_high = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_high)
+            inner_outs[0] = feat_high  # keep the lateral-conv output for the bottom-up pass
+            upsample_feat = F.interpolate(feat_high, scale_factor=2., mode='nearest')
+            inner_out = self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1))
+            inner_outs.insert(0, inner_out)  # prepend: inner_outs[0] is always the most recently fused level
+        # bottom-up path: downsample the previous output and fuse it with the next top-down level
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_high = inner_outs[idx + 1]
+            downsample_feat = self.downsample_convs[idx](feat_low)
+            out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_high], dim=1))
+            outs.append(out)
+
+        return outs  # one tensor per input level, same order as feats
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/matcher.py b/rtdetr_pytorch/src/zoo/rtdetr/matcher.py
new file mode 100644
index 0000000..cf9dec1
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/matcher.py
@@ -0,0 +1,108 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+Modules to compute the matching cost and solve the corresponding LSAP.
+
+by lyuwenyu
+"""
+
+import torch
+import torch.nn.functional as F 
+
+from scipy.optimize import linear_sum_assignment
+from torch import nn
+
+from .box_ops import box_cxcywh_to_xyxy, generalized_box_iou
+
+from src.core import register
+
+
+@register
+class HungarianMatcher(nn.Module):
+    """This class computes an assignment between the targets and the predictions of the network
+
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
+    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
+    while the others are un-matched (and thus treated as non-objects).
+    """
+
+    __share__ = ['use_focal_loss', ]
+
+    def __init__(self, weight_dict, use_focal_loss=False, alpha=0.25, gamma=2.0):
+        """Creates the matcher
+
+        Params:
+            weight_dict: relative weights of the matching-cost terms, read from the keys 'cost_class'
+                (classification error), 'cost_bbox' (L1 error of the box coordinates) and 'cost_giou' (giou term)
+            use_focal_loss, alpha, gamma: when use_focal_loss is set, the class cost is the focal-style cost below
+        """
+        super().__init__()
+        self.cost_class = weight_dict['cost_class']
+        self.cost_bbox = weight_dict['cost_bbox']
+        self.cost_giou = weight_dict['cost_giou']
+
+        self.use_focal_loss = use_focal_loss
+        self.alpha = alpha
+        self.gamma = gamma
+
+        assert self.cost_class != 0 or self.cost_bbox != 0 or self.cost_giou != 0, "all costs cant be 0"
+
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """ Performs the matching
+
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
+
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        bs, num_queries = outputs["pred_logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        if self.use_focal_loss:
+            out_prob = F.sigmoid(outputs["pred_logits"].flatten(0, 1))
+        else:
+            out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
+
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        tgt_ids = torch.cat([v["labels"] for v in targets])
+        tgt_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
+        # but approximate it in 1 - proba[target class].
+        # The 1 is a constant that doesn't change the matching, it can be omitted.
+        if self.use_focal_loss:
+            out_prob = out_prob[:, tgt_ids]
+            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(1 - out_prob + 1e-8).log())
+            pos_cost_class = self.alpha * ((1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
+            cost_class = pos_cost_class - neg_cost_class  # focal-style cost: positive term minus negative term
+        else:
+            cost_class = -out_prob[:, tgt_ids]
+
+        # Compute the L1 cost between boxes
+        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
+
+        # Compute the giou cost between boxes
+        cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
+        
+        # Final cost matrix
+        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
+        C = C.view(bs, num_queries, -1).cpu()  # moved to CPU for scipy's linear_sum_assignment
+        # split the target axis per image and solve one assignment problem per batch element
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
+
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py
new file mode 100644
index 0000000..851d4f7
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py
@@ -0,0 +1,44 @@
+"""by lyuwenyu
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+import random 
+import numpy as np 
+
+from src.core import register
+
+
+__all__ = ['RTDETR', ]
+
+
+@register
+class RTDETR(nn.Module):
+    """Detector wrapper that chains a backbone, an encoder and a decoder.
+
+    NOTE(review): `__inject__` is presumably consumed by the project's
+    `register` machinery to build the three sub-modules -- confirm in src.core.
+    """
+    __inject__ = ['backbone', 'encoder', 'decoder', ]
+
+    def __init__(self, backbone: nn.Module, encoder, decoder, multi_scale=None):
+        """Store the sub-modules.
+
+        Args:
+            backbone: module applied first to the input.
+            encoder: module applied to the backbone output.
+            decoder: module called as ``decoder(encoder_output, targets)``.
+            multi_scale: optional collection of side lengths; when truthy, one
+                is drawn at random on every training forward pass.
+        """
+        super().__init__()
+        # Assignment order (backbone, decoder, encoder) is kept on purpose: it
+        # determines the ordering of registered sub-modules and parameters.
+        self.backbone = backbone
+        self.decoder = decoder
+        self.encoder = encoder
+        self.multi_scale = multi_scale
+
+    def forward(self, x, targets=None):
+        """Run backbone -> encoder -> decoder and return the decoder output."""
+        resize_input = self.multi_scale and self.training
+        if resize_input:
+            # Square resize to a randomly chosen side length (training only).
+            side = np.random.choice(self.multi_scale)
+            x = F.interpolate(x, size=[side, side])
+
+        features = self.backbone(x)
+        encoded = self.encoder(features)
+        return self.decoder(encoded, targets)
+
+    def deploy(self, ):
+        """Switch to eval mode and call `convert_to_deploy` on every sub-module that has it."""
+        self.eval()
+        for module in self.modules():
+            if not hasattr(module, 'convert_to_deploy'):
+                continue
+            module.convert_to_deploy()
+        return self
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py
new file mode 100644
index 0000000..3ce77c0
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py
@@ -0,0 +1,341 @@
+"""
+reference: 
+https://github.com/facebookresearch/detr/blob/main/models/detr.py
+
+by lyuwenyu
+"""
+
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+import torchvision
+
+# from torchvision.ops import box_convert, generalized_box_iou
+from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou
+
+from src.misc.dist import get_world_size, is_dist_available_and_initialized
+from src.core import register
+
+
+
+@register
+class SetCriterion(nn.Module):
+    """ This class computes the loss for DETR.
+    The process happens in two steps:
+        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
+        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
+
+    Besides the final-layer losses, `forward` repeats the supervision for the optional
+    'aux_outputs' (re-matched for every entry) and 'dn_aux_outputs' (matched through 'dn_meta').
+    """
+    # NOTE(review): presumably read by the project's `register` machinery to share / inject
+    # configuration values -- confirm in src.core.
+    __share__ = ['num_classes', ]
+    __inject__ = ['matcher', ]
+
+    def __init__(self, matcher, weight_dict, losses, alpha=0.2, gamma=2.0, eos_coef=1e-4, num_classes=80):
+        """ Create the criterion.
+        Parameters:
+            matcher: module able to compute a matching between targets and proposals
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+                         Loss entries whose key is missing from this dict are dropped in `forward`.
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+            alpha: weighting factor used by the focal and varifocal classification losses
+            gamma: exponent used by the focal and varifocal classification losses
+            eos_coef: relative classification weight applied to the no-object category
+            num_classes: number of object categories, omitting the special no-object category
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.weight_dict = weight_dict
+        self.losses = losses
+
+        # Per-class weights for the cross-entropy loss; the extra last entry is the no-object class.
+        empty_weight = torch.ones(self.num_classes + 1)
+        empty_weight[-1] = eos_coef
+        self.register_buffer('empty_weight', empty_weight)
+
+        self.alpha = alpha
+        self.gamma = gamma
+
+    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
+        """Classification loss (NLL)
+        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
+
+        Unmatched queries are assigned the no-object index `self.num_classes`.
+        When `log` is true, a 'class_error' entry (100 - top-1 accuracy on matched queries) is added.
+        """
+        assert 'pred_logits' in outputs
+        src_logits = outputs['pred_logits']
+
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+
+        # cross_entropy expects the class dimension at position 1.
+        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
+        losses = {'loss_ce': loss_ce}
+
+        if log:
+            # TODO this should probably be a separate loss, not hacked in this one here
+            losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
+        return losses
+
+    def loss_labels_bce(self, outputs, targets, indices, num_boxes, log=True):
+        """Binary cross-entropy classification loss on one-hot targets.
+
+        The one-hot target drops the trailing no-object column, so unmatched queries get an
+        all-zero target. The element-wise loss is summed and divided by `num_boxes`.
+        """
+        src_logits = outputs['pred_logits']
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+        # `target * 1.` turns the int64 one-hot into the float target BCE needs.
+        loss = F.binary_cross_entropy_with_logits(src_logits, target * 1., reduction='none')
+        # mean over queries times the number of queries == sum over queries
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {'loss_bce': loss}
+
+    def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True):
+        """Sigmoid focal classification loss on one-hot targets (no-object column dropped).
+
+        Uses `self.alpha` / `self.gamma`; the element-wise loss is summed and divided by `num_boxes`.
+        """
+        assert 'pred_logits' in outputs
+        src_logits = outputs['pred_logits']
+
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+
+        target = F.one_hot(target_classes, num_classes=self.num_classes+1)[..., :-1]
+        # F.one_hot returns int64, while the BCE-with-logits call inside sigmoid_focal_loss
+        # expects a floating-point target, so convert it first (as loss_labels_bce does).
+        target = target.to(src_logits.dtype)
+        loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none')
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+
+        return {'loss_focal': loss}
+
+    def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True):
+        """Varifocal-style classification loss.
+
+        The target score of a matched query is the (detached) IoU between its predicted box and
+        its ground-truth box, placed at the ground-truth class; every other entry is zero.
+        Negative entries are weighted by alpha * sigmoid(logit)^gamma, positives by their target score.
+        """
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+
+        src_boxes = outputs['pred_boxes'][idx]
+        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+        ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes))
+        # Only the diagonal (matched prediction / target pairs) is needed; no gradient through it.
+        ious = torch.diag(ious).detach()
+
+        src_logits = outputs['pred_logits']
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+
+        target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype)
+        target_score_o[idx] = ious.to(target_score_o.dtype)
+        target_score = target_score_o.unsqueeze(-1) * target
+
+        pred_score = F.sigmoid(src_logits).detach()
+        weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score
+
+        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none')
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {'loss_vfl': loss}
+
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
+        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
+        """
+        pred_logits = outputs['pred_logits']
+        device = pred_logits.device
+        tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        # NOTE(review): this treats the last logit as no-object; confirm that holds for heads
+        # that emit exactly `num_classes` logits.
+        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
+        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
+        losses = {'cardinality_error': card_err}
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
+           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
+           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+        src_boxes = outputs['pred_boxes'][idx]
+        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        losses = {}
+
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
+        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
+
+        # The diagonal of the pairwise GIoU matrix holds the matched pairs.
+        loss_giou = 1 - torch.diag(generalized_box_iou(
+                box_cxcywh_to_xyxy(src_boxes),
+                box_cxcywh_to_xyxy(target_boxes)))
+        losses['loss_giou'] = loss_giou.sum() / num_boxes
+        return losses
+
+    def loss_masks(self, outputs, targets, indices, num_boxes):
+        """Compute the losses related to the masks: the focal loss and the dice loss.
+           targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
+
+        NOTE(review): `nested_tensor_from_tensor_list`, `interpolate`, `sigmoid_focal_loss` and
+        `dice_loss` are not among the imports visible for this file -- confirm they are provided
+        before enabling the 'masks' loss.
+        """
+        assert "pred_masks" in outputs
+
+        src_idx = self._get_src_permutation_idx(indices)
+        tgt_idx = self._get_tgt_permutation_idx(indices)
+        src_masks = outputs["pred_masks"]
+        src_masks = src_masks[src_idx]
+        masks = [t["masks"] for t in targets]
+        # TODO use valid to mask invalid areas due to padding in loss
+        target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
+        target_masks = target_masks.to(src_masks)
+        target_masks = target_masks[tgt_idx]
+
+        # upsample predictions to the target size
+        src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:],
+                                mode="bilinear", align_corners=False)
+        src_masks = src_masks[:, 0].flatten(1)
+
+        target_masks = target_masks.flatten(1)
+        target_masks = target_masks.view(src_masks.shape)
+        losses = {
+            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
+            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
+        }
+        return losses
+
+    def _get_src_permutation_idx(self, indices):
+        """Return (batch_idx, src_idx) index tensors selecting all matched predictions."""
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+
+    def _get_tgt_permutation_idx(self, indices):
+        """Return (batch_idx, tgt_idx) index tensors selecting all matched targets."""
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        """Dispatch to the loss function registered under the name `loss`.
+
+        Raises an AssertionError for an unknown loss name.
+        """
+        loss_map = {
+            'labels': self.loss_labels,
+            'cardinality': self.loss_cardinality,
+            'boxes': self.loss_boxes,
+            'masks': self.loss_masks,
+
+            'bce': self.loss_labels_bce,
+            'focal': self.loss_labels_focal,
+            'vfl': self.loss_labels_vfl,
+        }
+        assert loss in loss_map, f'do you really want to compute {loss} loss?'
+        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
+
+    def forward(self, outputs, targets):
+        """ This performs the loss computation.
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+        Returns:
+             dict mapping loss names to values already multiplied by their `weight_dict` weight.
+             Entries for 'aux_outputs' get the suffix '_aux_{i}', those for 'dn_aux_outputs' '_dn_{i}'.
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        if is_dist_available_and_initialized():
+            torch.distributed.all_reduce(num_boxes)
+        # Clamp to at least 1 so the losses never divide by zero on batches without boxes.
+        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
+            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+            losses.update(l_dict)
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if 'aux_outputs' in outputs:
+            for i, aux_outputs in enumerate(outputs['aux_outputs']):
+                indices = self.matcher(aux_outputs, targets)
+                for loss in self.losses:
+                    if loss == 'masks':
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    kwargs = {}
+                    if loss == 'labels':
+                        # Logging is enabled only for the last layer
+                        kwargs = {'log': False}
+
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        # In case of cdn auxiliary losses. For rtdetr
+        if 'dn_aux_outputs' in outputs:
+            assert 'dn_meta' in outputs, ''
+            # Denoising queries are matched through dn_meta instead of the matcher, and the
+            # normaliser is scaled by the number of denoising groups.
+            indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets)
+            num_boxes = num_boxes * outputs['dn_meta']['dn_num_group']
+
+            for i, aux_outputs in enumerate(outputs['dn_aux_outputs']):
+                # indices = self.matcher(aux_outputs, targets)
+                for loss in self.losses:
+                    if loss == 'masks':
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    kwargs = {}
+                    if loss == 'labels':
+                        # Logging is enabled only for the last layer
+                        kwargs = {'log': False}
+
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        return losses
+
+    @staticmethod
+    def get_cdn_matched_indices(dn_meta, targets):
+        '''get_cdn_matched_indices
+
+        Build, for every image, the (query index, target index) pair used for the denoising
+        queries: `dn_meta["dn_positive_idx"][i]` paired with the target indices 0..num_gt-1
+        repeated `dn_meta["dn_num_group"]` times. Images without targets get two empty tensors.
+        '''
+        dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+        num_gts = [len(t['labels']) for t in targets]
+        device = targets[0]['labels'].device
+
+        dn_match_indices = []
+        for i, num_gt in enumerate(num_gts):
+            if num_gt > 0:
+                gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device)
+                gt_idx = gt_idx.tile(dn_num_group)
+                assert len(dn_positive_idx[i]) == len(gt_idx)
+                dn_match_indices.append((dn_positive_idx[i], gt_idx))
+            else:
+                dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device), \
+                    torch.zeros(0, dtype=torch.int64,  device=device)))
+
+        return dn_match_indices
+
+
+
+
+
+@torch.no_grad()
+def accuracy(output, target, topk=(1,)):
+    """Computes the precision@k for the specified values of k
+
+    Args:
+        output: [N, num_classes] scores.
+        target: [N] ground-truth class indices.
+        topk: the values of k to evaluate.
+
+    Returns:
+        A list with one 0-dim tensor per entry of `topk`, each the percentage (0-100) of rows
+        whose target is among the k highest-scoring classes. All zeros when `target` is empty.
+    """
+    if target.numel() == 0:
+        # One value per requested k, so callers can always unpack len(topk) results.
+        return [torch.zeros([], device=output.device) for _ in topk]
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        # reshape instead of view: `correct` follows the transposed (non-contiguous) layout
+        # of `pred`, so flattening its first k rows with .view(-1) fails for k > 1.
+        correct_k = correct[:k].reshape(-1).float().sum(0)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+
+
+
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py
new file mode 100644
index 0000000..a611474
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py
@@ -0,0 +1,574 @@
+"""by lyuwenyu
+"""
+
+import math 
+import copy 
+from collections import OrderedDict
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+import torch.nn.init as init 
+
+from .denoising import get_contrastive_denoising_training_group
+from .utils import deformable_attention_core_func, get_activation, inverse_sigmoid
+from .utils import bias_init_with_prob
+
+
+from src.core import register
+
+
+__all__ = ['RTDETRTransformer']
+
+
+
+class MLP(nn.Module):
+    """Stack of `num_layers` linear layers with an activation between consecutive layers.
+
+    The activation is not applied after the last layer. `act=None` selects the identity.
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'):
+        super().__init__()
+        self.num_layers = num_layers
+        # Layer widths: input -> hidden x (num_layers - 1) -> output.
+        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(
+            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
+        )
+        self.act = nn.Identity() if act is None else get_activation(act)
+
+    def forward(self, x):
+        """Apply every layer in order, activating all outputs except the last one."""
+        last = self.num_layers - 1
+        for depth, layer in enumerate(self.layers):
+            x = layer(x)
+            if depth < last:
+                x = self.act(x)
+        return x
+
+
+
+class MSDeformableAttention(nn.Module):
+    def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4,):
+        """
+        Multi-Scale Deformable Attention Module
+
+        Args:
+            embed_dim: channel size of queries and values; must be divisible by `num_heads`.
+            num_heads: number of attention heads.
+            num_levels: number of feature levels that are sampled.
+            num_points: number of sampling points per head and level.
+        """
+        super(MSDeformableAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_levels = num_levels
+        self.num_points = num_points
+        self.total_points = num_heads * num_levels * num_points
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        # Every sampling point gets a 2-d offset and one attention weight, predicted from the query.
+        self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2,)
+        self.attention_weights = nn.Linear(embed_dim, self.total_points)
+        self.value_proj = nn.Linear(embed_dim, embed_dim)
+        self.output_proj = nn.Linear(embed_dim, embed_dim)
+
+        self.ms_deformable_attn_core = deformable_attention_core_func
+
+        self._reset_parameters()
+
+
+    def _reset_parameters(self):
+        """Initialise the projections; offsets start as a fixed per-head pattern (zero weight, set bias)."""
+        # sampling_offsets
+        init.constant_(self.sampling_offsets.weight, 0)
+        # One direction per head, evenly spaced over the circle ...
+        thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        # ... normalised so that the larger component has magnitude 1 ...
+        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values
+        grid_init = grid_init.reshape(self.num_heads, 1, 1, 2).tile([1, self.num_levels, self.num_points, 1])
+        # ... and scaled by 1..num_points so the points of a head lie at increasing distance.
+        scaling = torch.arange(1, self.num_points + 1, dtype=torch.float32).reshape(1, 1, -1, 1)
+        grid_init *= scaling
+        self.sampling_offsets.bias.data[...] = grid_init.flatten()
+
+        # attention_weights
+        init.constant_(self.attention_weights.weight, 0)
+        init.constant_(self.attention_weights.bias, 0)
+
+        # proj
+        init.xavier_uniform_(self.value_proj.weight)
+        init.constant_(self.value_proj.bias, 0)
+        init.xavier_uniform_(self.output_proj.weight)
+        init.constant_(self.output_proj.bias, 0)
+
+
+    def forward(self,
+                query,
+                reference_points,
+                value,
+                value_spatial_shapes,
+                value_mask=None):
+        """
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area. A last dimension of 4 is also accepted;
+                its last two components then scale the sampling offsets.
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+
+        Raises:
+            ValueError: if the last dimension of `reference_points` is neither 2 nor 4.
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            # torch tensors have no `.astype`; `.to` performs the dtype conversion.
+            value_mask = value_mask.to(value.dtype).unsqueeze(-1)
+            # Out-of-place multiply zeroes the padded positions without mutating the projection output.
+            value = value * value_mask
+        value = value.reshape(bs, Len_v, self.num_heads, self.head_dim)
+
+        sampling_offsets = self.sampling_offsets(query).reshape(
+            bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2)
+        attention_weights = self.attention_weights(query).reshape(
+            bs, Len_q, self.num_heads, self.num_levels * self.num_points)
+        # Softmax jointly over all levels and points of a head, then split them again.
+        attention_weights = F.softmax(attention_weights, dim=-1).reshape(
+            bs, Len_q, self.num_heads, self.num_levels, self.num_points)
+
+        if reference_points.shape[-1] == 2:
+            # Build the normaliser on the query's device; a CPU tensor here cannot be
+            # combined with offsets that live on an accelerator.
+            offset_normalizer = torch.as_tensor(value_spatial_shapes, device=query.device)
+            # (H, W) -> (W, H) so it lines up with the (x, y) order of the offsets.
+            offset_normalizer = offset_normalizer.flip([1]).reshape(
+                1, 1, 1, self.num_levels, 1, 2)
+            sampling_locations = reference_points.reshape(
+                bs, Len_q, 1, self.num_levels, 1, 2
+            ) + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            # Offsets are relative to the reference box: centre + offset / num_points * size * 0.5
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2] + sampling_offsets /
+                self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5)
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".
+                format(reference_points.shape[-1]))
+
+        output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights)
+
+        output = self.output_proj(output)
+
+        return output
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Decoder layer: query self-attention, deformable cross-attention, then a feed-forward
+    network, each followed by a residual connection and a LayerNorm."""
+
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,):
+        super().__init__()
+
+        # --- self attention over the queries ---
+        self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # --- deformable cross attention into the memory ---
+        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # --- feed-forward network ---
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        # `activation` names a function of torch.nn.functional
+        self.activation = getattr(F, activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+
+    def with_pos_embed(self, tensor, pos):
+        """Add the positional embedding when one is given, otherwise return `tensor` unchanged."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward_ffn(self, tgt):
+        """linear1 -> activation -> dropout3 -> linear2."""
+        hidden = self.activation(self.linear1(tgt))
+        hidden = self.dropout3(hidden)
+        return self.linear2(hidden)
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                attn_mask=None,
+                memory_mask=None,
+                query_pos_embed=None):
+        """Update the queries `tgt`; `memory_level_start_index` is accepted but not used here."""
+        # self attention: positional embedding goes into query and key, not into the value
+        qk = self.with_pos_embed(tgt, query_pos_embed)
+        self_attn_out, _ = self.self_attn(qk, qk, value=tgt, attn_mask=attn_mask)
+        tgt = self.norm1(tgt + self.dropout1(self_attn_out))
+
+        # cross attention
+        cross_query = self.with_pos_embed(tgt, query_pos_embed)
+        cross_attn_out = self.cross_attn(
+            cross_query,
+            reference_points,
+            memory,
+            memory_spatial_shapes,
+            memory_mask)
+        tgt = self.norm2(tgt + self.dropout2(cross_attn_out))
+
+        # ffn
+        ffn_out = self.forward_ffn(tgt)
+        tgt = tgt + self.dropout4(ffn_out)
+        # 65504 is the largest finite float16 value; the clamp presumably protects the final
+        # norm from overflow under half precision -- TODO confirm.
+        return self.norm3(tgt.clamp(min=-65504, max=65504))
+
+
+class TransformerDecoder(nn.Module):
+    """Stack of decoder layers that refines the reference boxes after every layer."""
+
+    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+        """
+        Args:
+            hidden_dim: stored on the instance; not otherwise used in this class.
+            decoder_layer: prototype layer; `num_layers` independent deep copies are created.
+            num_layers: number of decoder layers.
+            eval_idx: index of the layer whose output is returned in eval mode; a negative
+                value counts from the end.
+        """
+        super(TransformerDecoder, self).__init__()
+        self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)])
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
+
+    def forward(self,
+                tgt,
+                ref_points_unact,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                bbox_head,
+                score_head,
+                query_pos_head,
+                attn_mask=None,
+                memory_mask=None):
+        """Run the layers and collect box / class predictions.
+
+        `bbox_head` and `score_head` are indexed by layer; `query_pos_head` maps the current
+        reference boxes to the query positional embedding. In training mode the predictions of
+        every layer are collected; in eval mode only those of layer `self.eval_idx`, after
+        which the loop stops.
+
+        Returns:
+            (boxes, logits): the collected predictions stacked along a new leading dimension.
+        """
+        output = tgt
+        dec_out_bboxes = []
+        dec_out_logits = []
+        ref_points_detach = F.sigmoid(ref_points_unact)
+
+        for i, layer in enumerate(self.layers):
+            ref_points_input = ref_points_detach.unsqueeze(2)
+            query_pos_embed = query_pos_head(ref_points_detach)
+
+            output = layer(output, ref_points_input, memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           attn_mask, memory_mask, query_pos_embed)
+
+            # The box delta is needed for both expressions below, so evaluate the head once.
+            bbox_delta = bbox_head[i](output)
+            inter_ref_bbox = F.sigmoid(bbox_delta + inverse_sigmoid(ref_points_detach))
+
+            if self.training:
+                dec_out_logits.append(score_head[i](output))
+                if i == 0:
+                    dec_out_bboxes.append(inter_ref_bbox)
+                else:
+                    # Supervised boxes are built on the non-detached reference of the previous
+                    # layer, so gradients also reach the previous layer's box head.
+                    dec_out_bboxes.append(F.sigmoid(bbox_delta + inverse_sigmoid(ref_points)))
+
+            elif i == self.eval_idx:
+                dec_out_logits.append(score_head[i](output))
+                dec_out_bboxes.append(inter_ref_bbox)
+                break
+
+            ref_points = inter_ref_bbox
+            ref_points_detach = inter_ref_bbox.detach(
+            ) if self.training else inter_ref_bbox
+
+        return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits)
+
+
+@register
+class RTDETRTransformer(nn.Module):
+    __share__ = ['num_classes']
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 num_queries=300,
+                 position_embed_type='sine',
+                 feat_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 num_levels=3,
+                 num_decoder_points=4,
+                 nhead=8,
+                 num_decoder_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 num_denoising=100,
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learnt_init_query=False,
+                 eval_spatial_size=None,
+                 eval_idx=-1,
+                 eps=1e-2, 
+                 aux_loss=True):
+        """Build the input projections, the decoder stack and the prediction heads.
+
+        Args:
+            num_classes: output size of every score head.
+            hidden_dim: channel size used throughout the transformer.
+            num_queries: stored; also the size of the learnt query embedding when enabled.
+            position_embed_type: must be 'sine' or 'learned' (only validated here).
+            feat_channels: channels of the incoming feature maps, one entry per provided level.
+            feat_strides: strides of the incoming feature maps; when `num_levels` is larger,
+                missing strides are derived by doubling the last one (on a copy).
+            num_levels: number of feature levels used by the decoder.
+            num_decoder_points: sampling points per head and level in the cross attention.
+            nhead, num_decoder_layers, dim_feedforward, dropout, activation: decoder layer settings.
+            num_denoising: when > 0, a class embedding for denoising queries is created.
+            label_noise_ratio, box_noise_scale: stored denoising settings.
+            learnt_init_query: when true, a learnt embedding for the initial queries is created.
+            eval_spatial_size: when truthy, anchors and their valid mask are generated up front.
+            eval_idx: forwarded to the decoder.
+            eps, aux_loss: stored on the instance.
+        """
+
+        super(RTDETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(feat_channels) <= num_levels
+        assert len(feat_strides) == len(feat_channels)
+        # Extend a private copy: appending to the argument itself would grow the shared default
+        # list (and the caller's list), making the length assertion above fail on a later
+        # instantiation.
+        feat_strides = list(feat_strides)
+        for _ in range(num_levels - len(feat_strides)):
+            feat_strides.append(feat_strides[-1] * 2)
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.feat_strides = feat_strides
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_decoder_layers = num_decoder_layers
+        self.eval_spatial_size = eval_spatial_size
+        self.aux_loss = aux_loss
+
+        # backbone feature projection
+        self._build_input_proj_layer(feat_channels)
+
+        # Transformer module
+        decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points)
+        self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx)
+
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+        # denoising part
+        if num_denoising > 0: 
+            # self.denoising_class_embed = nn.Embedding(num_classes, hidden_dim, padding_idx=num_classes-1) # TODO for load paddle weights
+            # One extra row at index `num_classes` serves as the padding entry.
+            self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes)
+
+        # decoder embedding
+        self.learnt_init_query = learnt_init_query
+        if learnt_init_query:
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        # Maps 4-d reference boxes to the query positional embedding.
+        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
+
+        # encoder head
+        self.enc_output = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(hidden_dim,)
+        )
+        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+
+        # decoder head: one score head and one box head per decoder layer
+        self.dec_score_head = nn.ModuleList([
+            nn.Linear(hidden_dim, num_classes)
+            for _ in range(num_decoder_layers)
+        ])
+        self.dec_bbox_head = nn.ModuleList([
+            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+            for _ in range(num_decoder_layers)
+        ])
+
+        # init encoder output anchors and valid_mask
+        if self.eval_spatial_size:
+            self.anchors, self.valid_mask = self._generate_anchors()
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Initialise the prediction heads and the query-related projections.
+
+        Score-head biases are set to the value returned by `bias_init_with_prob(0.01)`, the
+        last layer of every box head is zeroed, and the remaining weights use Xavier-uniform.
+        """
+        prior_bias = bias_init_with_prob(0.01)
+
+        # Encoder score head: bias only (it has no paired loop entry below).
+        init.constant_(self.enc_score_head.bias, prior_bias)
+        final_enc_box = self.enc_bbox_head.layers[-1]
+        init.constant_(final_enc_box.weight, 0)
+        init.constant_(final_enc_box.bias, 0)
+
+        # Decoder heads, layer by layer.
+        for score_head, box_head in zip(self.dec_score_head, self.dec_bbox_head):
+            init.constant_(score_head.bias, prior_bias)
+            final_box = box_head.layers[-1]
+            init.constant_(final_box.weight, 0)
+            init.constant_(final_box.bias, 0)
+
+        # Xavier initialisation; the call order is kept so the random stream is consumed identically.
+        init.xavier_uniform_(self.enc_output[0].weight)
+        if self.learnt_init_query:
+            init.xavier_uniform_(self.tgt_embed.weight)
+        for layer_idx in (0, 1):
+            init.xavier_uniform_(self.query_pos_head.layers[layer_idx].weight)
+
+
+    def _build_input_proj_layer(self, feat_channels):
+        """Populate ``self.input_proj`` with one conv+norm projection per level."""
+        def conv_norm(conv):
+            # every projection is a Sequential with named 'conv' and 'norm' sub-modules
+            return nn.Sequential(OrderedDict([('conv', conv), ('norm', nn.BatchNorm2d(self.hidden_dim))]))
+
+        self.input_proj = nn.ModuleList()
+
+        # 1x1 projections for the feature maps handed in by the caller
+        for channels in feat_channels:
+            self.input_proj.append(conv_norm(nn.Conv2d(channels, self.hidden_dim, 1, bias=False)))
+
+        # extra levels use 3x3 stride-2 convs; the first one takes the channel
+        # count of the last input level, later ones take hidden_dim
+        prev_channels = feat_channels[-1]
+        extra_levels = self.num_levels - len(feat_channels)
+        for _ in range(extra_levels):
+            stride2_conv = nn.Conv2d(prev_channels, self.hidden_dim, 3, 2, padding=1, bias=False)
+            self.input_proj.append(conv_norm(stride2_conv))
+            prev_channels = self.hidden_dim
+
+    def _get_encoder_input(self, feats):
+        """Project every feature level and flatten all levels into one token sequence.
+
+        Returns a tuple of the concatenated ``[b, sum(h*w), c]`` features, the
+        ``[h, w]`` pair of each level and the start offset of each level.
+        """
+        proj_feats = []
+        for idx, feat in enumerate(feats):
+            proj_feats.append(self.input_proj[idx](feat))
+
+        # levels beyond the given features: the first extra level reads the last
+        # raw feature, every later one reads the previously projected level
+        num_given = len(proj_feats)
+        for idx in range(num_given, self.num_levels):
+            source = feats[-1] if idx == num_given else proj_feats[-1]
+            proj_feats.append(self.input_proj[idx](source))
+
+        tokens, spatial_shapes, level_start_index = [], [], []
+        offset = 0
+        for feat in proj_feats:
+            _, _, h, w = feat.shape
+            tokens.append(feat.flatten(2).permute(0, 2, 1))  # [b, c, h, w] -> [b, h*w, c]
+            spatial_shapes.append([h, w])
+            level_start_index.append(offset)  # where this level begins in the sequence
+            offset += h * w
+
+        feat_flatten = torch.cat(tokens, 1)  # [b, l, c]
+        return (feat_flatten, spatial_shapes, level_start_index)
+
+    def _generate_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=torch.float32, device='cpu'):
+        """Build one logit-space anchor box per feature-map cell.
+
+        Returns ``(anchors, valid_mask)``: anchors are ``[1, sum(h*w), 4]`` boxes
+        (cell-centre x/y plus a level-dependent w/h) mapped through log(a / (1 - a));
+        rows with any value outside ``(eps, 1 - eps)`` are marked invalid in the
+        mask and set to ``inf``.
+        """
+        if spatial_shapes is None:
+            # fall back to the shapes implied by eval_spatial_size and the strides
+            spatial_shapes = [[int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)]
+                              for s in self.feat_strides]
+
+        per_level = []
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            ys = torch.arange(end=h, dtype=dtype)
+            xs = torch.arange(end=w, dtype=dtype)
+            grid_y, grid_x = torch.meshgrid(ys, xs, indexing='ij')
+            centres = torch.stack([grid_x, grid_y], -1).unsqueeze(0) + 0.5
+            centres = centres / torch.tensor([w, h]).to(dtype)  # divide x by w and y by h
+            sizes = torch.ones_like(centres) * grid_size * (2.0 ** lvl)
+            per_level.append(torch.concat([centres, sizes], -1).reshape(-1, h * w, 4))
+
+        anchors = torch.concat(per_level, 1).to(device)
+        inside = (anchors > self.eps) * (anchors < 1 - self.eps)
+        valid_mask = inside.all(-1, keepdim=True)
+        anchors = torch.where(valid_mask, torch.log(anchors / (1 - anchors)), torch.inf)
+        return anchors, valid_mask
+
+
+    def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None):
+        """Select the top-scoring encoder tokens as the initial decoder queries.
+
+        Returns ``(target, reference_points_unact, enc_topk_bboxes, enc_topk_logits)``.
+        The denoising class / box tensors, when given, are prepended along dim 1 of
+        the first two; the returned reference points are detached.
+        """
+        bs, _, _ = memory.shape
+
+        # cached anchors are only used outside training and with a fixed eval size
+        use_cached = not self.training and self.eval_spatial_size is not None
+        if use_cached:
+            anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device)
+        else:
+            anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device)
+
+        # multiplication instead of torch.where(valid_mask, memory, 0) -- TODO fix type error for onnx export
+        memory = valid_mask.to(memory.dtype) * memory
+        output_memory = self.enc_output(memory)
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors
+
+        # indices of the num_queries tokens with the highest best-class logit
+        _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1)
+
+        # gather the selected rows (dim 1) of a 3-D tensor
+        def take_topk(tensor):
+            index = topk_ind.unsqueeze(-1).repeat(1, 1, tensor.shape[-1])
+            return tensor.gather(dim=1, index=index)
+
+        reference_points_unact = take_topk(enc_outputs_coord_unact)
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        enc_topk_logits = take_topk(enc_outputs_class)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
+
+        # query content: learnt embeddings, or the detached selected encoder features
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = take_topk(output_memory).detach()
+        if denoising_class is not None:
+            target = torch.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits
+
+
+    def forward(self, feats, targets=None):
+        """Run query selection and the decoder on ``feats``; return a dict with 'pred_logits' and 'pred_boxes'."""
+        # input projection and embedding
+        (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats)
+        
+        # prepare denoising training (only in training mode with num_denoising > 0)
+        if self.training and self.num_denoising > 0:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(targets, \
+                    self.num_classes, 
+                    self.num_queries, 
+                    self.denoising_class_embed, 
+                    num_denoising=self.num_denoising, 
+                    label_noise_ratio=self.label_noise_ratio, 
+                    box_noise_scale=self.box_noise_scale, )
+        else:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
+        # initial queries and reference points; denoising tensors (if any) are prepended inside
+        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
+            self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact)
+
+        # decoder
+        out_bboxes, out_logits = self.decoder(
+            target,
+            init_ref_points_unact,
+            memory,
+            spatial_shapes,
+            level_start_index,
+            self.dec_bbox_head,
+            self.dec_score_head,
+            self.query_pos_head,
+            attn_mask=attn_mask)
+        # split dim 2 into the denoising part and the rest, sizes taken from dn_meta['dn_num_split']
+        if self.training and dn_meta is not None:
+            dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2)
+            dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2)
+        # main prediction: last entry along dim 0 (presumably the final decoder layer -- verify against TransformerDecoder)
+        out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]}
+
+        if self.training and self.aux_loss:
+            out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1])  # all earlier entries
+            out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes]))  # plus the encoder top-k
+            # denoising outputs are only added when aux_loss is enabled (nested under the check above)
+            if self.training and dn_meta is not None:
+                out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes)
+                out['dn_meta'] = dn_meta
+
+        return out
+
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        """Pair logits and boxes element-wise into a list of prediction dicts."""
+        # Returned as a list of homogeneous dicts because torchscript does not
+        # support a dict whose values mix Tensors and lists.
+        paired = zip(outputs_class, outputs_coord)
+        return [dict(pred_logits=logits, pred_boxes=boxes) for logits, boxes in paired]
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py
new file mode 100644
index 0000000..7d70113
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py
@@ -0,0 +1,81 @@
+"""by lyuwenyu
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+import torchvision
+
+from src.core import register
+
+
+__all__ = ['RTDETRPostProcessor']
+
+
+@register
+class RTDETRPostProcessor(nn.Module):
+    """Convert raw ``pred_logits`` / ``pred_boxes`` into labels, scaled xyxy boxes and scores."""
+    __share__ = ['num_classes', 'use_focal_loss', 'num_top_queries', 'remap_mscoco_category']
+
+    def __init__(self, num_classes=80, use_focal_loss=True, num_top_queries=300, remap_mscoco_category=False) -> None:
+        super().__init__()
+        self.use_focal_loss = use_focal_loss
+        self.num_top_queries = num_top_queries
+        self.num_classes = num_classes
+        self.remap_mscoco_category = remap_mscoco_category
+        self.deploy_mode = False  # set by deploy(); forward() then returns plain tensors
+
+    def extra_repr(self) -> str:
+        return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'
+
+    def forward(self, outputs, orig_target_sizes):
+        """Pick the top predictions and scale their boxes by ``orig_target_sizes``.
+
+        Returns ``(labels, boxes, scores)`` in deploy mode, otherwise a list with
+        one ``dict(labels=..., boxes=..., scores=...)`` per image.
+        """
+        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
+
+        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
+        bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
+
+        if self.use_focal_loss:
+            scores = F.sigmoid(logits)
+            # top-k over all (query, class) pairs; the flat index encodes both
+            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
+            labels = index % self.num_classes
+            index = index // self.num_classes
+            boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))
+        else:
+            # Softmax over the class axis. With ``dim`` omitted the legacy default
+            # for a 3-D input is dim 0, which normalised across the batch instead.
+            scores = F.softmax(logits, dim=-1)[:, :, :-1]  # last class column is dropped
+            scores, labels = scores.max(dim=-1)
+            boxes = bbox_pred
+            if scores.shape[1] > self.num_top_queries:
+                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
+                labels = torch.gather(labels, dim=1, index=index)
+                boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+
+        # TODO for onnx export
+        if self.deploy_mode:
+            return labels, boxes, scores
+
+        # TODO
+        if self.remap_mscoco_category:
+            from ...data.coco import mscoco_label2category
+            labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\
+                .to(boxes.device).reshape(labels.shape)
+
+        return [dict(labels=lab, boxes=box, scores=sco) for lab, box, sco in zip(labels, boxes, scores)]
+
+    def deploy(self, ):
+        """Switch to eval and deploy mode; returns ``self``."""
+        self.eval()
+        self.deploy_mode = True
+        return self
+
+    @property
+    def iou_types(self, ):
+        return ('bbox', )
diff --git a/rtdetr_pytorch/src/zoo/rtdetr/utils.py b/rtdetr_pytorch/src/zoo/rtdetr/utils.py
new file mode 100644
index 0000000..4f44cc5
--- /dev/null
+++ b/rtdetr_pytorch/src/zoo/rtdetr/utils.py
@@ -0,0 +1,101 @@
+"""by lyuwenyu
+"""
+
+import math
+import torch 
+import torch.nn as nn
+import torch.nn.functional as F 
+
+
+def inverse_sigmoid(x: torch.Tensor, eps: float=1e-5) -> torch.Tensor:
+    prob = torch.clamp(x, min=0., max=1.)  # restrict the input to [0, 1] first
+    return (prob.clamp(min=eps) / (1 - prob).clamp(min=eps)).log()  # eps floors keep the log finite
+
+
+def deformable_attention_core_func(value, value_spatial_shapes, sampling_locations, attention_weights):
+    """Sample ``value`` at the given locations on every level and sum the samples with the attention weights.
+    Args:
+        value (Tensor): [bs, value_length, n_head, c]
+        value_spatial_shapes (Tensor|List): [n_levels, 2], the (h, w) of each level;
+            ``value`` is split along dim 1 into consecutive chunks of h * w per level
+        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
+        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]
+
+    Returns:
+        output (Tensor): [bs, Length_{query}, n_head * c]
+    """
+    bs, _, n_head, c = value.shape
+    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
+
+    split_shape = [h * w for h, w in value_spatial_shapes]
+    value_list = value.split(split_shape, dim=1)
+    sampling_grids = 2 * sampling_locations - 1  # maps 0 -> -1 and 1 -> +1, the coordinate range used by F.grid_sample
+    sampling_value_list = []
+    for level, (h, w) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[level].flatten(2).permute(
+            0, 2, 1).reshape(bs * n_head, c, h, w)
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, level].permute(
+            0, 2, 1, 3, 4).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(
+            value_l_,
+            sampling_grid_l_,
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.permute(0, 2, 1, 3, 4).reshape(
+        bs * n_head, 1, Len_q, n_levels * n_points)
+    output = (torch.stack(
+        sampling_value_list, dim=-2).flatten(-2) *
+              attention_weights).sum(-1).reshape(bs, n_head * c, Len_q)  # weighted sum over all levels and points
+
+    return output.permute(0, 2, 1)
+
+
+import math 
+def bias_init_with_prob(prior_prob=0.01):
+    """Return the bias ``b`` for which ``sigmoid(b)`` equals ``prior_prob``."""
+    odds_against = (1 - prior_prob) / prior_prob
+    return float(-math.log(odds_against))
+
+
+
+def get_activation(act: str, inpace: bool=True):
+    """Build an activation module from its name.
+
+    Args:
+        act: case-insensitive name ('silu', 'relu', 'leaky_relu', 'gelu'),
+            ``None`` for ``nn.Identity``, or an ``nn.Module`` which is used as is.
+        inpace: value written to the module's ``inplace`` attribute when it has one
+            (the parameter keeps its original spelling for existing callers).
+
+    Raises:
+        RuntimeError: if ``act`` is not one of the supported values.
+    """
+    # None and ready-made modules must be handled before any string method is
+    # called on ``act``; previously ``act.lower()`` ran first and raised.
+    if act is None:
+        m = nn.Identity()
+    elif isinstance(act, nn.Module):
+        m = act
+    else:
+        factories = {
+            'silu': nn.SiLU,
+            'relu': nn.ReLU,
+            'leaky_relu': nn.LeakyReLU,
+            'gelu': nn.GELU,
+        }
+        name = act.lower() if isinstance(act, str) else None
+        if name not in factories:
+            raise RuntimeError(f'Unsupported activation: {act!r}')
+        m = factories[name]()
+
+    if hasattr(m, 'inplace'):
+        m.inplace = inpace
+    return m
+
+
diff --git a/rtdetr_pytorch/tools/README.md b/rtdetr_pytorch/tools/README.md
new file mode 100644
index 0000000..00eb9d1
--- /dev/null
+++ b/rtdetr_pytorch/tools/README.md
@@ -0,0 +1,24 @@
+
+
+Train/test script examples
+- `CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --master-port=8989 tools/train.py -c path/to/config &> train.log 2>&1 &`
+- `-r path/to/checkpoint`
+- `--amp`
+- `--test-only` 
+
+
+Tuning script examples
+- `torchrun --master_port=8844 --nproc_per_node=4 tools/train.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -t https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth` 
+
+
+Export script examples
+- `python tools/export_onnx.py -c path/to/config -r path/to/checkpoint --check`
+
+
+GPU does not release memory
+- `ps aux | grep "tools/train.py" | awk '{print $2}' | xargs kill -9`
+
+
+Save all logs
+- Append `&> train.log 2>&1 &` or `&> train.log 2>&1` to the command
+
diff --git a/rtdetr_pytorch/tools/export_onnx.py b/rtdetr_pytorch/tools/export_onnx.py
new file mode 100644
index 0000000..789420f
--- /dev/null
+++ b/rtdetr_pytorch/tools/export_onnx.py
@@ -0,0 +1,147 @@
+"""by lyuwenyu
+"""
+
+import os 
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
+import argparse
+import numpy as np 
+
+from src.core import YAMLConfig
+
+import torch
+import torch.nn as nn 
+
+
+def main(args, ):
+    """Export the checkpointed model together with its post-processor to an ONNX file.
+
+    Reads ``args.config``, ``args.resume``, ``args.file_name``, ``args.check`` and
+    ``args.simplify``; raises AttributeError when no checkpoint is given.
+    """
+    cfg = YAMLConfig(args.config, resume=args.resume)
+
+    if not args.resume:
+        raise AttributeError('only support resume to load model.state_dict by now.')
+    # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
+    checkpoint = torch.load(args.resume, map_location='cpu') 
+    state = checkpoint['ema']['module'] if 'ema' in checkpoint else checkpoint['model']
+
+    # NOTE load train mode state -> convert to deploy mode
+    cfg.model.load_state_dict(state)
+
+    class Model(nn.Module):
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+            print(self.postprocessor.deploy_mode)
+
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            return self.postprocessor(outputs, orig_target_sizes)
+
+    model = Model()
+
+    dynamic_axes = {
+        'images': {0: 'N', },
+        'orig_target_sizes': {0: 'N'}
+    }
+
+    data = torch.rand(1, 3, 640, 640)
+    size = torch.tensor([[640, 640]])
+
+    torch.onnx.export(
+        model, 
+        (data, size), 
+        args.file_name,
+        input_names=['images', 'orig_target_sizes'],
+        output_names=['labels', 'boxes', 'scores'],
+        dynamic_axes=dynamic_axes,
+        opset_version=16, 
+        verbose=False
+    )
+
+    if args.check:
+        import onnx
+        onnx_model = onnx.load(args.file_name)
+        onnx.checker.check_model(onnx_model)
+        print('Check export onnx model done...')
+
+    if args.simplify:
+        # onnx is imported here as well: --simplify can be used without --check,
+        # which previously left ``onnx`` undefined at the ``onnx.save`` call below
+        import onnx
+        import onnxsim
+        dynamic = True 
+        input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None
+        onnx_model_simplify, check = onnxsim.simplify(args.file_name, input_shapes=input_shapes, dynamic_input_shape=dynamic)
+        onnx.save(onnx_model_simplify, args.file_name)
+        print(f'Simplify onnx model {check}...')
+
+
+    # import onnxruntime as ort 
+    # from PIL import Image, ImageDraw, ImageFont
+    # from torchvision.transforms import ToTensor
+    # from src.data.coco.coco_dataset import mscoco_category2name, mscoco_category2label, mscoco_label2category
+
+    # # print(onnx.helper.printable_graph(mm.graph))
+
+    # # Load the original image without resizing
+    # original_im = Image.open('./hongkong.jpg').convert('RGB')
+    # original_size = original_im.size
+
+    # # Resize the image for model input
+    # im = original_im.resize((640, 640))
+    # im_data = ToTensor()(im)[None]
+    # print(im_data.shape)
+
+    # sess = ort.InferenceSession(args.file_name)
+    # output = sess.run(
+    #     # output_names=['labels', 'boxes', 'scores'],
+    #     output_names=None,
+    #     input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}
+    # )
+
+    # # print(type(output))
+    # # print([out.shape for out in output])
+
+    # labels, boxes, scores = output
+
+    # draw = ImageDraw.Draw(original_im)  # Draw on the original image
+    # thrh = 0.6
+
+    # for i in range(im_data.shape[0]):
+
+    #     scr = scores[i]
+    #     lab = labels[i][scr > thrh]
+    #     box = boxes[i][scr > thrh]
+
+    #     print(i, sum(scr > thrh))
+
+    #     for b, l in zip(box, lab):
+    #         # Scale the bounding boxes back to the original image size
+    #         b = [coord * original_size[j % 2] / 640 for j, coord in enumerate(b)]
+    #         # Get the category name from the label
+    #         category_name = mscoco_category2name[mscoco_label2category[l]]
+    #         draw.rectangle(list(b), outline='red', width=2)
+    #         font = ImageFont.truetype("Arial.ttf", 15)
+    #         draw.text((b[0], b[1]), text=category_name, fill='yellow', font=font)
+
+    # # Save the original image with bounding boxes
+    # original_im.save('test.jpg')
+
+
+if __name__ == '__main__':
+    # command-line entry point: parse the export options and run main()
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--config', '-c', type=str)
+    cli.add_argument('--resume', '-r', type=str)
+    cli.add_argument('--file-name', '-f', type=str, default='model.onnx')
+    for switch in ('--check', '--simplify'):
+        cli.add_argument(switch, action='store_true', default=False)
+
+    args = cli.parse_args()
+
+    main(args)
diff --git a/rtdetr_pytorch/tools/infer.py b/rtdetr_pytorch/tools/infer.py
new file mode 100644
index 0000000..385ce80
--- /dev/null
+++ b/rtdetr_pytorch/tools/infer.py
@@ -0,0 +1,203 @@
+import torch
+import torch.nn as nn 
+import torchvision.transforms as T
+from torch.cuda.amp import autocast
+import numpy as np 
+from PIL import Image, ImageDraw, ImageFont
+import os 
+import sys 
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+import argparse
+import src.misc.dist as dist 
+from src.core import YAMLConfig 
+from src.solver import TASKS
+import numpy as np
+
+def postprocess(labels, boxes, scores, iou_threshold=0.55):
+    """Greedily fuse same-label boxes that overlap a seed box.
+
+    Boxes are visited in input order. Each unused box becomes a seed and absorbs
+    every later unused box with the same label whose IoU with the seed is at
+    least ``iou_threshold``. A group is reported as its enclosing box, the seed's
+    label and the highest score in the group.
+
+    Returns ``[labels], [boxes], [scores]``: three single-element lists of arrays.
+    """
+    def calculate_iou(box1, box2):
+        # both boxes are unpacked as (x1, y1, x2, y2)
+        ax1, ay1, ax2, ay2 = box1
+        bx1, by1, bx2, by2 = box2
+        overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
+        overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
+        inter_area = overlap_w * overlap_h
+        union_area = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter_area
+        return inter_area / union_area if union_area != 0 else 0
+
+    merged_labels, merged_boxes, merged_scores = [], [], []
+    consumed = set()
+    total = len(boxes)
+    for seed in range(total):
+        if seed in consumed:
+            continue
+        consumed.add(seed)
+        seed_box, seed_label = boxes[seed], labels[seed]
+        group_boxes, group_scores = [seed_box], [scores[seed]]
+
+        for other in range(seed + 1, total):
+            if other in consumed or labels[other] != seed_label:
+                continue
+            # overlap is always measured against the seed, not the growing group
+            if calculate_iou(seed_box, boxes[other]) >= iou_threshold:
+                group_boxes.append(boxes[other].tolist())
+                group_scores.append(scores[other])
+                consumed.add(other)
+
+        # enclosing box of the whole group
+        xs = np.concatenate([[b[0], b[2]] for b in group_boxes])
+        ys = np.concatenate([[b[1], b[3]] for b in group_boxes])
+        merged_boxes.append([np.min(xs), np.min(ys), np.max(xs), np.max(ys)])
+        merged_labels.append(seed_label)
+        merged_scores.append(max(group_scores))
+
+    # each output is wrapped in a one-element list
+    return [np.array(merged_labels)], [np.array(merged_boxes)], [np.array(merged_scores)]
+def slice_image(image, slice_height, slice_width, overlap_ratio):
+    """Cut ``image`` into overlapping tiles, row by row.
+
+    Returns ``(slices, coordinates)``: the cropped tiles and the (x, y) of each
+    tile's top-left corner. Tiles on the right and bottom edges are cropped to
+    the image bounds.
+    """
+    img_width, img_height = image.size
+    # stride between neighbouring tiles, shortened by the requested overlap
+    stride_x = int(slice_width * (1 - overlap_ratio))
+    stride_y = int(slice_height * (1 - overlap_ratio))
+
+    coordinates = [(x, y) for y in range(0, img_height, stride_y) for x in range(0, img_width, stride_x)]
+    slices = [image.crop((x, y, min(x + slice_width, img_width), min(y + slice_height, img_height))) for x, y in coordinates]
+    return slices, coordinates
+def merge_predictions(predictions, slice_coordinates, orig_image_size, slice_width, slice_height, threshold=0.30):
+    """Pool per-slice detections in full-image coordinates.
+
+    Detections with a score not above ``threshold`` are dropped; the others are shifted
+    by their slice's (x, y) offset and clipped to ``orig_image_size`` (height, width).
+    ``slice_width`` and ``slice_height`` are accepted but not used here.
+    """
+    orig_height, orig_width = orig_image_size
+    pooled_labels, pooled_boxes, pooled_scores = [], [], []
+    for idx, (label, boxes, scores) in enumerate(predictions):
+        x_shift, y_shift = slice_coordinates[idx]
+        flat_scores = np.array(scores).reshape(-1)
+        keep = flat_scores > threshold
+        kept_labels = np.array(label).reshape(-1)[keep]
+        kept_boxes = np.array(boxes).reshape(-1, 4)[keep]
+        # columns 0 and 2 hold x coordinates, columns 1 and 3 hold y coordinates
+        kept_boxes[:, 0::2] = np.clip(kept_boxes[:, 0::2] + x_shift, 0, orig_width)
+        kept_boxes[:, 1::2] = np.clip(kept_boxes[:, 1::2] + y_shift, 0, orig_height)
+        pooled_labels.extend(kept_labels)
+        pooled_boxes.extend(kept_boxes)
+        pooled_scores.extend(flat_scores[keep])
+    return np.array(pooled_labels), np.array(pooled_boxes), np.array(pooled_scores)
+def draw(images, labels, boxes, scores, thrh = 0.6, path = ""):
+    """Outline every detection scoring above ``thrh`` on each image and save the image.
+
+    With an empty ``path`` image ``i`` is written to ``results_{i}.jpg``.
+    """
+    for i, im in enumerate(images):
+        canvas = ImageDraw.Draw(im)
+        keep = scores[i] > thrh
+        lab, box, scrs = labels[i][keep], boxes[i][keep], scores[i][keep]
+        for j, b in enumerate(box):
+            canvas.rectangle(list(b), outline='red',)
+            caption = f"label: {lab[j].item()} {round(scrs[j].item(),2)}"
+            canvas.text((b[0], b[1]), text=caption, font=ImageFont.load_default(), fill='blue')
+        im.save(f'results_{i}.jpg' if path == "" else path)
+            
+def main(args, ):
+    """Run detection on one image, optionally tile by tile, and save the drawing.
+
+    Reads ``args.config``, ``args.resume``, ``args.im_file``, ``args.device``,
+    ``args.sliced`` and ``args.numberofboxes``; raises AttributeError when no
+    checkpoint is given.
+    """
+    cfg = YAMLConfig(args.config, resume=args.resume)
+    if not args.resume:
+        raise AttributeError('Only support resume to load model.state_dict by now.')
+    # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
+    checkpoint = torch.load(args.resume, map_location='cpu') 
+    state = checkpoint['ema']['module'] if 'ema' in checkpoint else checkpoint['model']
+    # NOTE load train mode state -> convert to deploy mode
+    cfg.model.load_state_dict(state)
+
+    class Model(nn.Module):
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            outputs = self.postprocessor(outputs, orig_target_sizes)
+            return outputs
+
+    model = Model().to(args.device)
+    im_pil = Image.open(args.im_file).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None].to(args.device)
+
+    transforms = T.Compose([
+        T.Resize((640, 640)),  
+        T.ToTensor(),
+    ])
+    im_data = transforms(im_pil)[None].to(args.device)
+    if args.sliced:
+        num_boxes = args.numberofboxes
+        # Keep the grid at 1x1 or larger: small tile counts or extreme aspect
+        # ratios used to round a dimension down to 0 and divide by zero below.
+        aspect_ratio = w / h
+        num_cols = max(1, int(np.sqrt(num_boxes * aspect_ratio)))
+        num_rows = max(1, int(num_boxes / num_cols))
+        slice_height = h // num_rows
+        slice_width = w // num_cols
+        overlap_ratio = 0.2
+        slices, coordinates = slice_image(im_pil, slice_height, slice_width, overlap_ratio)
+        predictions = []
+        for slice_img in slices:
+            slice_tensor = transforms(slice_img)[None].to(args.device)
+            slice_size = torch.tensor([[slice_img.size[0], slice_img.size[1]]]).to(args.device)
+            # inference only: no autograd graph is built; AMP is used for each slice
+            with torch.no_grad(), autocast():
+                labels, boxes, scores = model(slice_tensor, slice_size)
+            torch.cuda.empty_cache() 
+            predictions.append((labels.detach().cpu().numpy(), boxes.detach().cpu().numpy(), scores.detach().cpu().numpy()))
+
+        merged_labels, merged_boxes, merged_scores = merge_predictions(predictions, coordinates, (h, w), slice_width, slice_height)
+        labels, boxes, scores = postprocess(merged_labels, merged_boxes, merged_scores)
+    else:
+        with torch.no_grad():
+            labels, boxes, scores = model(im_data, orig_size)
+
+    draw([im_pil], labels, boxes, scores, 0.6)
+  
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, )
+    parser.add_argument('-r', '--resume', type=str, )
+    parser.add_argument('-f', '--im-file', type=str, )
+    # ``type=bool`` treated every non-empty value as True, so ``-s False`` enabled slicing; the text is parsed instead.
+    parser.add_argument('-s', '--sliced', type=lambda v: str(v).strip().lower() in ('1', 'true', 'yes', 'y'), default=False)
+    parser.add_argument('-d', '--device', type=str, default='cpu')
+    parser.add_argument('-nc', '--numberofboxes', type=int, default=25)
+    args = parser.parse_args()
+    main(args)
+
+
+
+
+
+
+
+
+
+
+
diff --git a/rtdetr_pytorch/tools/train.py b/rtdetr_pytorch/tools/train.py
new file mode 100644
index 0000000..31b31ef
--- /dev/null
+++ b/rtdetr_pytorch/tools/train.py
@@ -0,0 +1,50 @@
+"""by lyuwenyu
+"""
+
+import os 
+import sys 
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+import argparse
+
+import src.misc.dist as dist 
+from src.core import YAMLConfig 
+from src.solver import TASKS
+
+
+def main(args, ) -> None:
+    """Initialise distributed state, build the solver from the config and run it.
+
+    Runs validation when ``args.test_only`` is set, training otherwise.
+    """
+    dist.init_distributed()
+    if args.seed is not None:
+        dist.set_seed(args.seed)
+
+    # resuming and tuning cannot be requested together
+    both_given = bool(args.tuning) and bool(args.resume)
+    assert not both_given, \
+        'Only support from_scrach or resume or tuning at one time'
+
+    cfg = YAMLConfig(
+        args.config, resume=args.resume, use_amp=args.amp, tuning=args.tuning,
+    )
+
+    # the config's 'task' entry selects the solver class
+    solver = TASKS[cfg.yaml_cfg['task']](cfg)
+
+    run = solver.val if args.test_only else solver.fit
+    run()
+
+
+if __name__ == '__main__':
+    # command-line entry point for training / evaluation
+    cli = argparse.ArgumentParser()
+    for flags in (('--config', '-c'), ('--resume', '-r'), ('--tuning', '-t')):
+        cli.add_argument(*flags, type=str)
+    cli.add_argument('--test-only', action='store_true', default=False)
+    cli.add_argument('--amp', action='store_true', default=False)
+    cli.add_argument('--seed', type=int, help='seed')
+
+    args = cli.parse_args()
+
+    main(args)
diff --git a/rtdetrv2_paddle/readme.md b/rtdetrv2_paddle/readme.md
new file mode 100644
index 0000000..6d4f01b
--- /dev/null
+++ b/rtdetrv2_paddle/readme.md
@@ -0,0 +1 @@
+see https://github.com/PaddlePaddle/PaddleDetection
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/Dockerfile b/rtdetrv2_pytorch/Dockerfile
new file mode 100644
index 0000000..4682732
--- /dev/null
+++ b/rtdetrv2_pytorch/Dockerfile
@@ -0,0 +1,10 @@
+FROM nvcr.io/nvidia/pytorch:25.06-py3
+# later instructions run relative to /workspace
+WORKDIR /workspace
+# copy the dependency list into the image so it can be installed at build time
+COPY requirements.txt .
+# upgrade pip, then install the Python dependencies
+RUN pip install --upgrade pip && \
+    pip install -r requirements.txt
+# start a shell by default
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/README.md b/rtdetrv2_pytorch/README.md
new file mode 100644
index 0000000..69b6f13
--- /dev/null
+++ b/rtdetrv2_pytorch/README.md
@@ -0,0 +1,168 @@
+
+## Quick start
+
+<details >
+<summary>Setup</summary>
+
+```shell
+
+pip install -r requirements.txt
+```
+
+The following are the corresponding `torch` and `torchvision` versions.
+`rtdetr` | `torch` | `torchvision`
+|---|---|---|
+| `-` | `2.4` | `0.19` |
+| `-` | `2.2` | `0.17` |
+| `-` | `2.1` | `0.16` |
+| `-` | `2.0` | `0.15` |
+
+</details>
+
+<details open>
+<summary>Fig</summary>
+
+<div align="center">
+<img width="500" alt="image" src="https://github.com/user-attachments/assets/437877e9-1d4f-4d30-85e8-aafacfa0ec56">
+</div>
+
+</details>
+
+
+## Model Zoo
+
+### Base models
+
+| Model | Dataset | Input Size | AP<sup>val</sup> | AP<sub>50</sub><sup>val</sup> | #Params(M) | FPS | config| checkpoint | 
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |:---: |
+**RT-DETRv2-S** | COCO | 640 | **48.1** <font color=green>(+1.6)</font> | **65.1** | 20 | 217 | [config](./configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth) |
+**RT-DETRv2-M**<sup>*</sup> | COCO | 640 | **49.9** <font color=green>(+1.0)</font> | **67.5** | 31 | 161 | [config](./configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth)
+**RT-DETRv2-M** | COCO | 640 | **51.9** <font color=green>(+0.6)</font> | **69.9** | 36 | 145 | [config](./configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth)
+**RT-DETRv2-L** | COCO | 640 | **53.4** <font color=green>(+0.3)</font> | **71.6** | 42 | 108 | [config](./configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth)
+**RT-DETRv2-X** | COCO | 640 | 54.3 | **72.8** <font color=green>(+0.1)</font> | 76 | 74 | [config](./configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth)
+<!-- rtdetrv2_hgnetv2_l | COCO | 640 | 52.9 | 71.5 | 32 | 114 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_hgnetv2_l_6x_coco_from_paddle.pth) 
+rtdetrv2_hgnetv2_x | COCO | 640 | 54.7 | 72.9 | 67 | 74 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_hgnetv2_x_6x_coco_from_paddle.pth) 
+rtdetrv2_hgnetv2_h | COCO | 640 | 56.3 | 74.8 | 123 | 40 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_hgnetv2_h_6x_coco_from_paddle.pth) 
+rtdetrv2_18vd | COCO+Objects365 | 640 | 49.0 | 66.5 | 20 | 217 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_5x_coco_objects365_from_paddle.pth)
+rtdetrv2_r50vd | COCO+Objects365 | 640 | 55.2 | 73.4 | 42 | 108 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_2x_coco_objects365_from_paddle.pth)
+rtdetrv2_r101vd | COCO+Objects365 | 640 | 56.2 | 74.5 | 76 | 74 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_2x_coco_objects365_from_paddle.pth)
+ -->
+
+**Notes:**
+- `AP` is evaluated on *MSCOCO val2017* dataset.
+- `FPS` is evaluated on a single T4 GPU with $batch\\_size = 1$, $fp16$, and $TensorRT>=8.5.1$.
+- `COCO + Objects365` in the table means finetuned model on `COCO` using pretrained weights trained on `Objects365`.
+
+
+
+### Models of discrete sampling
+
+| Model | Sampling Method | AP<sup>val</sup> | AP<sub>50</sub><sup>val</sup> | config| checkpoint 
+| :---: | :---: | :---: | :---: | :---: | :---: |
+**RT-DETRv2-S_dsp** | discrete_sampling | 47.4 | 64.8 <font color=red>(-0.1)</font> | [config](./configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_dsp_3x_coco.pth)
+**RT-DETRv2-M**<sup>*</sup>**_dsp** | discrete_sampling | 49.2 | 67.1 <font color=red>(-0.4)</font> | [config](./configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rrtdetrv2_r34vd_dsp_1x_coco.pth)
+**RT-DETRv2-M_dsp** | discrete_sampling | 51.4 | 69.7 <font color=red>(-0.2)</font> | [config](./configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_dsp_3x_coco.pth)
+**RT-DETRv2-L_dsp** | discrete_sampling | 52.9 | 71.3 <font color=red>(-0.3)</font> |[config](./configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml)| [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_dsp_1x_coco.pth)
+
+
+<!-- **rtdetrv2_r18vd_dsp1** | discrete_sampling | 21600 | 46.3 | 63.9 | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_dsp1_1x_coco.pth) -->
+
+<!-- rtdetrv2_r18vd_dsp1 | discrete_sampling | 21600 | 45.5 | 63.0 | 4.34 | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_dsp1_120e_coco.pth) -->
+<!-- 4.3 -->
+
+**Notes:**
+- The impact on inference speed is related to specific device and software.
+- `*_dsp*` models inherit the knowledge of the corresponding `*_sp*` models and are adapted to the `discrete_sampling` strategy. **You can use TensorRT 8.4 (or even older versions) to run inference with these models.**
+<!-- - `grid_sampling` use `grid_sample` to sample attention map, `discrete_sampling` use `index_select` method to sample attention map.  -->
+
+
+### Ablation on sampling points
+
+<!-- Flexible sampling strategy in the cross-attention layer for devices that do **not** optimize (or do not support) `grid_sampling` well. You can choose models based on specific scenarios and the trade-off between speed and accuracy. -->
+
+| Model | Sampling Method | #Points | AP<sup>val</sup> | AP<sub>50</sub><sup>val</sup> | checkpoint 
+| :---: | :---: | :---: | :---: | :---: | :---: |
+**rtdetrv2_r18vd_sp1** | grid_sampling | 21,600 | 47.3 | 64.3 <font color=red>(-0.6)</font> | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_sp1_120e_coco.pth)
+**rtdetrv2_r18vd_sp2** | grid_sampling | 43,200 | 47.7 | 64.7 <font color=red>(-0.2)</font> | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_sp2_120e_coco.pth)
+**rtdetrv2_r18vd_sp3** | grid_sampling | 64,800 | 47.8 | 64.8 <font color=red>(-0.1)</font> | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_sp3_120e_coco.pth)
+rtdetrv2_r18vd(_sp4)| grid_sampling | 86,400 | 47.9 | 64.9 | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_120e_coco.pth) 
+
+**Notes:**
+- The impact on inference speed is related to specific device and software.
+- `#Points` is the total number of sampling points in the decoder for inference on a single image.
+
+
+## Usage
+<details>
+<summary> details </summary>
+
+<!-- <summary>1. Training </summary> -->
+1. Training
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config --use-amp --seed=0 &> log.txt 2>&1 &
+```
+
+<!-- <summary>2. Testing </summary> -->
+2. Testing
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -r path/to/checkpoint --test-only
+```
+
+<!-- <summary>3. Tuning </summary> -->
+3. Tuning
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -t path/to/checkpoint --use-amp --seed=0 &> log.txt 2>&1 &
+```
+
+<!-- <summary>4. Export onnx </summary> -->
+4. Export onnx
+```shell
+python tools/export_onnx.py -c path/to/config -r path/to/checkpoint --check
+```
+
+<!-- <summary>5. Export tensorrt </summary> -->
+5. Export tensorrt
+```shell
+python tools/export_trt.py -i path/to/onnxfile
+```
+
+<!-- <summary>6. Inference </summary> -->
+6. Inference
+
+Support torch, onnxruntime, tensorrt and openvino, see details in *references/deploy*
+```shell
+python references/deploy/rtdetrv2_onnxruntime.py --onnx-file=model.onnx --im-file=xxxx
+python references/deploy/rtdetrv2_tensorrt.py --trt-file=model.trt --im-file=xxxx
+python references/deploy/rtdetrv2_torch.py -c path/to/config -r path/to/checkpoint --im-file=xxx --device=cuda:0
+```
+</details>
+
+
+
+## Citation
+If you use `RTDETR` or `RTDETRv2` in your work, please use the following BibTeX entries:
+
+<details>
+<summary> bibtex </summary>
+
+```latex
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{lv2024rtdetrv2improvedbaselinebagoffreebies,
+      title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, 
+      author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu},
+      year={2024},
+      eprint={2407.17140},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2407.17140}, 
+}
+```
+</details>
diff --git a/rtdetrv2_pytorch/configs/dataset/coco_detection.yml b/rtdetrv2_pytorch/configs/dataset/coco_detection.yml
new file mode 100644
index 0000000..270b319
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/dataset/coco_detection.yml
@@ -0,0 +1,48 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+# num_classes: 365
+# remap_mscoco_category: False
+
+# num_classes: 91
+# remap_mscoco_category: False
+
+num_classes: 80
+remap_mscoco_category: True
+
+
+train_dataloader: 
+  type: DataLoader
+  dataset: 
+    type: CocoDetection
+    img_folder: ./dataset/coco/train2017/
+    ann_file: ./dataset/coco/annotations/instances_train2017.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True 
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset: 
+    type: CocoDetection
+    img_folder: ./dataset/coco/val2017/
+    ann_file: ./dataset/coco/annotations/instances_val2017.json
+    return_masks: False
+    transforms:
+      type: Compose
+      ops: ~ 
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
diff --git a/rtdetrv2_pytorch/configs/dataset/voc_detection.yml b/rtdetrv2_pytorch/configs/dataset/voc_detection.yml
new file mode 100644
index 0000000..7f6f155
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/dataset/voc_detection.yml
@@ -0,0 +1,40 @@
+task: detection
+
+evaluator:
+  type: CocoEvaluator
+  iou_types: ['bbox', ]
+
+num_classes: 20
+
+train_dataloader: 
+  type: DataLoader
+  dataset: 
+    type: VOCDetection
+    root: ./dataset/voc/
+    ann_file: trainval.txt
+    label_file: label_list.txt
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: True
+  num_workers: 4
+  drop_last: True 
+  collate_fn:
+    type: BatchImageCollateFunction
+
+
+val_dataloader:
+  type: DataLoader
+  dataset: 
+    type: VOCDetection
+    root: ./dataset/voc/
+    ann_file: test.txt
+    label_file: label_list.txt
+    transforms:
+      type: Compose
+      ops: ~
+  shuffle: False
+  num_workers: 4
+  drop_last: False
+  collate_fn:
+    type: BatchImageCollateFunction
diff --git a/rtdetrv2_pytorch/configs/rtdetr/include/dataloader.yml b/rtdetrv2_pytorch/configs/rtdetr/include/dataloader.yml
new file mode 100644
index 0000000..64d6dc7
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/include/dataloader.yml
@@ -0,0 +1,31 @@
+
+train_dataloader: 
+  dataset: 
+    return_masks: False
+    transforms:
+      ops:
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}   
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}  
+  collate_fn:
+    type: BatchImageCollateFunction
+    scales: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
+  shuffle: True
+  num_workers: 4
+  total_batch_size: 16
+
+val_dataloader:
+  dataset: 
+    transforms:
+      ops: 
+        - {type: Resize, size: [640, 640]}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}   
+  shuffle: False
+  total_batch_size: 16
+  num_workers: 8
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetr/include/optimizer.yml b/rtdetrv2_pytorch/configs/rtdetr/include/optimizer.yml
new file mode 100644
index 0000000..29abdd8
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/include/optimizer.yml
@@ -0,0 +1,40 @@
+
+use_ema: True 
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+
+
+epoches: 72
+clip_max_norm: 0.1
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*(?:norm|bn)).*$'
+      lr: 0.00001
+    -
+      params: '^(?=.*backbone)(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [1000]
+  gamma: 0.1
+
+
+lr_warmup_scheduler:
+  type: LinearWarmup
+  warmup_duration: 2000
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml b/rtdetrv2_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml
new file mode 100644
index 0000000..f21615e
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml
@@ -0,0 +1,79 @@
+task: detection
+
+model: RTDETR
+criterion: RTDETRCriterion
+postprocessor: RTDETRPostProcessor
+
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+
+
+RTDETR: 
+  backbone: PResNet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformer
+  
+
+PResNet:
+  depth: 50
+  variant: d
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  num_stages: 4
+  freeze_norm: True
+  pretrained: True 
+
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+  
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+  version: v1
+
+RTDETRTransformer:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0 # 1.0 0.4
+
+  eval_idx: -1
+
+
+RTDETRPostProcessor:
+  num_top_queries: 300
+
+
+RTDETRCriterion:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
+
diff --git a/rtdetrv2_pytorch/configs/rtdetr/readme.md b/rtdetrv2_pytorch/configs/rtdetr/readme.md
new file mode 100644
index 0000000..46ccd52
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/readme.md
@@ -0,0 +1,111 @@
+# DETRs Beat YOLOs on Real-time Object Detection
+
+## Introduction
+This repository is the official pytorch implementation of [*RTDETR*](https://arxiv.org/abs/2304.08069v1), and is compatible with [RT-DETR/rtdetr_pytorch](https://github.com/lyuwenyu/RT-DETR/tree/main). For paddle version implementation, please refer to [RT-DETR/rtdetr_paddle](https://github.com/lyuwenyu/RT-DETR/tree/main). **If you are using rtdetr for the first time, it is highly recommended to use [rtdetrv2](../rtdetrv2/)**.
+
+<details open>
+<summary> Fig </summary>
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/42636690-1ecf-4647-b075-842ecb9bc562" width=500>
+</div>
+</details>
+
+<!-- 
+<div align="center">
+  <img src="https://github.com/lyuwenyu/RT-DETR/assets/17582080/42636690-1ecf-4647-b075-842ecb9bc562" width=500>
+</div> -->
+
+
+## Model Zoo
+| Model | Dataset | Input Size | AP<sup>val</sup> | AP<sub>50</sub><sup>val</sup> | #Params(M) | FPS |  checkpoint |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+rtdetr_r18vd | COCO | 640 | 46.4 | 63.7 | 20 | 217 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth)
+rtdetr_r34vd | COCO | 640 | 48.9 | 66.8 | 31 | 161 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth)
+rtdetr_r50vd_m | COCO | 640 | 51.3 | 69.5 | 36 | 145 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth)
+rtdetr_r50vd | COCO | 640 | 53.1 | 71.2| 42 | 108 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth)
+rtdetr_r101vd | COCO | 640 | 54.3 | 72.8 | 76 | 74 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth)
+rtdetr_18vd | COCO+Objects365 | 640 | 49.0 | 66.5 | 20 | 217 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth)
+rtdetr_r50vd | COCO+Objects365 | 640 | 55.2 | 73.4 | 42 | 108 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_2x_coco_objects365_from_paddle.pth)
+rtdetr_r101vd | COCO+Objects365 | 640 | 56.2 | 74.5 | 76 | 74 | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_2x_coco_objects365_from_paddle.pth)
+
+<!-- rtdetr_r18vd | COCO | 640 | 46.5 | 63.6 | 20 | 217 | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_6x_coco.pth) -->
+
+<!-- rtdetr_r18vd | Objects365 | 640 | 22.9 |  31.2| - | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth)
+rtdetr_r50vd | Objects365 | 640 | 35.1 | 46.2 | - | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_2x_coco_objects365_from_paddle.pth)
+rtdetr_r101vd | Objects365 | 640 | 36.8 | 48.3 | - | [url<sup>*</sup>](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_2x_coco_objects365_from_paddle.pth) -->
+
+Notes
+<!-- - AP is evaluated on coco 2017 val dataset -->
+<!-- RT-DETR was trained on COCO train2017 and evaluated on val2017. -->
+- `COCO + Objects365` in the table means finetuned model on `COCO` using pretrained weights trained on `Objects365`.
+- `FPS` is evaluated on a single T4 GPU with $batch\\_size = 1$ and $tensorrt\\_fp16$ mode
+- `url`<sup>`*`</sup> is the url of the pretrained weights, converted from the paddle model to save energy. *There may be slight differences between this table and the paper.*
+
+
+## Usage
+<details>
+<summary> details </summary>
+
+<!-- <summary>1. Training </summary> -->
+1. Training
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config &> log.txt 2>&1 &
+```
+
+<!-- <summary>2. Testing </summary> -->
+2. Testing
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -r path/to/checkpoint --test-only
+```
+
+<!-- <summary>3. Tuning </summary> -->
+3. Tuning
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -t path/to/checkpoint &> log.txt 2>&1 &
+```
+
+<!-- <summary>4. Export onnx </summary> -->
+4. Export onnx
+```shell
+python tools/export_onnx.py -c path/to/config -r path/to/checkpoint --check
+```
+
+<!-- <summary>5. Inference </summary> -->
+5. Inference
+
+Support torch, onnxruntime, tensorrt and openvino, see details in *references/deploy*
+```shell
+python references/deploy/rtdetrv2_onnxruntime.py --onnx-file=model.onnx --im-file=xxxx
+python references/deploy/rtdetrv2_tensorrt.py --trt-file=model.trt --im-file=xxxx
+python references/deploy/rtdetrv2_torch.py -c path/to/config -r path/to/checkpoint --im-file=xxx --device=cuda:0
+```
+</details>
+
+
+## Citation
+If you use `RTDETR` in your work, please use the following BibTeX entries:
+
+<details>
+<summary> bibtex </summary>
+
+```latex
+@misc{lv2023detrs,
+      title={DETRs Beat YOLOs on Real-time Object Detection},
+      author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu},
+      year={2023},
+      eprint={2304.08069},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@software{Lv_rtdetr_by_cvperception_2023,
+author = {Lv, Wenyu},
+license = {Apache-2.0},
+month = oct,
+title = {{rtdetr by cvperception}},
+url = {https://github.com/lyuwenyu/cvperception/},
+version = {0.0.1dev},
+year = {2023}
+}
+```
+</details>
diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
new file mode 100644
index 0000000..82dc545
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml
@@ -0,0 +1,41 @@
+
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetr_r101vd_6x_coco
+
+
+PResNet:
+  depth: 101
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+
+RTDETRTransformer:
+  feat_channels: [384, 384, 384]
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
new file mode 100644
index 0000000..5e4f95a
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml
@@ -0,0 +1,48 @@
+
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetr_r18vd_6x_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformer:
+  num_layers: 3
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      weight_decay: 0.
+      lr: 0.00001
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
new file mode 100644
index 0000000..f857644
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml
@@ -0,0 +1,48 @@
+
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetr_r34vd_6x_coco
+
+
+PResNet:
+  depth: 34
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformer:
+  num_layers: 4
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      weight_decay: 0.
+      lr: 0.00001
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
new file mode 100644
index 0000000..bc39f4a
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml
@@ -0,0 +1,14 @@
+
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetr_r50vd_6x_coco
+
+
+
diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
new file mode 100644
index 0000000..25d5ad8
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml
@@ -0,0 +1,34 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetr_r50vd.yml',
+]
+
+output_dir: ./output/rtdetr_r50vd_m_6x_coco
+
+
+HybridEncoder:
+  expansion: 0.5
+
+
+RTDETRTransformer:
+  eval_idx: 2 # use the 3rd decoder layer for evaluation
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/include/dataloader.yml b/rtdetrv2_pytorch/configs/rtdetrv2/include/dataloader.yml
new file mode 100644
index 0000000..d55a411
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/include/dataloader.yml
@@ -0,0 +1,38 @@
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      ops:
+        - {type: RandomPhotometricDistort, p: 0.5}
+        - {type: RandomZoomOut, fill: 0}
+        - {type: RandomIoUCrop, p: 0.8}
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: RandomHorizontalFlip}
+        - {type: Resize, size: [640, 640], }
+        - {type: SanitizeBoundingBoxes, min_size: 1}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}   
+        - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True}
+      policy:
+        name: stop_epoch
+        epoch: 71 # epoch in [71, ~) stop `ops`
+        ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop']
+  
+  collate_fn:
+    type: BatchImageCollateFunction
+    scales: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800]
+    stop_epoch: 71 # epoch in [71, ~) stop `multiscales`
+
+  shuffle: True
+  total_batch_size: 16 # total batch size equals to 16 (4 * 4)
+  num_workers: 4
+
+
+val_dataloader:
+  dataset: 
+    transforms:
+      ops: 
+        - {type: Resize, size: [640, 640]}
+        - {type: ConvertPILImage, dtype: 'float32', scale: True}   
+  shuffle: False
+  total_batch_size: 32
+  num_workers: 4
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/include/optimizer.yml b/rtdetrv2_pytorch/configs/rtdetrv2/include/optimizer.yml
new file mode 100644
index 0000000..189a9a1
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/include/optimizer.yml
@@ -0,0 +1,37 @@
+
+use_amp: True
+use_ema: True 
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
+
+
+epoches: 72
+clip_max_norm: 0.1
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+lr_scheduler:
+  type: MultiStepLR
+  milestones: [1000]
+  gamma: 0.1
+
+
+lr_warmup_scheduler:
+  type: LinearWarmup
+  warmup_duration: 2000
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/include/rtdetrv2_r50vd.yml b/rtdetrv2_pytorch/configs/rtdetrv2/include/rtdetrv2_r50vd.yml
new file mode 100644
index 0000000..a5c1490
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/include/rtdetrv2_r50vd.yml
@@ -0,0 +1,83 @@
+task: detection
+
+model: RTDETR
+criterion: RTDETRCriterionv2
+postprocessor: RTDETRPostProcessor
+
+
+use_focal_loss: True
+eval_spatial_size: [640, 640] # h w
+
+
+RTDETR: 
+  backbone: PResNet
+  encoder: HybridEncoder
+  decoder: RTDETRTransformerv2
+  
+
+PResNet:
+  depth: 50
+  variant: d
+  freeze_at: 0
+  return_idx: [1, 2, 3]
+  num_stages: 4
+  freeze_norm: True
+  pretrained: True 
+
+
+HybridEncoder:
+  in_channels: [512, 1024, 2048]
+  feat_strides: [8, 16, 32]
+
+  # intra
+  hidden_dim: 256
+  use_encoder_idx: [2]
+  num_encoder_layers: 1
+  nhead: 8
+  dim_feedforward: 1024
+  dropout: 0.
+  enc_act: 'gelu'
+  
+  # cross
+  expansion: 1.0
+  depth_mult: 1
+  act: 'silu'
+
+
+RTDETRTransformerv2:
+  feat_channels: [256, 256, 256]
+  feat_strides: [8, 16, 32]
+  hidden_dim: 256
+  num_levels: 3
+
+  num_layers: 6
+  num_queries: 300
+
+  num_denoising: 100
+  label_noise_ratio: 0.5
+  box_noise_scale: 1.0 # 1.0 0.4
+
+  eval_idx: -1
+
+  # NEW
+  num_points: [4, 4, 4] # [3,3,3] [2,2,2]
+  cross_attn_method: default # default, discrete
+  query_select_method: default # default, agnostic 
+
+
+RTDETRPostProcessor:
+  num_top_queries: 300
+
+
+RTDETRCriterionv2:
+  weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,}
+  losses: ['vfl', 'boxes', ]
+  alpha: 0.75
+  gamma: 2.0
+
+  matcher:
+    type: HungarianMatcher
+    weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2}
+    alpha: 0.25
+    gamma: 2.0
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_h_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_h_6x_coco.yml
new file mode 100644
index 0000000..7bb3546
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_h_6x_coco.yml
@@ -0,0 +1,50 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_hgnetv2_h_6x_coco
+
+
+RTDETR:
+  backbone: HGNetv2
+
+
+HGNetv2:
+  name: 'H'
+  return_idx: [1, 2, 3]
+  freeze_at: 0
+  freeze_norm: True
+  pretrained: True
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 512
+  dim_feedforward: 2048
+  num_encoder_layers: 2
+
+
+RTDETRTransformerv2:
+  feat_channels: [512, 512, 512]
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000005
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_l_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_l_6x_coco.yml
new file mode 100644
index 0000000..5602496
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_l_6x_coco.yml
@@ -0,0 +1,38 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_hgnetv2_l_6x_coco
+
+
+RTDETR:
+  backbone: HGNetv2
+
+
+HGNetv2:
+  name: 'L'
+  return_idx: [1, 2, 3]
+  freeze_at: 0
+  freeze_norm: True
+  pretrained: True
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000005
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_x_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_x_6x_coco.yml
new file mode 100644
index 0000000..b85d8a5
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_x_6x_coco.yml
@@ -0,0 +1,50 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_hgnetv2_x_6x_coco
+
+
+RTDETR:
+  backbone: HGNetv2
+
+
+HGNetv2:
+  name: 'X'
+  return_idx: [1, 2, 3]
+  freeze_at: 0
+  freeze_norm: True
+  pretrained: True
+
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+
+RTDETRTransformerv2:
+  feat_channels: [384, 384, 384]
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
new file mode 100644
index 0000000..f0171de
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml
@@ -0,0 +1,40 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r101vd_6x_coco
+
+
+PResNet:
+  depth: 101
+
+
+HybridEncoder:
+  # intra
+  hidden_dim: 384
+  dim_feedforward: 2048
+
+
+RTDETRTransformerv2:
+  feat_channels: [384, 384, 384]
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.000001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
new file mode 100644
index 0000000..0a4557b
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml
@@ -0,0 +1,46 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_120e_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 3
+
+
+epoches: 120 
+
+optimizer:
+  type: AdamW
+  params:
+    - 
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_voc.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_voc.yml
new file mode 100644
index 0000000..28b9873
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_voc.yml
@@ -0,0 +1,46 @@
+__include__: [
+  '../dataset/voc_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_120e_voc
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 3
+
+
+epoches: 120 
+
+optimizer:
+  type: AdamW
+  params:
+    - 
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
+  total_batch_size: 32
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml
new file mode 100644
index 0000000..a3a3a58
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml
@@ -0,0 +1,49 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_120e_coco.pth
+
+output_dir: ./output/rtdetrv2_r18vd_dsp_3x_coco
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 3
+  num_points: [4, 4, 4]
+  cross_attn_method: discrete
+
+
+epoches: 36
+
+optimizer:
+  type: AdamW
+  params:
+    - 
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 33
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp1_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp1_120e_coco.yml
new file mode 100644
index 0000000..ed029c1
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp1_120e_coco.yml
@@ -0,0 +1,47 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_sp1_120e_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 3
+  num_points: [1, 1, 1]
+
+
+epoches: 120 
+
+optimizer:
+  type: AdamW
+  params:
+    - 
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp2_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp2_120e_coco.yml
new file mode 100644
index 0000000..c75d0d7
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp2_120e_coco.yml
@@ -0,0 +1,47 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_sp2_120e_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 3
+  num_points: [2, 2, 2]
+
+
+epoches: 120 
+
+optimizer:
+  type: AdamW
+  params:
+    - 
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp3_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp3_120e_coco.yml
new file mode 100644
index 0000000..2a00b1c
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp3_120e_coco.yml
@@ -0,0 +1,47 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r18vd_sp3_120e_coco
+
+
+PResNet:
+  depth: 18
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 3
+  num_points: [3, 3, 3]
+
+
+epoches: 120 
+
+optimizer:
+  type: AdamW
+  params:
+    - 
+      params: '^(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    scales: ~
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
new file mode 100644
index 0000000..348c0e9
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml
@@ -0,0 +1,57 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r34vd_120e_coco
+
+
+PResNet:
+  depth: 34
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 4
+
+
+epoches: 120
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00005
+    - 
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.00005
+      weight_decay: 0.
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 117
+  collate_fn:
+    stop_epoch: 117
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml
new file mode 100644
index 0000000..064d5f3
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml
@@ -0,0 +1,59 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth
+
+output_dir: ./output/rtdetrv2_r34vd_dsp_1x_coco
+
+
+PResNet:
+  depth: 34
+  freeze_at: -1
+  freeze_norm: False
+  pretrained: True
+
+
+HybridEncoder:
+  in_channels: [128, 256, 512]
+  hidden_dim: 256
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  num_layers: 4
+  cross_attn_method: discrete
+
+
+epoches: 12
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm|bn).*$'
+      lr: 0.00005
+    - 
+      params: '^(?=.*backbone)(?=.*norm|bn).*$'
+      lr: 0.00005
+      weight_decay: 0.
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 10
+  collate_fn:
+    stop_epoch: 10
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
new file mode 100644
index 0000000..63f0bd6
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml
@@ -0,0 +1,27 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+output_dir: ./output/rtdetrv2_r50vd_6x_coco
+
+
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml
new file mode 100644
index 0000000..1c1cfad
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml
@@ -0,0 +1,27 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+
+tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth
+
+output_dir: ./output/rtdetrv2_r50vd_dsp_1x_coco
+
+
+RTDETRTransformerv2:
+  cross_attn_method: discrete
+
+
+epoches: 12
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 10
+  collate_fn:
+    stop_epoch: 10
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
new file mode 100644
index 0000000..43ab113
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml
@@ -0,0 +1,43 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+output_dir: ./output/rtdetrv2_r50vd_m_6x_coco
+
+
+HybridEncoder:
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  eval_idx: 2 # use the 3rd decoder layer for evaluation
+
+
+epoches: 84
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 81
+  collate_fn:
+    stop_epoch: 81
diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml
new file mode 100644
index 0000000..af617ff
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml
@@ -0,0 +1,44 @@
+__include__: [
+  '../dataset/coco_detection.yml',
+  '../runtime.yml',
+  './include/dataloader.yml',
+  './include/optimizer.yml',
+  './include/rtdetrv2_r50vd.yml',
+]
+
+output_dir: ./output/rtdetrv2_r50vd_m_dsp_3x_coco
+tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth
+
+HybridEncoder:
+  expansion: 0.5
+
+
+RTDETRTransformerv2:
+  eval_idx: 2 # use the 3rd decoder layer for evaluation
+  cross_attn_method: discrete
+
+
+epoches: 36
+
+optimizer:
+  type: AdamW
+  params: 
+    - 
+      params: '^(?=.*backbone)(?!.*norm).*$'
+      lr: 0.00001
+    - 
+      params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$'
+      weight_decay: 0.
+
+  lr: 0.0001
+  betas: [0.9, 0.999]
+  weight_decay: 0.0001
+
+
+train_dataloader: 
+  dataset: 
+    transforms:
+      policy:
+        epoch: 33
+  collate_fn:
+    stop_epoch: 33
diff --git a/rtdetrv2_pytorch/configs/runtime.yml b/rtdetrv2_pytorch/configs/runtime.yml
new file mode 100644
index 0000000..4217b9e
--- /dev/null
+++ b/rtdetrv2_pytorch/configs/runtime.yml
@@ -0,0 +1,21 @@
+
+print_freq: 100
+output_dir: './logs'
+checkpoint_freq: 1
+
+
+sync_bn: True
+find_unused_parameters: False
+
+
+use_amp: False
+scaler:
+  type: GradScaler
+  enabled: True
+
+
+use_ema: False
+ema:
+  type: ModelEMA
+  decay: 0.9999
+  warmups: 2000
diff --git a/rtdetrv2_pytorch/docker-compose.yml b/rtdetrv2_pytorch/docker-compose.yml
new file mode 100644
index 0000000..7d07984
--- /dev/null
+++ b/rtdetrv2_pytorch/docker-compose.yml
@@ -0,0 +1,23 @@
+services:
+  tensorrt-container:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: rtdetr-v2:25.06
+    container_name: rtdetr-v2-trt
+    ports:
+      - "6006:6006" # tensorboard
+    volumes:
+      - ./:/workspace
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    working_dir: /workspace
+    restart: unless-stopped
+    stdin_open: true
+    tty: true
+    command: bash
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/references/deploy/readme.md b/rtdetrv2_pytorch/references/deploy/readme.md
new file mode 100644
index 0000000..ed3c9a4
--- /dev/null
+++ b/rtdetrv2_pytorch/references/deploy/readme.md
@@ -0,0 +1,2 @@
+# Deployment
+
diff --git a/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py b/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py
new file mode 100644
index 0000000..0f94dd2
--- /dev/null
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py
@@ -0,0 +1,61 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torchvision.transforms as T
+
+import numpy as np 
+import onnxruntime as ort 
+from PIL import Image, ImageDraw
+
+
+def draw(images, labels, boxes, scores, thrh = 0.6):
+    """Draw boxes scoring above `thrh` with their class ids; save results_{i}.jpg per image."""
+    for i, im in enumerate(images):
+        draw = ImageDraw.Draw(im)
+        scr = scores[i]
+        lab = labels[i][scr > thrh]
+        box = boxes[i][scr > thrh]
+
+        # Index labels by detection (j); the image index i mislabels boxes or raises IndexError.
+        for j, b in enumerate(box):
+            draw.rectangle(list(b), outline='red',)
+            draw.text((b[0], b[1]), text=str(lab[j].item()), fill='blue', )
+        im.save(f'results_{i}.jpg')
+
+
+def main(args, ):
+    """Run the ONNX model at args.onnx_file on the image at args.im_file and draw the detections.
+    """
+    sess = ort.InferenceSession(args.onnx_file)
+    print(ort.get_device())
+    # Load the image and record its original (width, height) as a batch of one.
+    im_pil = Image.open(args.im_file).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None]
+    # Resize to 640x640 and convert to a tensor; [None] adds the leading batch axis.
+    transforms = T.Compose([
+        T.Resize((640, 640)),
+        T.ToTensor(),
+    ])
+    im_data = transforms(im_pil)[None]
+
+    output = sess.run(
+        # output_names=['labels', 'boxes', 'scores'],
+        output_names=None,
+        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": orig_size.data.numpy()}
+    )
+    # Assumes the model emits exactly three outputs in this order -- TODO confirm against the export.
+    labels, boxes, scores = output
+
+    draw([im_pil], labels, boxes, scores)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--onnx-file', type=str, required=True, help='path to the exported ONNX model')
+    parser.add_argument('--im-file', type=str, required=True, help='path to the input image')
+    # Both paths are required: main() passes them straight to onnxruntime and PIL.
+    args = parser.parse_args()
+    main(args)
diff --git a/rtdetrv2_pytorch/references/deploy/rtdetrv2_openvino.py b/rtdetrv2_pytorch/references/deploy/rtdetrv2_openvino.py
new file mode 100644
index 0000000..bc8a7d7
--- /dev/null
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_openvino.py
@@ -0,0 +1,5 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+# For OpenVINO deployment, please refer to: https://github.com/guojin-yan/RT-DETR-OpenVINO
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py b/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py
new file mode 100644
index 0000000..8bf2a56
--- /dev/null
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py
@@ -0,0 +1,258 @@
+# Copyright 2023 lyuwenyu. All Rights Reserved.
+# Copyright (c) 2025 Hitbee-dev. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+# NOTICE: This file has been heavily modified by [Hitbee-dev] from the original source.
+# Modifications include restructuring for broader GPU architecture compatibility
+# (including NVIDIA Blackwell), improved modularity, and enhanced testability.
+# ==============================================================================
+
+import time
+import numpy as np
+import torch
+import tensorrt as trt
+from collections import OrderedDict
+from PIL import Image, ImageDraw, ImageFont
+
+class TRTInference(object):
+    """
+    A high-level wrapper for TensorRT inference, designed for ease of use and flexibility.
+    This class handles engine loading, context creation, and dynamic buffer allocation.
+    """
+    def __init__(self, engine_path, device='cuda:0', verbose=False):
+        """
+        Initializes the TRTInference instance.
+
+        Args:
+            engine_path (str): Path to the serialized TensorRT engine file.
+            device (str): The device to run inference on (e.g., 'cuda:0').
+            verbose (bool): If True, enables verbose logging from the TensorRT logger.
+        """
+        self.engine_path = engine_path
+        self.device = torch.device(device)
+        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)
+
+        trt.init_libnvinfer_plugins(self.logger, '')
+        self.runtime = trt.Runtime(self.logger)
+        self.engine = self._load_engine(engine_path)
+        self.context = self.engine.create_execution_context()
+
+        self.input_names, self.output_names = self._get_io_names()
+        self.buffers_allocated = False
+        self.gpu_buffers = OrderedDict()
+        self.binding_addrs = OrderedDict()
+
+        print(f"[TRTInference] Initialized successfully. Engine: '{engine_path}'.")
+
+    def _load_engine(self, path):
+        """Loads a TensorRT engine from a file; raises RuntimeError if deserialization fails."""
+        with open(path, 'rb') as f:
+            engine = self.runtime.deserialize_cuda_engine(f.read())
+        if engine is None:
+            raise RuntimeError(f"Failed to load TensorRT engine from '{path}'.")
+        return engine
+
+    def _get_io_names(self):
+        """Splits the engine's I/O tensor names into (input_names, output_names), in engine order."""
+        input_names, output_names = [], []
+        for i in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(i)
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
+                input_names.append(name)
+            else:
+                output_names.append(name)
+        return input_names, output_names
+
+    def _allocate_buffers(self, blob: dict):
+        """
+        Allocates GPU buffers for all inputs and outputs, sized for the input shapes in `blob`.
+        Runs on the first inference and again whenever __call__ sees an input shape change.
+        """
+        print("[TRTInference] Allocating GPU buffers for the current input shapes...")
+        for name in self.input_names:
+            tensor = blob[name]
+            shape = tuple(tensor.shape)
+            dtype = tensor.dtype
+            self.context.set_input_shape(name, shape)
+            self.gpu_buffers[name] = torch.empty(shape, dtype=dtype, device=self.device)
+            self.binding_addrs[name] = self.gpu_buffers[name].data_ptr()
+            print(f"  - Input '{name}': allocated buffer with shape {shape}.")
+        # Output shapes are queried only after every input shape has been set on the context.
+        for name in self.output_names:
+            shape = tuple(self.context.get_tensor_shape(name))
+            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            torch_dtype = torch.from_numpy(np.array(0, dtype=dtype)).dtype  # numpy dtype -> torch dtype
+            self.gpu_buffers[name] = torch.empty(shape, dtype=torch_dtype, device=self.device)
+            self.binding_addrs[name] = self.gpu_buffers[name].data_ptr()
+            print(f"  - Output '{name}': allocated buffer with shape {shape}.")
+
+        self.buffers_allocated = True
+        print("[TRTInference] GPU buffers allocated successfully.")
+
+    def __call__(self, blob: dict):
+        """
+        Executes inference, (re)allocating buffers whenever an input shape changes.
+
+        Args:
+            blob (dict): Maps each input tensor name to its torch.Tensor.
+        Returns:
+            dict: Output name -> GPU torch.Tensor (wrapper-owned buffer, overwritten by the next call).
+        Raises:
+            RuntimeError: If TensorRT reports that execution failed.
+        """
+        # copy_ broadcasts or raises on a shape mismatch, so buffers of a stale shape are rebuilt.
+        if not self.buffers_allocated or any(
+                tuple(blob[n].shape) != tuple(self.gpu_buffers[n].shape) for n in self.input_names):
+            self._allocate_buffers(blob)
+        for name in self.input_names:
+            self.gpu_buffers[name].copy_(blob[name])
+        # execute_v2 indexes bindings by engine I/O tensor position, not by dict insertion order.
+        io_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
+        if not self.context.execute_v2(bindings=[self.binding_addrs[n] for n in io_names]):
+            raise RuntimeError("TensorRT execution failed.")
+        return {name: self.gpu_buffers[name] for name in self.output_names}
+
+# COCO class names (80 entries); visualize_detections looks a name up by integer label id (list index).
+COCO_CLASSES = [
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
+    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
+    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
+    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
+    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
+    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+]
+
+def visualize_detections(image_pil, boxes, scores, labels, class_names=COCO_CLASSES, threshold=0.5):
+    """
+    Draws labeled bounding boxes on a copy of a PIL image. This function is a general-purpose utility.
+
+    Args:
+        image_pil (PIL.Image.Image): The image to draw on (left unmodified; a copy is drawn on).
+        boxes (torch.Tensor): A tensor of bounding boxes (shape: [N, 4]) as xmin, ymin, xmax, ymax.
+        scores (torch.Tensor): A tensor of confidence scores (shape: [N]).
+        labels (torch.Tensor): A tensor of class labels (shape: [N]).
+        class_names (list): Class names indexed by label id; unknown ids are shown as 'CLS-<id>'.
+        threshold (float): Detections scoring below this value are skipped.
+
+    Returns:
+        PIL.Image.Image: A copy of `image_pil` with the detections drawn on it.
+    """
+    img_draw = image_pil.copy()
+    draw = ImageDraw.Draw(img_draw)
+    color = 'red' # Keep it simple or use a color map
+
+    # Load the font once: it is loop-invariant, and truetype() opens the font file on every call.
+    try:
+        font = ImageFont.truetype("arial.ttf", 20)
+    except IOError:
+        font = ImageFont.load_default()
+
+    # Ensure tensors are on CPU and converted to NumPy for processing
+    boxes = boxes.cpu().numpy()
+    scores = scores.cpu().numpy()
+    labels = labels.cpu().numpy()
+
+    count = 0
+    for box, score, label in zip(boxes, scores, labels):
+        if score < threshold:
+            continue
+
+        count += 1
+        label_idx = int(label)
+        xmin, ymin, xmax, ymax = box
+        # Guard both ends: a negative id would otherwise silently index from the end of the list.
+        known = 0 <= label_idx < len(class_names)
+        class_name = class_names[label_idx] if known else f'CLS-{label_idx}'
+
+        draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=color, width=3)
+
+        text = f"{class_name}: {score:.2f}"
+        # A filled background behind the caption keeps the white text readable.
+        text_bbox = draw.textbbox((xmin, ymin), text, font=font)
+        draw.rectangle(text_bbox, fill=color)
+        draw.text((xmin, ymin), text, fill="white", font=font)
+
+    print(f"   - Found {count} objects above threshold {threshold}.")
+    return img_draw
+
+if __name__ == '__main__':
+    import argparse
+    import torchvision.transforms as T
+    import os
+
+    parser = argparse.ArgumentParser(description="Test script for the TRTInference wrapper.")
+    parser.add_argument('--engine', type=str, required=True, help="Path to the TensorRT engine file.")
+    parser.add_argument('--image', type=str, required=True, help="Path to the input image file.")
+    parser.add_argument('--output', type=str, default='output.jpg', help="Path to save the output image with detections.")
+    parser.add_argument('--device', type=str, default='cuda:0', help="Device to run inference on.")
+    parser.add_argument('--threshold', type=float, default=0.5, help="Confidence threshold for displaying detections.")
+    args = parser.parse_args()
+    # Fail fast with a clear message when no CUDA device is available.
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA is not available. This script requires a GPU.")
+
+    print("--- TRTInference Wrapper Test ---")
+
+    print("\n1. Initializing TRTInference...")
+    trt_model = TRTInference(args.engine, device=args.device)
+
+    print("\n2. Preprocessing input image...")
+    image_pil = Image.open(args.image).convert('RGB')
+    w, h = image_pil.size
+
+    transforms = T.Compose([
+        T.Resize((640, 640)),
+        T.ToTensor(),
+    ])
+    # Build a batch of one: the resized image plus the original (width, height).
+    image_tensor = transforms(image_pil).unsqueeze(0).to(args.device)
+    orig_size_tensor = torch.tensor([[w, h]], dtype=torch.int64, device=args.device)
+    # The keys must match the engine's input tensor names; TRTInference looks inputs up by name.
+    blob = {
+        'images': image_tensor,
+        'orig_target_sizes': orig_size_tensor
+    }
+    print(f"   - Original image size: {w}x{h}")
+    print(f"   - Input tensor shape: {image_tensor.shape}")
+    # NOTE: this single timed call includes the one-off buffer allocation, so it is not steady-state latency.
+    print("\n3. Running inference...")
+    start_time = time.time()
+    output_gpu = trt_model(blob)
+    torch.cuda.synchronize()
+    end_time = time.time()
+
+    print(f"\n4. Inference complete in { (end_time - start_time) * 1000:.2f} ms.")
+
+    print("\n5. Post-processing and saving output image...")
+    output_labels = output_gpu['labels'][0]
+    output_boxes = output_gpu['boxes'][0]
+    output_scores = output_gpu['scores'][0]
+
+    # Index [0] above selects the only image in the batch; its detections are drawn on a copy.
+    result_image = visualize_detections(
+        image_pil, 
+        output_boxes, 
+        output_scores, 
+        output_labels, 
+        threshold=args.threshold
+    )
+
+    result_image.save(args.output)
+    print(f"   - Output image with detections saved to: {os.path.abspath(args.output)}")
+
+    print("\n--- Test finished successfully ---")
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/references/deploy/rtdetrv2_torch.py b/rtdetrv2_pytorch/references/deploy/rtdetrv2_torch.py
new file mode 100644
index 0000000..3748530
--- /dev/null
+++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_torch.py
@@ -0,0 +1,84 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.nn as nn 
+import torchvision.transforms as T
+
+import numpy as np 
+from PIL import Image, ImageDraw
+
+from src.core import YAMLConfig
+
+
+def draw(images, labels, boxes, scores, thrh = 0.6):
+    """Draw every detection scoring above `thrh` and save each image as results_{i}.jpg."""
+    for i, im in enumerate(images):
+        canvas = ImageDraw.Draw(im)
+        keep = scores[i] > thrh
+        # Filter labels, boxes and scores with the same mask so they stay aligned.
+        kept = zip(labels[i][keep], boxes[i][keep], scores[i][keep])
+
+        for cls_id, b, conf in kept:
+            canvas.rectangle(list(b), outline='red',)
+            caption = f"{cls_id.item()} {round(conf.item(),2)}"
+            canvas.text((b[0], b[1]), text=caption, fill='blue', )
+
+        im.save(f'results_{i}.jpg')
+
+
+def main(args, ):
+    """Load weights from args.resume, run the deploy-mode model on args.im_file and draw the detections.
+    """
+    cfg = YAMLConfig(args.config, resume=args.resume)
+    # NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu') 
+        if 'ema' in checkpoint:
+            state = checkpoint['ema']['module']
+        else:
+            state = checkpoint['model']
+    else:
+        raise AttributeError('Only support resume to load model.state_dict by now.')
+
+    # NOTE: load the train-mode state dict first; Model below then converts to deploy mode
+    cfg.model.load_state_dict(state)
+    # Bundle model and postprocessor so that one forward call returns the final detections.
+    class Model(nn.Module):
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+
+        def forward(self, images, orig_target_sizes):
+            outputs = self.model(images)
+            outputs = self.postprocessor(outputs, orig_target_sizes)
+            return outputs
+
+    model = Model().to(args.device)
+
+    im_pil = Image.open(args.im_file).convert('RGB')
+    w, h = im_pil.size
+    orig_size = torch.tensor([w, h])[None].to(args.device)
+    # Resize to 640x640 and convert to a tensor; [None] adds the leading batch axis.
+    transforms = T.Compose([
+        T.Resize((640, 640)),
+        T.ToTensor(),
+    ])
+    im_data = transforms(im_pil)[None].to(args.device)
+    # NOTE(review): runs without torch.no_grad(); presumably deploy() switches to eval mode -- confirm.
+    output = model(im_data, orig_size)
+    labels, boxes, scores = output
+
+    draw([im_pil], labels, boxes, scores)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()  # the three paths are required: main() cannot run without them
+    parser.add_argument('-c', '--config', type=str, required=True, help='path to the YAML config')
+    parser.add_argument('-r', '--resume', type=str, required=True, help='path to the checkpoint to load')
+    parser.add_argument('-f', '--im-file', type=str, required=True, help='path to the input image')
+    parser.add_argument('-d', '--device', type=str, default='cpu')
+    args = parser.parse_args()
+    main(args)
diff --git a/rtdetrv2_pytorch/requirements.txt b/rtdetrv2_pytorch/requirements.txt
new file mode 100644
index 0000000..24b7c2d
--- /dev/null
+++ b/rtdetrv2_pytorch/requirements.txt
@@ -0,0 +1,9 @@
+torch>=2.0.1
+torchvision>=0.15.2
+faster-coco-eval>=1.6.6
+PyYAML
+tensorboard
+scipy
+pycocotools
+onnx
+onnxruntime-gpu
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/__init__.py b/rtdetrv2_pytorch/src/__init__.py
new file mode 100644
index 0000000..5901b01
--- /dev/null
+++ b/rtdetrv2_pytorch/src/__init__.py
@@ -0,0 +1,8 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+# imported for their side effect: each subpackage registers its components on import
+from . import optim
+from . import data 
+from . import nn
+from . import zoo
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/core/__init__.py b/rtdetrv2_pytorch/src/core/__init__.py
new file mode 100644
index 0000000..f9ca39f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/core/__init__.py
@@ -0,0 +1,7 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from .workspace import GLOBAL_CONFIG, register, create
+from .yaml_utils import *
+from ._config import BaseConfig
+from .yaml_config import YAMLConfig
diff --git a/rtdetrv2_pytorch/src/core/_config.py b/rtdetrv2_pytorch/src/core/_config.py
new file mode 100644
index 0000000..0bc5aeb
--- /dev/null
+++ b/rtdetrv2_pytorch/src/core/_config.py
@@ -0,0 +1,290 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LRScheduler
+from torch.cuda.amp.grad_scaler import GradScaler
+from torch.utils.tensorboard import SummaryWriter
+
+from pathlib import Path 
+from typing import Callable, List, Dict
+
+
+__all__ = ['BaseConfig', ]
+
+
+class BaseConfig(object):
+    # Holds the objects and settings of one run behind validated properties. TODO property
+
+    def __init__(self) -> None:
+        super().__init__()
+
+        self.task :str = None 
+
+        # objects that are assigned or lazily built; exposed through the properties below
+        self._model :nn.Module = None 
+        self._postprocessor :nn.Module = None 
+        self._criterion :nn.Module = None 
+        self._optimizer :Optimizer = None 
+        self._lr_scheduler :LRScheduler = None 
+        self._lr_warmup_scheduler: LRScheduler = None 
+        self._train_dataloader :DataLoader = None 
+        self._val_dataloader :DataLoader = None 
+        self._ema :nn.Module = None 
+        self._scaler :GradScaler = None 
+        self._train_dataset :Dataset = None 
+        self._val_dataset :Dataset = None
+        self._collate_fn :Callable = None
+        self._evaluator :Callable[[nn.Module, DataLoader, str], ] = None
+        self._writer: SummaryWriter = None
+        
+        # dataset / dataloader settings (None means "fall back to the default in the property")
+        self.num_workers :int = 0
+        self.batch_size :int = None
+        self._train_batch_size :int = None
+        self._val_batch_size :int = None
+        self._train_shuffle: bool = None  
+        self._val_shuffle: bool = None 
+
+        # runtime settings
+        self.resume :str = None
+        self.tuning :str = None 
+
+        self.epoches :int = None
+        self.last_epoch :int = -1
+
+        self.use_amp :bool = False 
+        self.use_ema :bool = False 
+        self.ema_decay :float = 0.9999
+        self.ema_warmups: int = 2000
+        self.sync_bn :bool = False 
+        self.clip_max_norm : float = 0.
+        self.find_unused_parameters :bool = None
+
+        self.seed :int = None
+        self.print_freq :int = None 
+        self.checkpoint_freq :int = 1
+        self.output_dir :str = None
+        self.summary_dir :str = None
+        self.device : str = ''
+
+    @property
+    def model(self, ) -> nn.Module:
+        return self._model 
+    
+    @model.setter
+    def model(self, m):
+        assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class'
+        self._model = m 
+
+    @property
+    def postprocessor(self, ) -> nn.Module:
+        return self._postprocessor
+    
+    @postprocessor.setter
+    def postprocessor(self, m):
+        assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class'
+        self._postprocessor = m 
+
+    @property
+    def criterion(self, ) -> nn.Module:
+        return self._criterion
+    
+    @criterion.setter
+    def criterion(self, m):
+        assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class'
+        self._criterion = m 
+
+    @property
+    def optimizer(self, ) -> Optimizer:
+        return self._optimizer
+    
+    @optimizer.setter
+    def optimizer(self, m):
+        assert isinstance(m, Optimizer), f'{type(m)} != optim.Optimizer, please check your model class'
+        self._optimizer = m 
+
+    @property
+    def lr_scheduler(self, ) -> LRScheduler:
+        return self._lr_scheduler
+    
+    @lr_scheduler.setter
+    def lr_scheduler(self, m):
+        assert isinstance(m, LRScheduler), f'{type(m)} != LRScheduler, please check your model class'
+        self._lr_scheduler = m 
+
+    @property
+    def lr_warmup_scheduler(self, ) -> LRScheduler:
+        return self._lr_warmup_scheduler
+
+    @lr_warmup_scheduler.setter
+    def lr_warmup_scheduler(self, m):
+        self._lr_warmup_scheduler = m  # NOTE: unlike the other setters, no type check here
+
+    @property
+    def train_dataloader(self) -> DataLoader:
+        if self._train_dataloader is None and self.train_dataset is not None:  # build once, on first access
+            loader = DataLoader(self.train_dataset, 
+                                batch_size=self.train_batch_size, 
+                                num_workers=self.num_workers, 
+                                collate_fn=self.collate_fn,
+                                shuffle=self.train_shuffle, )
+            loader.shuffle = self.train_shuffle  # remember the flag on the loader object itself
+            self._train_dataloader = loader
+
+        return self._train_dataloader
+
+    @train_dataloader.setter
+    def train_dataloader(self, loader):
+        self._train_dataloader = loader 
+
+    @property
+    def val_dataloader(self) -> DataLoader:
+        if self._val_dataloader is None and self.val_dataset is not None:  # build once, on first access
+            loader = DataLoader(self.val_dataset, 
+                                batch_size=self.val_batch_size, 
+                                num_workers=self.num_workers, 
+                                drop_last=False,
+                                collate_fn=self.collate_fn, 
+                                shuffle=self.val_shuffle)
+            loader.shuffle = self.val_shuffle  # remember the flag on the loader object itself
+            self._val_dataloader = loader
+
+        return self._val_dataloader
+    
+    @val_dataloader.setter
+    def val_dataloader(self, loader):
+        self._val_dataloader = loader 
+
+    @property
+    def ema(self, ) -> nn.Module:
+        if self._ema is None and self.use_ema and self.model is not None:  # only built when use_ema is set
+            from ..optim import ModelEMA
+            self._ema = ModelEMA(self.model, self.ema_decay, self.ema_warmups)
+        return self._ema
+
+    @ema.setter
+    def ema(self, obj):
+        self._ema = obj
+
+    @property
+    def scaler(self) -> GradScaler: 
+        if self._scaler is None and self.use_amp and torch.cuda.is_available():  # AMP scaler needs CUDA
+            self._scaler = GradScaler()
+        return self._scaler
+    
+    @scaler.setter
+    def scaler(self, obj: GradScaler):
+        self._scaler = obj
+
+    @property
+    def val_shuffle(self) -> bool:
+        if self._val_shuffle is None:  # unset: default to False (the warning prints on every access)
+            print('warning: set default val_shuffle=False')
+            return False
+        return self._val_shuffle
+
+    @val_shuffle.setter
+    def val_shuffle(self, shuffle):
+        assert isinstance(shuffle, bool), 'shuffle must be bool'
+        self._val_shuffle = shuffle
+
+    @property
+    def train_shuffle(self) -> bool:
+        if self._train_shuffle is None:  # unset: default to True (the warning prints on every access)
+            print('warning: set default train_shuffle=True')
+            return True
+        return self._train_shuffle
+
+    @train_shuffle.setter
+    def train_shuffle(self, shuffle):
+        assert isinstance(shuffle, bool), 'shuffle must be bool'
+        self._train_shuffle = shuffle
+
+
+    @property
+    def train_batch_size(self) -> int:
+        if self._train_batch_size is None and isinstance(self.batch_size, int):  # fall back to batch_size
+            print(f'warning: set train_batch_size=batch_size={self.batch_size}')
+            return self.batch_size
+        return self._train_batch_size
+
+    @train_batch_size.setter
+    def train_batch_size(self, batch_size):
+        assert isinstance(batch_size, int), 'batch_size must be int'
+        self._train_batch_size = batch_size
+
+    @property
+    def val_batch_size(self) -> int:
+        if self._val_batch_size is None:  # falls back to batch_size even when that is None as well
+            print(f'warning: set val_batch_size=batch_size={self.batch_size}')
+            return self.batch_size
+        return self._val_batch_size
+
+    @val_batch_size.setter
+    def val_batch_size(self, batch_size):
+        assert isinstance(batch_size, int), 'batch_size must be int'
+        self._val_batch_size = batch_size
+
+
+    @property
+    def train_dataset(self) -> Dataset:
+        return self._train_dataset
+
+    @train_dataset.setter
+    def train_dataset(self, dataset):
+        assert isinstance(dataset, Dataset), f'{type(dataset)} must be Dataset'
+        self._train_dataset = dataset
+
+
+    @property
+    def val_dataset(self) -> Dataset:
+        return self._val_dataset
+
+    @val_dataset.setter
+    def val_dataset(self, dataset):
+        assert isinstance(dataset, Dataset), f'{type(dataset)} must be Dataset'
+        self._val_dataset = dataset
+
+    @property
+    def collate_fn(self) -> Callable:
+        return self._collate_fn
+
+    @collate_fn.setter
+    def collate_fn(self, fn):
+        assert isinstance(fn, Callable), f'{type(fn)} must be Callable'
+        self._collate_fn = fn
+
+    @property
+    def evaluator(self) -> Callable:
+        return self._evaluator
+
+    @evaluator.setter
+    def evaluator(self, fn):
+        assert isinstance(fn, Callable), f'{type(fn)} must be Callable'
+        self._evaluator = fn
+
+    @property
+    def writer(self) -> SummaryWriter:
+        if self._writer is None:  # built lazily; stays None when neither directory is configured
+            if self.summary_dir:
+                self._writer = SummaryWriter(self.summary_dir)
+            elif self.output_dir:
+                self._writer = SummaryWriter(Path(self.output_dir) / 'summary')
+        return self._writer
+    
+    @writer.setter
+    def writer(self, m):
+        assert isinstance(m, SummaryWriter), f'{type(m)} must be SummaryWriter'
+        self._writer = m
+
+    def __repr__(self, ):
+        s = ''
+        for k, v in self.__dict__.items():
+            if not k.startswith('_'):  # only public settings; the `_xxx` object slots are skipped
+                s +=  f'{k}: {v}\n'
+        return s 
+
diff --git a/rtdetrv2_pytorch/src/core/workspace.py b/rtdetrv2_pytorch/src/core/workspace.py
new file mode 100644
index 0000000..e9b3c12
--- /dev/null
+++ b/rtdetrv2_pytorch/src/core/workspace.py
@@ -0,0 +1,179 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import inspect
+import importlib
+import functools
+import inspect
+from collections import defaultdict
+from typing import Any, Dict, Optional, List
+
+
+GLOBAL_CONFIG = defaultdict(dict)
+
+
+def register(dct :Any=GLOBAL_CONFIG, name=None, force=False):
+    """Decorator factory that registers a function or a class under a name.
+        dct: a dict (the entry is stored as a key-value pair) or a class
+            (a decorated function is attached to it as an attribute).
+        name: key to register under, defaults to the decorated `__name__`;
+            it is used for the duplicate check and for functions as well.
+        force: when True, skip the already-registered check.
+    """
+    def decorator(foo):
+        register_name = foo.__name__ if name is None else name
+        if not force:
+            if inspect.isclass(dct):
+                assert not hasattr(dct, register_name), \
+                    f'module {dct.__name__} has {register_name}'
+            else:
+                assert register_name not in dct, \
+                f'{register_name} has been already registered'
+
+        if inspect.isfunction(foo):
+            @functools.wraps(foo)
+            def wrap_func(*args, **kwargs):
+                return foo(*args, **kwargs)
+            if isinstance(dct, dict):
+                dct[register_name] = wrap_func
+            elif inspect.isclass(dct):
+                setattr(dct, register_name, wrap_func)
+            else:
+                raise AttributeError('')
+            return wrap_func
+
+        elif inspect.isclass(foo):
+            dct[register_name] = extract_schema(foo)  # stores the schema dict, not the class itself
+
+        else:
+            raise ValueError(f'Do not support {type(foo)} register')
+
+        return foo
+
+    return decorator
+
+
+
+def extract_schema(module: type):
+    """Describe a class' `__init__` as a plain dict (the registry schema).
+
+    The result holds the bookkeeping keys `_name`, `_pymodule`, `_inject`,
+    `_share` and `_kwargs`, plus one key per positional `__init__` argument
+    mapped to its default value (None when the argument has no default).
+    """
+    spec = inspect.getfullargspec(module.__init__)
+    params = [p for p in spec.args if p != 'self']
+    # defaults line up with the tail of the positional parameters
+    defaults = spec.defaults if spec.defaults is not None else ()
+    first_optional = len(params) - len(defaults)
+
+    shared = getattr(module, '__share__', [])
+    schema = {
+        '_name': module.__name__,
+        '_pymodule': importlib.import_module(module.__module__),
+        '_inject': getattr(module, '__inject__', []),
+        '_share': shared,
+        '_kwargs': {},
+    }
+
+    for idx, param in enumerate(params):
+        has_default = idx >= first_optional
+        if param in shared:
+            # shared parameters are required to declare a default
+            assert has_default, 'share config must have default value.'
+
+        default = defaults[idx - first_optional] if (has_default or param in shared) else None
+        schema[param] = default
+        schema['_kwargs'][param] = default
+
+    return schema
+
+
+def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs):
+    """Instantiate the entry registered as `type_or_name`, filling shared and injected arguments from `global_cfg`.
+    """
+    assert type(type_or_name) in (type, str), 'create should be modules or name.'
+
+    name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__
+
+    if name in global_cfg:
+        if hasattr(global_cfg[name], '__dict__'):  # entry is already an object, not a schema dict: return it as is
+            return global_cfg[name]
+    else:
+        raise ValueError('The module {} is not registered'.format(name))
+
+    cfg = global_cfg[name]
+
+    if isinstance(cfg, dict) and 'type' in cfg:  # config entry of the form {'type': <registered name>, ...}
+        _cfg: dict = global_cfg[cfg['type']]
+        # reset the registered schema: drop every public key left over from a previous call
+        _keys = [k for k in _cfg.keys() if not k.startswith('_')]
+        for _arg in _keys:
+            del _cfg[_arg]
+        _cfg.update(_cfg['_kwargs']) # restore default args
+        _cfg.update(cfg) # load config args 
+        _cfg.update(kwargs) # TODO receive extra kwargs; NOTE(review): kwargs are only applied in this branch
+        name = _cfg.pop('type') # pop extra key `type` (from cfg)
+        
+        return create(name, global_cfg)
+    
+    module = getattr(cfg['_pymodule'], name)    
+    module_kwargs = {}
+    module_kwargs.update(cfg)
+    
+    # shared variables: a same-named top-level entry in global_cfg wins over the schema value
+    for k in cfg['_share']:
+        if k in global_cfg:
+            module_kwargs[k] = global_cfg[k]
+        else:
+            module_kwargs[k] = cfg[k]
+
+    # injected arguments: built recursively from a registered name or from a {'type': ...} dict
+    for k in cfg['_inject']:
+        _k = cfg[k]
+
+        if _k is None:
+            continue
+
+        if isinstance(_k, str):            
+            if _k not in global_cfg:
+                raise ValueError(f'Missing inject config of {_k}.')
+
+            _cfg = global_cfg[_k]
+            
+            if isinstance(_cfg, dict):
+                module_kwargs[k] = create(_cfg['_name'], global_cfg)
+            else:
+                module_kwargs[k] = _cfg 
+
+        elif isinstance(_k, dict):
+            if 'type' not in _k.keys():
+                raise ValueError(f'Missing inject for `type` style.')
+
+            _type = str(_k['type'])
+            if _type not in global_cfg:
+                raise ValueError(f'Missing {_type} in inspect stage.')
+
+            # TODO NOTE(review): the registered schema dict is edited in place below, as in the `type` branch above
+            _cfg: dict = global_cfg[_type]
+            # reset the schema: drop every public key left over from a previous call
+            _keys = [k for k in _cfg.keys() if not k.startswith('_')]
+            for _arg in _keys:
+                del _cfg[_arg]
+            _cfg.update(_cfg['_kwargs']) # restore default values
+            _cfg.update(_k) # load config args
+            name = _cfg.pop('type') # pop extra key (`type` from _k)
+            module_kwargs[k] = create(name, global_cfg)
+
+        else:
+            raise ValueError(f'Inject does not support {_k}')
+    
+    # TODO hard code: bookkeeping keys (leading underscore) are not constructor arguments
+    module_kwargs = {k: v for k, v in module_kwargs.items() if not k.startswith('_')}
+
+    # TODO for **kwargs
+    # extra_args = set(module_kwargs.keys()) - set(arg_names)
+    # if len(extra_args) > 0:
+    #     raise RuntimeError(f'Error: unknown args {extra_args} for {module}')
+
+    return module(**module_kwargs)
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/core/yaml_config.py b/rtdetrv2_pytorch/src/core/yaml_config.py
new file mode 100644
index 0000000..3b6a46e
--- /dev/null
+++ b/rtdetrv2_pytorch/src/core/yaml_config.py
@@ -0,0 +1,172 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+import re 
+import copy
+
+from ._config import BaseConfig
+from .workspace import create
+from .yaml_utils import load_config, merge_config, merge_dict
+
+class YAMLConfig(BaseConfig):
+    def __init__(self, cfg_path: str, **kwargs) -> None:
+        super().__init__()
+
+        cfg = load_config(cfg_path)
+        cfg = merge_dict(cfg, kwargs)  # keyword overrides win over the YAML file
+
+        self.yaml_cfg = copy.deepcopy(cfg) 
+        
+        for k in super().__dict__:  # public attributes set so far are overridden by same-named YAML keys
+            if not k.startswith('_') and k in cfg:
+                self.__dict__[k] = cfg[k]
+
+    @property
+    def global_cfg(self, ):
+        return merge_config(self.yaml_cfg, inplace=False, overwrite=False)  # a fresh merged copy on every access
+    
+    @property
+    def model(self, ) -> torch.nn.Module:
+        if self._model is None and 'model' in self.yaml_cfg:  # built once, on first access
+            self._model = create(self.yaml_cfg['model'], self.global_cfg)
+        return super().model 
+
+    @property
+    def postprocessor(self, ) -> torch.nn.Module:
+        if self._postprocessor is None and 'postprocessor' in self.yaml_cfg:
+            self._postprocessor = create(self.yaml_cfg['postprocessor'], self.global_cfg)
+        return super().postprocessor
+
+    @property
+    def criterion(self, ) -> torch.nn.Module:
+        if self._criterion is None and 'criterion' in self.yaml_cfg:
+            self._criterion = create(self.yaml_cfg['criterion'], self.global_cfg)
+        return super().criterion
+    
+    @property
+    def optimizer(self, ) -> optim.Optimizer:
+        if self._optimizer is None and 'optimizer' in self.yaml_cfg:
+            params = self.get_optim_params(self.yaml_cfg['optimizer'], self.model)
+            self._optimizer = create('optimizer', self.global_cfg, params=params)
+        return super().optimizer
+    
+    @property
+    def lr_scheduler(self, ) -> optim.lr_scheduler.LRScheduler:
+        if self._lr_scheduler is None and 'lr_scheduler' in self.yaml_cfg:
+            self._lr_scheduler = create('lr_scheduler', self.global_cfg, optimizer=self.optimizer)
+            print(f'Initial lr: {self._lr_scheduler.get_last_lr()}')
+        return super().lr_scheduler
+    
+    @property
+    def lr_warmup_scheduler(self, ) -> optim.lr_scheduler.LRScheduler:
+        if self._lr_warmup_scheduler is None and 'lr_warmup_scheduler' in self.yaml_cfg :
+            self._lr_warmup_scheduler = create('lr_warmup_scheduler', self.global_cfg, lr_scheduler=self.lr_scheduler)
+        return super().lr_warmup_scheduler
+
+    @property
+    def train_dataloader(self, ) -> DataLoader:
+        if self._train_dataloader is None and 'train_dataloader' in self.yaml_cfg:
+            self._train_dataloader = self.build_dataloader('train_dataloader')
+        return super().train_dataloader
+
+    @property
+    def val_dataloader(self, ) -> DataLoader:
+        if self._val_dataloader is None and 'val_dataloader' in self.yaml_cfg:
+            self._val_dataloader = self.build_dataloader('val_dataloader')
+        return super().val_dataloader
+    
+    @property
+    def ema(self, ) -> torch.nn.Module:
+        if self._ema is None and self.yaml_cfg.get('use_ema', False):
+            self._ema = create('ema', self.global_cfg, model=self.model)
+        return super().ema
+    
+    @property
+    def scaler(self, ):
+        if self._scaler is None and self.yaml_cfg.get('use_amp', False):
+            self._scaler = create('scaler', self.global_cfg)
+        return super().scaler
+
+    @property
+    def evaluator(self, ):
+        if self._evaluator is None and 'evaluator' in self.yaml_cfg:
+            if self.yaml_cfg['evaluator']['type'] == 'CocoEvaluator':  # the only evaluator type handled here
+                from ..data import get_coco_api_from_dataset
+                base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset)                
+                self._evaluator = create('evaluator', self.global_cfg, coco_gt=base_ds)
+            else:
+                raise NotImplementedError(f"{self.yaml_cfg['evaluator']['type']}")
+        return super().evaluator
+
+    @staticmethod
+    def get_optim_params(cfg: dict, model: nn.Module):
+        """Build optimizer param groups; each group's `params` entry is a regex matched against parameter names.
+        E.g.:
+            ^(?=.*a)(?=.*b).*$  means including a and b
+            ^(?=.*(?:a|b)).*$   means including a or b
+            ^(?=.*a)(?!.*b).*$  means including a, but not b
+        """
+        assert 'type' in cfg, ''
+        cfg = copy.deepcopy(cfg)  # the group dicts below are modified, keep the caller's config intact
+
+        if 'params' not in cfg:
+            return model.parameters() 
+
+        assert isinstance(cfg['params'], list), ''
+
+        param_groups = []
+        visited = []
+        for pg in cfg['params']:
+            pattern = pg['params']
+            params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0}
+            pg['params'] = params.values()
+            param_groups.append(pg)
+            visited.extend(list(params.keys()))
+            # print(params.keys())
+
+        names = [k for k, v in model.named_parameters() if v.requires_grad]
+
+        if len(visited) < len(names):  # trainable parameters matched by no pattern go into one extra group
+            unseen = set(names) - set(visited)
+            params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen}
+            param_groups.append({'params': params.values()})
+            visited.extend(list(params.keys()))
+            # print(params.keys())
+
+        assert len(visited) == len(names), ''  # fails when a parameter was matched by more than one pattern
+
+        return param_groups
+
+    @staticmethod
+    def get_rank_batch_size(cfg):
+        """Return the per-rank batch size: `batch_size` as given, or `total_batch_size` divided by the world size.
+        """
+        assert ('total_batch_size' in cfg or 'batch_size' in cfg) \
+            and not ('total_batch_size' in cfg and 'batch_size' in cfg), \
+                '`batch_size` or `total_batch_size` should be choosed one'
+
+        total_batch_size = cfg.get('total_batch_size', None)
+        if total_batch_size is None:
+            bs = cfg.get('batch_size')
+        else:
+            from ..misc import dist_utils
+            assert total_batch_size % dist_utils.get_world_size() == 0, \
+                'total_batch_size should be divisible by world size'
+            bs = total_batch_size // dist_utils.get_world_size()
+        return bs 
+
+    def build_dataloader(self, name: str):
+        bs = self.get_rank_batch_size(self.yaml_cfg[name])
+        global_cfg = self.global_cfg
+        if 'total_batch_size' in global_cfg[name]:
+            # pop unexpected key for dataloader init
+            _ = global_cfg[name].pop('total_batch_size')
+        print(f'building {name} with batch_size={bs}...')
+        loader = create(name, global_cfg, batch_size=bs)
+        loader.shuffle = self.yaml_cfg[name].get('shuffle', False)      
+        return loader
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/core/yaml_utils.py b/rtdetrv2_pytorch/src/core/yaml_utils.py
new file mode 100644
index 0000000..d5732c3
--- /dev/null
+++ b/rtdetrv2_pytorch/src/core/yaml_utils.py
@@ -0,0 +1,124 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import os
+import copy
+import yaml 
+from typing import Any, Dict, Optional, List
+
+from .workspace import GLOBAL_CONFIG
+
+__all__ = [
+    'load_config', 
+    'merge_config', 
+    'merge_dict', 
+    'parse_cli',
+]
+
+
+INCLUDE_KEY = '__include__'
+
+
+def load_config(file_path, cfg=None):
+    """Load a YAML config, merging its `__include__` base files first.
+
+    `cfg` is the dict to merge into; when omitted a fresh dict is created per
+    call, so keys cannot leak between unrelated loads through a shared default.
+    """
+    cfg = {} if cfg is None else cfg
+    _, ext = os.path.splitext(file_path)
+    assert ext in ['.yml', '.yaml'], "only support yaml files"
+
+    with open(file_path) as f:
+        # NOTE: yaml.Loader can construct arbitrary objects; load trusted files only
+        file_cfg = yaml.load(f, Loader=yaml.Loader)
+        if file_cfg is None:
+            return {}
+
+    if INCLUDE_KEY in file_cfg:
+        for base_yaml in list(file_cfg[INCLUDE_KEY]):
+            if base_yaml.startswith('~'):
+                base_yaml = os.path.expanduser(base_yaml)
+            if not base_yaml.startswith('/'):
+                base_yaml = os.path.join(os.path.dirname(file_path), base_yaml)
+            load_config(base_yaml, cfg)  # merges the base file into `cfg` in place
+
+    return merge_dict(cfg, file_cfg)
+
+
+def merge_dict(dct, another_dct, inplace=True) -> Dict:
+    """Recursively merge `another_dct` into `dct`; with inplace=False a deep copy of `dct` is merged instead.
+    """
+    def _merge(target, source) -> Dict:
+        for key, incoming in source.items():
+            current = target.get(key)
+            if isinstance(current, dict) and isinstance(incoming, dict):
+                _merge(current, incoming)
+                continue
+            target[key] = incoming  # new keys and non-dict values simply overwrite
+        return target
+
+    base = dct if inplace else copy.deepcopy(dct)
+
+    # values from `another_dct` win on conflicts; nested dicts are merged key by key
+    return _merge(base, another_dct)
+
+
+def dictify(s: str, v: Any) -> Dict:
+    """'a.b.c', v -> {'a': {'b': {'c': v}}}; a key without dots maps straight to v."""
+    for key in reversed(s.split('.')):
+        v = {key: v}
+    return v
+
+
+def parse_cli(nargs: List[str]) -> Dict:
+    """Turn `key=value` command-line overrides into a nested dict.
+
+    E.g. `a.c=3 b=10` becomes `{'a': {'c': 3}, 'b': 10}`; each value is parsed as YAML.
+    """
+    overrides = {}
+    if not nargs:
+        return overrides
+
+    for item in nargs:
+        dotted_key, raw = item.strip().split('=', 1)
+        # NOTE: yaml.Loader can construct arbitrary objects; pass trusted arguments only
+        value = yaml.load(raw, Loader=yaml.Loader)
+        overrides = merge_dict(overrides, dictify(dotted_key, value))
+
+    return overrides
+
+
+
+def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: bool=False):
+    """
+    Merge another_cfg into cfg and return the merged config. Keys already in cfg
+    are kept unless `overwrite` is True; a deep copy is merged unless `inplace`.
+
+    Example:
+        cfg1 = load_config('./rtdetrv2_r18vd_6x_coco.yml')
+        cfg1 = merge_config(cfg1, inplace=True)
+
+        cfg2 = load_config('./rtdetr_r50vd_6x_coco.yml')
+        cfg2 = merge_config(cfg2, inplace=True)
+
+        model1 = create(cfg1['model'], cfg1)
+        model2 = create(cfg2['model'], cfg2)
+    """
+    def _merge(dct, another):
+        for k in another:
+            if k not in dct:
+                dct[k] = another[k]  # stored by reference, not copied
+            
+            elif isinstance(dct[k], dict) and isinstance(another[k], dict):
+                _merge(dct[k], another[k])   
+            
+            elif overwrite:
+                dct[k] = another[k]
+
+        return cfg  # the outer (possibly deep-copied) config, also for nested calls
+    
+    if not inplace:
+        cfg = copy.deepcopy(cfg)
+
+    return _merge(cfg, another_cfg)
diff --git a/rtdetrv2_pytorch/src/data/__init__.py b/rtdetrv2_pytorch/src/data/__init__.py
new file mode 100644
index 0000000..e42581e
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/__init__.py
@@ -0,0 +1,21 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from .dataset import *
+from .transforms import *
+from .dataloader import *
+
+from ._misc import convert_to_tv_tensor
+
+
+
+
+# def set_epoch(self, epoch) -> None:
+#     self.epoch = epoch 
+# def _set_epoch_func(datasets):
+#     """Add `set_epoch` for datasets
+#     """
+#     from ..core import register
+#     for ds in datasets:
+#         register(ds)(set_epoch)
+# _set_epoch_func([CIFAR10, VOCDetection, CocoDetection])
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/data/_misc.py b/rtdetrv2_pytorch/src/data/_misc.py
new file mode 100644
index 0000000..ae0e225
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/_misc.py
@@ -0,0 +1,55 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import importlib.metadata
+from torch import Tensor 
+
+import re
+# Compare numeric (major, minor, patch) tuples instead of raw strings, so that
+# builds tagged like '0.15.2+cu118' or '0.17.0a0' select the right branch.
+_tv_match = re.match(r'(\d+)\.(\d+)(?:\.(\d+))?', importlib.metadata.version('torchvision'))
+_tv_version = tuple(int(g or 0) for g in _tv_match.groups()) if _tv_match else (0, 0, 0)
+
+if _tv_version == (0, 15, 2):
+    import torchvision
+    torchvision.disable_beta_transforms_warning()
+
+    from torchvision.datapoints import BoundingBox as BoundingBoxes
+    from torchvision.datapoints import BoundingBoxFormat, Mask, Image, Video
+    from torchvision.transforms.v2 import SanitizeBoundingBox as SanitizeBoundingBoxes
+    _boxes_keys = ['format', 'spatial_size']
+
+elif _tv_version >= (0, 16):
+    import torchvision
+    if _tv_version < (0, 17):
+        torchvision.disable_beta_transforms_warning()  # as before, only called below 0.17
+
+    from torchvision.transforms.v2 import SanitizeBoundingBoxes
+    from torchvision.tv_tensors import (
+        BoundingBoxes, BoundingBoxFormat, Mask, Image, Video)
+    _boxes_keys = ['format', 'canvas_size']
+
+else:
+    raise RuntimeError('Please make sure torchvision version >= 0.15.2')
+
+
+
+def convert_to_tv_tensor(tensor: Tensor, key: str, box_format='xyxy', spatial_size=None) -> Tensor:
+    """Wrap a plain tensor in the torchvision tensor class selected by `key`.
+
+    Args:
+        tensor (Tensor): input tensor
+        key (str): 'boxes' -> BoundingBoxes, 'masks' -> Mask
+        box_format (str): BoundingBoxFormat member name, any case ('boxes' only)
+        spatial_size: passed on as the size keyword of BoundingBoxes ('boxes' only)
+    """
+    assert key in ('boxes', 'masks', ), "Only support 'boxes' and 'masks'"
+
+    if key == 'masks':
+        return Mask(tensor)
+
+    if key == 'boxes':
+        fmt = getattr(BoundingBoxFormat, box_format.upper())
+        # keyword names differ across torchvision versions, hence `_boxes_keys`
+        return BoundingBoxes(tensor, **dict(zip(_boxes_keys, (fmt, spatial_size))))
+
diff --git a/rtdetrv2_pytorch/src/data/dataloader.py b/rtdetrv2_pytorch/src/data/dataloader.py
new file mode 100644
index 0000000..d7f5302
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataloader.py
@@ -0,0 +1,107 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.utils.data as data
+import torch.nn.functional as F
+from torch.utils.data import default_collate
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+import torchvision.transforms.v2 as VT
+from torchvision.transforms.v2 import functional as VF, InterpolationMode
+
+import random
+from functools import partial
+
+from ..core import register
+
+
+__all__ = [
+    'DataLoader',
+    'BaseCollateFunction', 
+    'BatchImageCollateFunction',
+    'batch_image_collate_fn'
+]
+
+
+@register()
+class DataLoader(data.DataLoader):
+    # torch DataLoader that forwards the epoch to its dataset / collate_fn and remembers `shuffle`
+    __inject__ = ['dataset', 'collate_fn']
+
+    def __repr__(self) -> str:
+        fields = ('dataset', 'batch_size', 'num_workers', 'drop_last', 'collate_fn')
+        rows = ["    {0}: {1}".format(f, getattr(self, f)) for f in fields]
+        return "\n".join([self.__class__.__name__ + "(", *rows, ")"])
+
+    def set_epoch(self, epoch):
+        # keep dataset and collate_fn on the same epoch as the loader
+        self._epoch = epoch
+        self.dataset.set_epoch(epoch)
+        self.collate_fn.set_epoch(epoch)
+
+    @property
+    def epoch(self):
+        # -1 until set_epoch has been called
+        return getattr(self, '_epoch', -1)
+
+    @property
+    def shuffle(self):
+        return self._shuffle
+
+    @shuffle.setter
+    def shuffle(self, shuffle):
+        assert isinstance(shuffle, bool), 'shuffle must be a boolean'
+        self._shuffle = shuffle
+
+
+@register()
+def batch_image_collate_fn(items):
+    """Concatenate the images of (image, target) items into one batch; targets stay a list."""
+    images = torch.cat([item[0][None] for item in items], dim=0)
+    return images, [item[1] for item in items]
+
+
+class BaseCollateFunction(object):  # epoch-aware base for collate callables; subclasses implement __call__
+    def set_epoch(self, epoch):
+        self._epoch = epoch
+
+    @property
+    def epoch(self):
+        return getattr(self, '_epoch', -1)  # -1 until set_epoch has been called
+
+    def __call__(self, items):
+        raise NotImplementedError('')
+
+
+@register()
+class BatchImageCollateFunction(BaseCollateFunction):
+    # Batches images and, while multi-scale is active, resizes the whole batch to a randomly chosen scale.
+    def __init__(
+        self,
+        scales=None,
+        stop_epoch=None,
+    ) -> None:
+        super().__init__()
+        self.scales = scales
+        # no stop epoch given -> use a bound that is effectively never reached
+        self.stop_epoch = 100000000 if stop_epoch is None else stop_epoch
+
+    def __call__(self, items):
+        images = torch.cat([item[0][None] for item in items], dim=0)
+        targets = [item[1] for item in items]
+
+        multi_scale = self.scales is not None and self.epoch < self.stop_epoch
+        if not multi_scale:
+            return images, targets
+
+        size = random.choice(self.scales)
+        images = F.interpolate(images, size=size)
+        if 'masks' in targets[0]:
+            for target in targets:
+                target['masks'] = F.interpolate(target['masks'], size=size, mode='nearest')
+            raise NotImplementedError('')  # batches with mask targets are rejected here
+
+        return images, targets
+
diff --git a/rtdetrv2_pytorch/src/data/dataset/__init__.py b/rtdetrv2_pytorch/src/data/dataset/__init__.py
new file mode 100644
index 0000000..f4b85bb
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/__init__.py
@@ -0,0 +1,16 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+# from ._dataset import DetDataset
+from .cifar_dataset import CIFAR10
+from .coco_dataset import CocoDetection
+from .coco_dataset import (
+    CocoDetection, 
+    mscoco_category2name, 
+    mscoco_category2label,
+    mscoco_label2category,
+)
+from .coco_eval import CocoEvaluator
+from .coco_utils import get_coco_api_from_dataset
+from .voc_detection import VOCDetection
+from .voc_eval import VOCEvaluator
diff --git a/rtdetrv2_pytorch/src/data/dataset/_dataset.py b/rtdetrv2_pytorch/src/data/dataset/_dataset.py
new file mode 100644
index 0000000..c4448f3
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/_dataset.py
@@ -0,0 +1,22 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.utils.data as data
+
+class DetDataset(data.Dataset):
+    def __getitem__(self, index):
+        image, label = self.load_item(index)
+        if self.transforms is not None:
+            image, label, _ = self.transforms(image, label, self)  # third output is discarded
+        return image, label
+
+    def load_item(self, index):
+        raise NotImplementedError("Please implement this function to return item before `transforms`.")
+
+    def set_epoch(self, epoch) -> None:
+        self._epoch = epoch
+
+    @property
+    def epoch(self):
+        return getattr(self, '_epoch', -1)  # -1 => set_epoch() never called
diff --git a/rtdetrv2_pytorch/src/data/dataset/cifar_dataset.py b/rtdetrv2_pytorch/src/data/dataset/cifar_dataset.py
new file mode 100644
index 0000000..2fc05f7
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/cifar_dataset.py
@@ -0,0 +1,16 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torchvision
+from typing import Optional, Callable
+
+from ...core import register
+
+@register()
+class CIFAR10(torchvision.datasets.CIFAR10):
+    # Registered wrapper; `transform` / `target_transform` are listed for injection.
+    __inject__ = ['transform', 'target_transform']
+    def __init__(self, root: str, train: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False) -> None:
+        super().__init__(root, train=train, transform=transform, target_transform=target_transform, download=download)
+
diff --git a/rtdetrv2_pytorch/src/data/dataset/coco_dataset.py b/rtdetrv2_pytorch/src/data/dataset/coco_dataset.py
new file mode 100644
index 0000000..053fb13
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/coco_dataset.py
@@ -0,0 +1,261 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+from faster_coco_eval.utils.pytorch import FasterCocoDetection
+import torchvision
+
+from PIL import Image 
+from faster_coco_eval.core import mask as coco_mask
+
+from ._dataset import DetDataset
+from .._misc import convert_to_tv_tensor
+from ...core import register
+
+__all__ = ['CocoDetection']
+
+torchvision.disable_beta_transforms_warning()
+
+@register()
+class CocoDetection(FasterCocoDetection, DetDataset):  # COCO-format detection dataset yielding (image, target dict) samples
+    __inject__ = ['transforms', ]
+    __share__ = ['remap_mscoco_category']
+    
+    def __init__(self, img_folder, ann_file, transforms, return_masks=False, remap_mscoco_category=False):
+        super(FasterCocoDetection, self).__init__(img_folder, ann_file)  # NOTE(review): MRO lookup starts *after* FasterCocoDetection, so its own __init__ is bypassed -- confirm intended
+        self._transforms = transforms
+        self.prepare = ConvertCocoPolysToMask(return_masks)  # converts raw annotation lists into tensors
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self.return_masks = return_masks
+        self.remap_mscoco_category = remap_mscoco_category
+
+    def __getitem__(self, idx):
+        img, target = self.load_item(idx)
+        if self._transforms is not None:
+            img, target, _ = self._transforms(img, target, self)  # the dataset itself is the third input; the third output is discarded
+        return img, target
+
+    def load_item(self, idx):  # sample before `transforms`: raw image plus converted target
+        image, target = super(FasterCocoDetection, self).__getitem__(idx)  # NOTE(review): same MRO skip as in __init__
+        image_id = self.ids[idx]
+        target = {'image_id': image_id, 'annotations': target}
+
+        if self.remap_mscoco_category:
+            image, target = self.prepare(image, target, category2label=mscoco_category2label)  # module-level MS-COCO table, not this annotation file's own categories
+            # image, target = self.prepare(image, target, category2label=self.category2label)
+        else:
+            image, target = self.prepare(image, target)  # labels stay raw category ids
+
+        target['idx'] = torch.tensor([idx])  # dataset position, as opposed to the COCO image_id
+
+        if 'boxes' in target:
+            target['boxes'] = convert_to_tv_tensor(target['boxes'], key='boxes', spatial_size=image.size[::-1])  # image.size is unpacked as (w, h) in prepare, so the reversed value is (h, w)
+
+        if 'masks' in target:
+            target['masks'] = convert_to_tv_tensor(target['masks'], key='masks')
+        
+        return image, target
+
+    def extra_repr(self) -> str:
+        s = f' img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n'
+        s += f' return_masks: {self.return_masks}\n'
+        if hasattr(self, '_transforms') and self._transforms is not None:
+            s += f' transforms:\n   {repr(self._transforms)}'
+        if hasattr(self, '_preset') and self._preset is not None:  # `_preset` is never assigned in this class; only shown when set elsewhere
+            s += f' preset:\n   {repr(self._preset)}'
+        return s 
+
+    @property
+    def categories(self, ):  # category records exactly as stored in the loaded annotation file
+        return self.coco.dataset['categories']
+
+    @property
+    def category2name(self, ):  # category id -> category name
+        return {cat['id']: cat['name'] for cat in self.categories}
+
+    @property
+    def category2label(self, ):  # category id -> position of that category in `categories`
+        return {cat['id']: i for i, cat in enumerate(self.categories)}
+
+    @property
+    def label2category(self, ):  # inverse of category2label
+        return {i: cat['id'] for i, cat in enumerate(self.categories)}
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    """Decode each object's polygon list into one mask and stack the results."""
+    per_object = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        decoded = coco_mask.decode(rles)
+        if len(decoded.shape) < 3:
+            decoded = decoded[..., None]  # add the trailing axis when decode() omits it
+        # Union over the trailing axis merges all pieces belonging to one object.
+        per_object.append(torch.as_tensor(decoded, dtype=torch.uint8).any(dim=2))
+
+    if not per_object:
+        # No objects: return an empty stack with the requested spatial size.
+        return torch.zeros((0, height, width), dtype=torch.uint8)
+    return torch.stack(per_object, dim=0)
+
+
+class ConvertCocoPolysToMask(object):  # converts one image's raw COCO annotations into a dict of tensors
+    def __init__(self, return_masks=False):
+        self.return_masks = return_masks  # when False, segmentations are never decoded
+
+    def __call__(self, image: Image.Image, target, **kwargs):
+        w, h = image.size
+
+        image_id = target["image_id"]
+        image_id = torch.tensor([image_id])
+
+        anno = target["annotations"]
+
+        anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]  # drop crowd regions; a missing key counts as non-crowd
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]  # adds columns 0-1 onto columns 2-3 (xywh -> xyxy)
+        boxes[:, 0::2].clamp_(min=0, max=w)  # x coordinates limited to [0, w]
+        boxes[:, 1::2].clamp_(min=0, max=h)  # y coordinates limited to [0, h]
+
+        category2label = kwargs.get('category2label', None)  # optional category id -> label mapping
+        if category2label is not None:
+            labels = [category2label[obj["category_id"]] for obj in anno]
+        else:
+            labels = [obj["category_id"] for obj in anno]
+            
+        labels = torch.tensor(labels, dtype=torch.int64)
+
+        if self.return_masks:
+            segmentations = [obj["segmentation"] for obj in anno]
+            masks = convert_coco_poly_to_mask(segmentations, h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:  # only the first annotation is inspected
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.view(num_keypoints, -1, 3)  # one row of triples per object
+
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])  # boxes that still have positive height and width after clamping
+        boxes = boxes[keep]
+        labels = labels[keep]
+        if self.return_masks:
+            masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+
+        target = {}  # rebinds `target`; the input dict itself is not modified
+        target["boxes"] = boxes
+        target["labels"] = labels
+        if self.return_masks:
+            target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+
+        # for conversion to coco api
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
+        target["area"] = area[keep]  # filtered like boxes so per-object fields stay aligned
+        target["iscrowd"] = iscrowd[keep]
+
+        target["orig_size"] = torch.as_tensor([int(w), int(h)])  # stored as (width, height)
+        # target["size"] = torch.as_tensor([int(w), int(h)])
+    
+        return image, target
+
+
+mscoco_category2name = {
+    1: 'person',
+    2: 'bicycle',
+    3: 'car',
+    4: 'motorcycle',
+    5: 'airplane',
+    6: 'bus',
+    7: 'train',
+    8: 'truck',
+    9: 'boat',
+    10: 'traffic light',
+    11: 'fire hydrant',
+    13: 'stop sign',
+    14: 'parking meter',
+    15: 'bench',
+    16: 'bird',
+    17: 'cat',
+    18: 'dog',
+    19: 'horse',
+    20: 'sheep',
+    21: 'cow',
+    22: 'elephant',
+    23: 'bear',
+    24: 'zebra',
+    25: 'giraffe',
+    27: 'backpack',
+    28: 'umbrella',
+    31: 'handbag',
+    32: 'tie',
+    33: 'suitcase',
+    34: 'frisbee',
+    35: 'skis',
+    36: 'snowboard',
+    37: 'sports ball',
+    38: 'kite',
+    39: 'baseball bat',
+    40: 'baseball glove',
+    41: 'skateboard',
+    42: 'surfboard',
+    43: 'tennis racket',
+    44: 'bottle',
+    46: 'wine glass',
+    47: 'cup',
+    48: 'fork',
+    49: 'knife',
+    50: 'spoon',
+    51: 'bowl',
+    52: 'banana',
+    53: 'apple',
+    54: 'sandwich',
+    55: 'orange',
+    56: 'broccoli',
+    57: 'carrot',
+    58: 'hot dog',
+    59: 'pizza',
+    60: 'donut',
+    61: 'cake',
+    62: 'chair',
+    63: 'couch',
+    64: 'potted plant',
+    65: 'bed',
+    67: 'dining table',
+    70: 'toilet',
+    72: 'tv',
+    73: 'laptop',
+    74: 'mouse',
+    75: 'remote',
+    76: 'keyboard',
+    77: 'cell phone',
+    78: 'microwave',
+    79: 'oven',
+    80: 'toaster',
+    81: 'sink',
+    82: 'refrigerator',
+    84: 'book',
+    85: 'clock',
+    86: 'vase',
+    87: 'scissors',
+    88: 'teddy bear',
+    89: 'hair drier',
+    90: 'toothbrush'
+}
+
+mscoco_category2label = {cat_id: label for label, cat_id in enumerate(mscoco_category2name)}
+mscoco_label2category = {label: cat_id for cat_id, label in mscoco_category2label.items()}
diff --git a/rtdetrv2_pytorch/src/data/dataset/coco_eval.py b/rtdetrv2_pytorch/src/data/dataset/coco_eval.py
new file mode 100644
index 0000000..b50b287
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/coco_eval.py
@@ -0,0 +1,16 @@
+"""
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+COCO evaluator that works in distributed mode.
+Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
+The difference is that there is less copy-pasting from pycocotools
+in the end of the file, as python3 can suppress prints with contextlib
+
+# MiXaiLL76 replacing pycocotools with faster-coco-eval for better performance and support.
+"""
+
+from ...core import register
+from faster_coco_eval.utils.pytorch import FasterCocoEvaluator
+
+@register()
+class CocoEvaluator(FasterCocoEvaluator):  # registered alias: adds nothing to FasterCocoEvaluator
+    pass
diff --git a/rtdetrv2_pytorch/src/data/dataset/coco_utils.py b/rtdetrv2_pytorch/src/data/dataset/coco_utils.py
new file mode 100644
index 0000000..be2b915
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/coco_utils.py
@@ -0,0 +1,194 @@
+"""
+copy and modified https://github.com/pytorch/vision/blob/main/references/detection/coco_utils.py
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch
+import torch.utils.data
+import torchvision
+import torchvision.transforms.functional as TVF
+from faster_coco_eval import COCO
+import faster_coco_eval.core.mask as mask_util
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    """Rasterise every object's polygons and return the stacked per-object masks."""
+    collected = []
+    for polygons in segmentations:
+        encoded = mask_util.frPyObjects(polygons, height, width)
+        raster = mask_util.decode(encoded)
+        if len(raster.shape) < 3:
+            raster = raster[..., None]  # make sure there is a trailing axis to reduce over
+        raster = torch.as_tensor(raster, dtype=torch.uint8)
+        collected.append(raster.any(dim=2))  # merge all pieces of one object
+
+    if collected:
+        return torch.stack(collected, dim=0)
+    # Nothing to decode: empty result that still carries the spatial size.
+    return torch.zeros((0, height, width), dtype=torch.uint8)
+
+
+class ConvertCocoPolysToMask:
+    """Turn one image's raw COCO annotation list into a dict of tensors (boxes, labels, masks, ...)."""
+    def __call__(self, image, target):
+        w, h = image.size
+        image_id = target["image_id"]
+
+        # Crowd regions are dropped; a missing 'iscrowd' key counts as non-crowd (same rule as coco_dataset.py).
+        anno = [obj for obj in target["annotations"] if obj.get("iscrowd", 0) == 0]
+
+        boxes = [obj["bbox"] for obj in anno]
+        # guard against no boxes via resizing
+        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+
+        classes = [obj["category_id"] for obj in anno]
+        classes = torch.tensor(classes, dtype=torch.int64)
+
+        segmentations = [obj["segmentation"] for obj in anno]
+        masks = convert_coco_poly_to_mask(segmentations, h, w)
+
+        keypoints = None
+        if anno and "keypoints" in anno[0]:
+            keypoints = [obj["keypoints"] for obj in anno]
+            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
+            num_keypoints = keypoints.shape[0]
+            if num_keypoints:
+                keypoints = keypoints.view(num_keypoints, -1, 3)
+
+        # Drop boxes left without positive width/height after clamping.
+        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+        boxes = boxes[keep]
+        classes = classes[keep]
+        masks = masks[keep]
+        if keypoints is not None:
+            keypoints = keypoints[keep]
+
+        target = {}
+        target["boxes"] = boxes
+        target["labels"] = classes
+        target["masks"] = masks
+        target["image_id"] = image_id
+        if keypoints is not None:
+            target["keypoints"] = keypoints
+
+        # for conversion to coco api; filtered by `keep` so they stay index-aligned with boxes/labels
+        area = torch.tensor([obj["area"] for obj in anno])
+        iscrowd = torch.tensor([obj.get("iscrowd", 0) for obj in anno])
+        target["area"] = area[keep]
+        target["iscrowd"] = iscrowd[keep]
+
+        return image, target
+
+
+def _coco_remove_images_without_annotations(dataset, cat_list=None):
+    """Return a Subset of `dataset` holding only images with usable annotations.
+
+    When `cat_list` is truthy, only annotations whose category_id is in it count.
+    Keypoint annotations additionally need enough visible keypoints (see below).
+    """
+    min_keypoints_per_image = 10
+
+    def _has_only_empty_bbox(anno):
+        # A box counts as empty when its width or height (bbox[2:]) is <= 1.
+        return all(any(side <= 1 for side in obj["bbox"][2:]) for obj in anno)
+
+    def _count_visible_keypoints(anno):
+        # Every third keypoint value, starting at index 2, is counted when > 0.
+        return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
+
+    def _has_valid_annotation(anno):
+        # No annotations, or only near-empty boxes: the image is unusable.
+        if len(anno) == 0 or _has_only_empty_bbox(anno):
+            return False
+        # Non-keypoint annotations need nothing more.
+        if "keypoints" not in anno[0]:
+            return True
+        # Keypoint annotations must reach the visible-keypoint minimum.
+        return _count_visible_keypoints(anno) >= min_keypoints_per_image
+
+    # Subset needs positions within dataset.ids, not the image ids themselves.
+    kept = []
+    for ds_idx, img_id in enumerate(dataset.ids):
+        ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
+        anno = dataset.coco.loadAnns(ann_ids)
+        if cat_list:
+            anno = [obj for obj in anno if obj["category_id"] in cat_list]
+        if _has_valid_annotation(anno):
+            kept.append(ds_idx)
+
+    return torch.utils.data.Subset(dataset, kept)
+
+
+def convert_to_coco_api(ds):  # builds an in-memory COCO object from the outputs of ds.load_item()
+    coco_ds = COCO()
+    # annotation IDs need to start at 1, not 0, see torchvision issue #1530
+    ann_id = 1
+    dataset = {"images": [], "categories": [], "annotations": []}
+    categories = set()
+    for img_idx in range(len(ds)):
+        # find better way to get target
+        # targets = ds.get_annotations(img_idx)
+        # img, targets = ds[img_idx]
+
+        # TODO (by lyuwenyu), load image and targets before `transforms`
+        img, targets = ds.load_item(img_idx)
+        width, height = img.size
+        
+        image_id = targets["image_id"].item()  # .item() requires a single-element tensor
+        img_dict = {}
+        img_dict["id"] = image_id
+        img_dict["width"] = width
+        img_dict["height"] = height
+        dataset["images"].append(img_dict)
+        bboxes = targets["boxes"].clone()  # clone so the in-place conversion below leaves targets untouched
+        bboxes[:, 2:] -= bboxes[:, :2] # xyxy -> xywh
+        bboxes = bboxes.tolist()
+        labels = targets["labels"].tolist()
+        areas = targets["area"].tolist()
+        iscrowd = targets["iscrowd"].tolist()
+        if "masks" in targets:
+            masks = targets["masks"]
+            # make masks Fortran contiguous for coco_mask
+            masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1)
+        if "keypoints" in targets:
+            keypoints = targets["keypoints"]
+            keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist()  # one flat list per object
+        num_objs = len(bboxes)
+        for i in range(num_objs):  # labels/areas/iscrowd are indexed in step with bboxes, so they must have the same length
+            ann = {}
+            ann["image_id"] = image_id
+            ann["bbox"] = bboxes[i]
+            ann["category_id"] = labels[i]
+            categories.add(labels[i])
+            ann["area"] = areas[i]
+            ann["iscrowd"] = iscrowd[i]
+            ann["id"] = ann_id
+            if "masks" in targets:
+                ann["segmentation"] = mask_util.encode(masks[i].numpy())
+            if "keypoints" in targets:
+                ann["keypoints"] = keypoints[i]
+                ann["num_keypoints"] = sum(k != 0 for k in keypoints[i][2::3])  # counts non-zero values at every third position
+            dataset["annotations"].append(ann)
+            ann_id += 1
+    dataset["categories"] = [{"id": i} for i in sorted(categories)]  # only labels that actually occurred, ids only
+    coco_ds.dataset = dataset
+    coco_ds.createIndex()
+    return coco_ds
+
+
+def get_coco_api_from_dataset(dataset):
+    """Return a COCO API object for `dataset`, unwrapping at most 10 nested Subsets."""
+    coco_cls = torchvision.datasets.CocoDetection
+    for _ in range(10):
+        if isinstance(dataset, coco_cls) or not isinstance(dataset, torch.utils.data.Subset):
+            break
+        dataset = dataset.dataset
+    if isinstance(dataset, coco_cls):
+        return dataset.coco
+    return convert_to_coco_api(dataset)
+
+
diff --git a/rtdetrv2_pytorch/src/data/dataset/voc_detection.py b/rtdetrv2_pytorch/src/data/dataset/voc_detection.py
new file mode 100644
index 0000000..a926a1f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/voc_detection.py
@@ -0,0 +1,75 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from sympy import im
+import torch
+import torchvision
+import torchvision.transforms.functional as TVF 
+
+import os
+from PIL import Image
+from typing import Optional, Callable
+
+try:
+    from defusedxml.ElementTree import parse as ET_parse
+except ImportError:
+    from xml.etree.ElementTree import parse as ET_parse
+
+from ._dataset import DetDataset
+from .._misc import convert_to_tv_tensor
+from ...core import register
+
+@register()
+class VOCDetection(torchvision.datasets.VOCDetection, DetDataset):
+    """VOC-style dataset driven by a list file (image path + xml path per line) and a label file."""
+    __inject__ = ['transforms', ]
+
+    def __init__(self, root: str, ann_file: str = "trainval.txt", label_file: str = "label_list.txt", transforms: Optional[Callable] = None):
+        with open(os.path.join(root, ann_file), 'r') as f:
+            lines = [x.strip() for x in f.readlines()]
+            lines = [x.split(' ') for x in lines]
+
+        self.images = [os.path.join(root, lin[0]) for lin in lines]
+        self.targets = [os.path.join(root, lin[1]) for lin in lines]
+        assert len(self.images) == len(self.targets)
+
+        # Join root and label_file as two path components so the separator is not lost.
+        with open(os.path.join(root, label_file), 'r') as f:
+            labels = [lab.strip() for lab in f.readlines()]
+
+        self.transforms = transforms
+        self.labels_map = {lab: i for i, lab in enumerate(labels)}
+
+    def __getitem__(self, index: int):
+        image, target = self.load_item(index)
+        if self.transforms is not None:
+            image, target, _ = self.transforms(image, target, self)
+        return image, target
+
+    def load_item(self, index: int):
+        image = Image.open(self.images[index]).convert("RGB")
+        target = self.parse_voc_xml(ET_parse(self.annotations[index]).getroot())
+
+        output = {}
+        output["image_id"] = torch.tensor([index])
+        for k in ['area', 'boxes', 'labels', 'iscrowd']:
+            output[k] = []
+
+        for blob in target['annotation']['object']:
+            # Read corners by name so the xyxy order does not depend on XML child order.
+            box = [float(blob['bndbox'][k]) for k in ('xmin', 'ymin', 'xmax', 'ymax')]
+            output["boxes"].append(box)
+            output["labels"].append(blob['name'])
+            output["area"].append((box[2] - box[0]) * (box[3] - box[1]))
+            output["iscrowd"].append(0)
+
+        w, h = image.size
+        boxes = torch.tensor(output["boxes"]) if len(output["boxes"]) > 0 else torch.zeros(0, 4)
+        output['boxes'] = convert_to_tv_tensor(boxes, 'boxes', box_format='xyxy', spatial_size=[h, w])
+        output['labels'] = torch.tensor([self.labels_map[lab] for lab in output["labels"]], dtype=torch.int64)  # explicit dtype: an empty list would otherwise give float32
+        output['area'] = torch.tensor(output['area'])
+        output["iscrowd"] = torch.tensor(output["iscrowd"], dtype=torch.int64)
+        output["orig_size"] = torch.tensor([w, h])
+
+        return image, output
+    
diff --git a/rtdetrv2_pytorch/src/data/dataset/voc_eval.py b/rtdetrv2_pytorch/src/data/dataset/voc_eval.py
new file mode 100644
index 0000000..efe200f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/dataset/voc_eval.py
@@ -0,0 +1,10 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torchvision
+
+
+class VOCEvaluator(object):  # placeholder: no evaluation logic is implemented here
+    def __init__(self) -> None:
+        pass
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/data/transforms/__init__.py b/rtdetrv2_pytorch/src/data/transforms/__init__.py
new file mode 100644
index 0000000..9adb329
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/transforms/__init__.py
@@ -0,0 +1,20 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+from ._transforms import (
+    EmptyTransform,
+    RandomPhotometricDistort,
+    RandomZoomOut,
+    RandomIoUCrop,
+    RandomHorizontalFlip,
+    Resize,
+    PadToSize,
+    SanitizeBoundingBoxes,
+    RandomCrop,
+    Normalize,
+    ConvertBoxes,
+    ConvertPILImage,
+)
+from .container import Compose
+from .mosaic import Mosaic
diff --git a/rtdetrv2_pytorch/src/data/transforms/_transforms.py b/rtdetrv2_pytorch/src/data/transforms/_transforms.py
new file mode 100644
index 0000000..5758c91
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/transforms/_transforms.py
@@ -0,0 +1,148 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+
+import torchvision.transforms.v2 as T
+import torchvision.transforms.v2.functional as F
+
+import PIL
+import PIL.Image
+
+from typing import Any, Dict, List, Optional
+
+from .._misc import convert_to_tv_tensor, _boxes_keys
+from .._misc import Image, Video, Mask, BoundingBoxes
+from .._misc import SanitizeBoundingBoxes
+
+from ...core import register
+
+
+RandomPhotometricDistort = register()(T.RandomPhotometricDistort)  # stock torchvision v2 transforms are passed through register() unchanged
+RandomZoomOut = register()(T.RandomZoomOut)
+RandomHorizontalFlip = register()(T.RandomHorizontalFlip)
+Resize = register()(T.Resize)
+# ToImageTensor = register()(T.ToImageTensor)
+# ConvertDtype = register()(T.ConvertDtype)
+# PILToTensor = register()(T.PILToTensor)
+SanitizeBoundingBoxes = register(name='SanitizeBoundingBoxes')(SanitizeBoundingBoxes)  # the class imported from .._misc, registered under an explicit name
+RandomCrop = register()(T.RandomCrop)
+Normalize = register()(T.Normalize)
+
+
+@register()
+class EmptyTransform(T.Transform):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, *inputs):
+        # Identity: a lone argument is returned bare, several come back as a tuple.
+        return inputs[0] if len(inputs) <= 1 else inputs
+
+
+@register()
+class PadToSize(T.Pad):  # pads inputs up to a fixed target size using padding [0, 0, w, h]
+    _transformed_types = (
+        PIL.Image.Image,
+        Image,
+        Video,
+        Mask,
+        BoundingBoxes,
+    )
+    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
+        sp = F.get_spatial_size(flat_inputs[0])  # presumably (height, width) of the first input -- TODO confirm
+        h, w = self.size[1] - sp[0], self.size[0] - sp[1]  # self.size is therefore read as (width, height); nothing guards against negative amounts
+        self.padding = [0, 0, w, h]  # presumably [left, top, right, bottom] -- verify against F.pad; cached on self for __call__
+        return dict(padding=self.padding)
+
+    def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:  # same computation under a second hook name -- presumably for a different torchvision version, confirm
+        return self._get_params(flat_inputs)
+
+    def __init__(self, size, fill=0, padding_mode='constant') -> None:
+        if isinstance(size, int):
+            size = (size, size)  # an int means a square target
+        self.size = size
+        super().__init__(0, fill, padding_mode)  # base padding of 0 is a placeholder; the real amount is computed per call
+
+    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:        
+        fill = self._fill[type(inpt)]  # fill value looked up by input type
+        padding = params['padding']
+        return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode)  # type: ignore[arg-type]
+
+    def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:  # second hook name delegating to _transform, see make_params
+        return self._transform(inpt, params)
+
+    def __call__(self, *inputs: Any) -> Any:
+        outputs = super().forward(*inputs)  # calls forward directly rather than going through the base __call__
+        if len(outputs) > 1 and isinstance(outputs[1], dict):
+            outputs[1]['padding'] = torch.tensor(self.padding)  # record the applied padding in the target dict
+        return outputs
+
+
+@register()
+class RandomIoUCrop(T.RandomIoUCrop):
+    def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, p: float = 1.0):
+        super().__init__(min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials)
+        self.p = p
+
+    def __call__(self, *inputs: Any) -> Any:
+        skip = torch.rand(1) >= self.p  # the crop runs only when the draw falls below p
+        if skip:
+            return inputs if len(inputs) > 1 else inputs[0]
+        return super().forward(*inputs)
+
+
+@register()
+class ConvertBoxes(T.Transform):  # optional box-format conversion and optional normalisation of BoundingBoxes inputs
+    _transformed_types = (
+        BoundingBoxes,
+    )
+    def __init__(self, fmt='', normalize=False) -> None:
+        super().__init__()
+        self.fmt = fmt  # target box format; an empty string leaves the format unchanged
+        self.normalize = normalize
+
+    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:  
+        spatial_size = getattr(inpt, _boxes_keys[1])  # presumably the attribute holding the (h, w) canvas size -- verify against _misc._boxes_keys
+        if self.fmt:
+            in_fmt = inpt.format.value.lower()
+            inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.fmt.lower())
+            inpt = convert_to_tv_tensor(inpt, key='boxes', box_format=self.fmt.upper(), spatial_size=spatial_size)  # re-wrap with the new format
+            
+        if self.normalize:
+            inpt = inpt / torch.tensor(spatial_size[::-1]).tile(2)[None]  # divides each box by the reversed spatial size repeated twice; NOTE(review): the result may no longer be a BoundingBoxes instance -- confirm
+
+        return inpt
+
+    def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:  # second hook name delegating to _transform
+        return self._transform(inpt, params)
+
+
+@register()
+class ConvertPILImage(T.Transform):
+    """Convert PIL images to `Image` tensors, optionally cast to float and divided by 255."""
+    _transformed_types = (PIL.Image.Image,)
+
+    def __init__(self, dtype='float32', scale=True) -> None:
+        super().__init__()
+        self.dtype = dtype
+        self.scale = scale
+
+    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
+        tensor = F.pil_to_tensor(inpt)
+        # Only 'float32' triggers a cast; any other dtype value leaves the tensor as produced.
+        if self.dtype == 'float32':
+            tensor = tensor.float()
+
+        # The division happens whenever `scale` is set, independent of `dtype`.
+        if self.scale:
+            tensor = tensor / 255.
+
+        return Image(tensor)
+
+    def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
+        return self._transform(inpt, params)
diff --git a/rtdetrv2_pytorch/src/data/transforms/container.py b/rtdetrv2_pytorch/src/data/transforms/container.py
new file mode 100644
index 0000000..bf567bb
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/transforms/container.py
@@ -0,0 +1,95 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+import torchvision.transforms.v2 as T
+
+from typing import Any, Dict, List, Optional
+
+from ._transforms import EmptyTransform
+from ...core import register, GLOBAL_CONFIG
+
+
+@register()
+class Compose(T.Compose):
+    """Build a transform pipeline from config dicts / modules, with an optional stop policy.
+    `ops`: list whose items are either an nn.Module or a dict with a 'type' key naming a
+    registered transform (remaining keys are constructor kwargs); None -> EmptyTransform.
+    `policy`: {'name': 'default'}, or 'stop_epoch' / 'stop_sample' with 'ops' and 'epoch' / 'sample'.
+    """
+    def __init__(self, ops, policy=None) -> None:
+        transforms = []
+        if ops is not None:
+            for op in ops:
+                if isinstance(op, dict):
+                    # Work on a copy: the caller's config keeps its 'type' key even if construction raises.
+                    kwargs = dict(op)
+                    name = kwargs.pop('type')
+                    entry = GLOBAL_CONFIG[name]
+                    transforms.append(getattr(entry['_pymodule'], entry['_name'])(**kwargs))
+                elif isinstance(op, nn.Module):
+                    transforms.append(op)
+                else:
+                    raise ValueError(f'unsupported op {op!r}: expected a config dict or nn.Module')
+        else:
+            transforms = [EmptyTransform(), ]
+
+        super().__init__(transforms=transforms)
+
+        if policy is None:
+            policy = {'name': 'default'}
+
+        self.policy = policy
+        self.global_samples = 0  # calls seen so far; read by the 'stop_sample' policy
+
+    def forward(self, *inputs: Any) -> Any:
+        return self.get_forward(self.policy['name'])(*inputs)
+
+    def get_forward(self, name):
+        """Return the forward implementation for policy `name` (KeyError if unknown)."""
+        forwards = {
+            'default': self.default_forward,
+            'stop_epoch': self.stop_epoch_forward,
+            'stop_sample': self.stop_sample_forward,
+        }
+        return forwards[name]
+
+    def default_forward(self, *inputs: Any) -> Any:
+        sample = inputs if len(inputs) > 1 else inputs[0]
+        for transform in self.transforms:
+            sample = transform(sample)
+        return sample
+
+    def stop_epoch_forward(self, *inputs: Any):
+        """Skip transforms named in policy['ops'] once dataset.epoch >= policy['epoch']."""
+        sample = inputs if len(inputs) > 1 else inputs[0]
+        dataset = sample[-1]  # the dataset is expected as the last input
+
+        cur_epoch = dataset.epoch
+        policy_ops = self.policy['ops']
+        policy_epoch = self.policy['epoch']
+
+        for transform in self.transforms:
+            if type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch:
+                continue
+            sample = transform(sample)
+
+        return sample
+
+    def stop_sample_forward(self, *inputs: Any):
+        """Skip transforms named in policy['ops'] once this method has run policy['sample'] times."""
+        sample = inputs if len(inputs) > 1 else inputs[0]
+        policy_ops = self.policy['ops']
+        policy_sample = self.policy['sample']
+
+        for transform in self.transforms:
+            if type(transform).__name__ in policy_ops and self.global_samples >= policy_sample:
+                continue
+            sample = transform(sample)
+
+        self.global_samples += 1
+        return sample
diff --git a/rtdetrv2_pytorch/src/data/transforms/functional.py b/rtdetrv2_pytorch/src/data/transforms/functional.py
new file mode 100644
index 0000000..336baa2
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/transforms/functional.py
@@ -0,0 +1,169 @@
+import torch
+import torchvision.transforms.functional as F
+
+from packaging import version
+from typing import Optional, List
+from torch import Tensor
+
+# needed due to empty tensor bug in pytorch and torchvision 0.5
+import torchvision
+if version.parse(torchvision.__version__) < version.parse('0.7'):
+    from torchvision.ops import _new_empty_tensor
+    from torchvision.ops.misc import _output_size
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor
+    """
+    Equivalent to nn.functional.interpolate, but with support for empty batch sizes.
+    Only torchvision < 0.7 takes the empty-input workaround below; every newer
+    release is routed straight to torch.nn.functional.interpolate.
+    """
+    if version.parse(torchvision.__version__) < version.parse('0.7'):
+        if input.numel() > 0:
+            return torch.nn.functional.interpolate(
+                input, size, scale_factor, mode, align_corners
+            )
+
+        output_shape = _output_size(2, input, size, scale_factor)  # empty input: compute the shape by hand
+        output_shape = list(input.shape[:-2]) + list(output_shape)
+        return _new_empty_tensor(input, output_shape)
+    else:
+        return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners)  # not torchvision.ops.misc.interpolate: that alias is absent from newer torchvision
+
+
+
+def crop(image, target, region):  # crop `image` to `region` and update `target` to match
+    cropped_image = F.crop(image, *region)
+
+    target = target.copy()  # shallow copy, so reassigned keys do not touch the caller's dict
+    i, j, h, w = region  # i/j index rows/columns (see the mask slice below), h/w are the crop extent
+
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor([h, w])
+
+    fields = ["labels", "area", "iscrowd"]  # per-instance entries that get filtered by `keep` below
+
+    if "boxes" in target:
+        boxes = target["boxes"]
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])  # shift into the crop's coordinate frame
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)  # cap both corners at (w, h)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)  # width * height after clipping
+        target["boxes"] = cropped_boxes.reshape(-1, 4)
+        target["area"] = area
+        fields.append("boxes")
+
+    if "masks" in target:
+        # FIXME should we update the area here if there are no boxes?
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
+        fields.append("masks")
+
+    # remove elements for which the boxes or masks that have zero area
+    if "boxes" in target or "masks" in target:
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        if "boxes" in target:
+            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
+            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)  # both extents strictly positive
+        else:
+            keep = target['masks'].flatten(1).any(1)  # mask still has at least one set pixel
+
+        for field in fields:
+            target[field] = target[field][keep]  # every name in `fields` must exist in target at this point
+
+    return cropped_image, target
+
+
+def hflip(image, target):  # mirror `image` left-right and update boxes/masks in `target`
+    flipped_image = F.hflip(image)
+
+    w, h = image.size  # `.size` is unpacked as (width, height); h is not used below
+
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])  # x1' = w - x2, x2' = w - x1
+        target["boxes"] = boxes
+
+    if "masks" in target:
+        target['masks'] = target['masks'].flip(-1)  # reverse the last axis
+
+    return flipped_image, target
+
+
+def resize(image, target, size, max_size=None):
+    # size can be min_size (scalar) or (w, h) tuple
+
+    def get_size_with_aspect_ratio(image_size, size, max_size=None):  # returns (oh, ow) with the shorter side set to `size`
+        w, h = image_size
+        if max_size is not None:
+            min_original_size = float(min((w, h)))
+            max_original_size = float(max((w, h)))
+            if max_original_size / min_original_size * size > max_size:  # longer side would overshoot max_size
+                size = int(round(max_size * min_original_size / max_original_size))  # shrink the target so it fits
+
+        if (w <= h and w == size) or (h <= w and h == size):  # shorter side already equals the target
+            return (h, w)
+
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+        else:
+            oh = size
+            ow = int(size * w / h)
+            
+        # r = min(size / min(h, w), max_size / max(h, w))
+        # ow = int(w * r)
+        # oh = int(h * r)
+
+        return (oh, ow)
+
+    def get_size(image_size, size, max_size=None):
+        if isinstance(size, (list, tuple)):
+            return size[::-1]  # an explicit (w, h) is reversed to (h, w); max_size is not consulted
+        else:
+            return get_size_with_aspect_ratio(image_size, size, max_size)
+
+    size = get_size(image.size, size, max_size)
+    rescaled_image = F.resize(image, size)
+
+    if target is None:
+        return rescaled_image, None
+
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))  # per-axis scale from the actual output size
+    ratio_width, ratio_height = ratios
+
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+        target["boxes"] = scaled_boxes
+
+    if "area" in target:
+        area = target["area"]
+        scaled_area = area * (ratio_width * ratio_height)
+        target["area"] = scaled_area
+
+    h, w = size
+    target["size"] = torch.tensor([h, w])
+
+    if "masks" in target:
+        target['masks'] = interpolate(
+            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5  # resize as float, then threshold back to boolean
+
+    return rescaled_image, target
+
+
+def pad(image, target, padding):  # padding[0] extends the width, padding[1] the height
+    # assumes that we only pad on the bottom right corners
+    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))  # first two entries (zero) are the left/top amounts
+    if target is None:
+        return padded_image, None
+    target = target.copy()
+    # should we do something wrt the original size?
+    target["size"] = torch.tensor(padded_image.size[::-1])  # reverse `.size` so the tensor reads (h, w)
+    if "masks" in target:
+        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))  # pads the end of the last two axes
+    return padded_image, target
diff --git a/rtdetrv2_pytorch/src/data/transforms/mosaic.py b/rtdetrv2_pytorch/src/data/transforms/mosaic.py
new file mode 100644
index 0000000..fc08d1b
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/transforms/mosaic.py
@@ -0,0 +1,72 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torchvision
+torchvision.disable_beta_transforms_warning()
+import torchvision.transforms.v2 as T
+import torchvision.transforms.v2.functional as F
+
+import random
+from PIL import Image 
+
+from .._misc import convert_to_tv_tensor
+from ...core import register
+
+
+@register()
+class Mosaic(T.Transform):  # pastes four resized samples on a 2x2 canvas, then applies affine + crop
+    def __init__(self, size, max_size=None, ) -> None:
+        super().__init__()
+        self.resize = T.Resize(size=size, max_size=max_size)  # applied to every tile before pasting
+        self.crop = T.RandomCrop(size=max_size if max_size else size)  # applied to the finished canvas
+        
+        # TODO add arg `output_size` for affine`
+        # self.random_perspective = T.RandomPerspective(distortion_scale=0.5, p=1., )
+        self.random_affine = T.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.5, 1.5), fill=114)
+
+    def forward(self, *inputs):
+        inputs = inputs if len(inputs) > 1 else inputs[0]
+        image, target, dataset = inputs
+
+        # The incoming sample is the first tile; three more are drawn at random so
+        # that all four quadrants of the canvas are filled.
+        samples = [(image, target)]
+        samples += [dataset.load_item(i) for i in random.choices(range(len(dataset)), k=3)]
+        images, targets = [], []
+        for image, target in [self.resize(im, tg) for im, tg in samples]:
+            images.append(image)
+            targets.append(target)
+
+        h, w = F.get_spatial_size(images[0])  # NOTE(review): every tile is placed using the first tile's size — confirm tiles share it
+        offset = [[0, 0], [w, 0], [0, h], [w, h]]  # paste position of each quadrant, as (x, y)
+        image = Image.new(mode=images[0].mode, size=(w * 2, h * 2), color=0)
+        for i, im in enumerate(images):
+            image.paste(im, offset[i])
+
+        offset = torch.tensor([[0, 0], [w, 0], [0, h], [w, h]]).repeat(1, 2)  # one (x, y, x, y) shift per tile
+        target = {}
+        for k in targets[0]:  # keys are taken from the first tile's target
+            if k == 'boxes':
+                v = [t[k] + offset[i] for i, t in enumerate(targets)]  # move boxes into their quadrant
+            else: 
+                v = [t[k] for t in targets]
+            
+            if isinstance(v[0], torch.Tensor):
+                v = torch.cat(v, dim=0)  # tensor entries are concatenated; anything else stays a list
+
+            target[k] = v
+
+        if 'boxes' in target:
+            # target['boxes'] = target['boxes'].clamp(0, 640 * 2 - 1)
+            w, h = image.size
+            target['boxes'] = convert_to_tv_tensor(target['boxes'], 'boxes', box_format='xyxy', spatial_size=[h, w])
+        
+        if 'masks' in target:
+            target['masks'] = convert_to_tv_tensor(target['masks'], 'masks')
+
+        image, target = self.random_affine(image, target)
+        # image, target = self.resize(image, target)
+        image, target = self.crop(image, target)
+
+        return image, target, dataset
diff --git a/rtdetrv2_pytorch/src/data/transforms/presets.py b/rtdetrv2_pytorch/src/data/transforms/presets.py
new file mode 100644
index 0000000..137af31
--- /dev/null
+++ b/rtdetrv2_pytorch/src/data/transforms/presets.py
@@ -0,0 +1,2 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
diff --git a/rtdetrv2_pytorch/src/misc/__init__.py b/rtdetrv2_pytorch/src/misc/__init__.py
new file mode 100644
index 0000000..cbe60a6
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/__init__.py
@@ -0,0 +1,7 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from .logger import *
+from .visualizer import *
+from .dist_utils import setup_seed, setup_print
+from .profiler_utils import stats
diff --git a/rtdetrv2_pytorch/src/misc/box_ops.py b/rtdetrv2_pytorch/src/misc/box_ops.py
new file mode 100644
index 0000000..6c4c946
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/box_ops.py
@@ -0,0 +1,103 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torchvision
+from torch import Tensor 
+from typing import List, Tuple
+
+
+def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:  # validated wrapper around torchvision's GIoU
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()  # columns 2: must not be smaller than columns :2
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    return torchvision.ops.generalized_box_iou(boxes1, boxes2)
+
+
+# elementwise
+def elementwise_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
+    """
+    Args:
+        boxes1, [N, 4]
+        boxes2, [N, 4]
+    Returns:
+        iou, [N, ]
+        union, [N, ]
+    """
+    area1 = torchvision.ops.box_area(boxes1) # [N, ]
+    area2 = torchvision.ops.box_area(boxes2) # [N, ]
+    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N, 2], top-left of the overlap
+    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N, 2], bottom-right of the overlap
+    wh = (rb - lt).clamp(min=0)  # [N, 2], zero when the pair does not overlap
+    inter = wh[:, 0] * wh[:, 1]  # [N, ]
+    union = area1 + area2 - inter
+    iou = inter / union
+    return iou, union
+
+
+def elementwise_generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
+    """
+    Args:
+        boxes1, [N, 4] with [x1, y1, x2, y2]
+        boxes2, [N, 4] with [x1, y1, x2, y2]
+    Returns:
+        giou, [N, ]
+    """
+    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+    iou, union = elementwise_box_iou(boxes1, boxes2)
+    lt = torch.min(boxes1[:, :2], boxes2[:, :2]) # [N, 2], top-left of the enclosing box
+    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) # [N, 2], bottom-right of the enclosing box
+    wh = (rb - lt).clamp(min=0)  # [N, 2]
+    area = wh[:, 0] * wh[:, 1]  # area of the enclosing box
+    return iou - (area - union) / area  # no guard for area == 0 here
+
+
+def check_point_inside_box(points: Tensor, boxes: Tensor, eps=1e-9) -> Tensor:
+    """
+    Args:
+        points, [K, 2], (x, y)
+        boxes, [N, 4], (x1, y1, x2, y2)
+    Returns:
+        Tensor (bool), [K, N]
+    """
+    px, py = points.unbind(-1)
+    px, py = px.unsqueeze(-1), py.unsqueeze(-1)
+    left, top, right, bottom = [edge.unsqueeze(0) for edge in boxes.unbind(-1)]
+
+    # Signed distance from every point to each of the four box sides.
+    margins = torch.stack(
+        [px - left, py - top, right - px, bottom - py],
+        dim=-1,
+    )
+
+    # A point is inside only if its smallest margin is strictly greater than eps.
+    return margins.min(dim=-1).values > eps
+
+
+def point_box_distance(points: Tensor, boxes: Tensor) -> Tensor:
+    """
+    Args:
+        points, [N, 2], (x, y)
+        boxes, [N, 4], (x1, y1, x2, y2)
+    Returns:
+        Tensor (N, 4), (l, t, r, b)
+    """
+    # Split each box into its two corners, then measure how far the point
+    # sits from the first corner and how far the second corner sits from it.
+    top_left, bottom_right = boxes.split(2, dim=-1)
+    return torch.concat([points - top_left, bottom_right - points], dim=-1)
+
+
+def point_distance_box(points: Tensor, distances: Tensor) -> Tensor:
+    """
+    Args:
+        points (Tensor), [N, 2], (x, y)
+        distances (Tensor), [N, 4], (l, t, r, b)
+    Returns:
+        boxes (Tensor),  (N, 4), (x1, y1, x2, y2)
+    """
+    # The first two distances lead back to (x1, y1), the last two on to (x2, y2).
+    to_top_left, to_bottom_right = distances.split(2, dim=-1)
+    corners = [points - to_top_left, points + to_bottom_right]
+    # Corners are concatenated as (x1, y1) followed by (x2, y2).
+    return torch.concat(corners, dim=-1)
diff --git a/rtdetrv2_pytorch/src/misc/dist_utils.py b/rtdetrv2_pytorch/src/misc/dist_utils.py
new file mode 100644
index 0000000..79f7944
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/dist_utils.py
@@ -0,0 +1,267 @@
+"""
+reference
+- https://github.com/pytorch/vision/blob/main/references/detection/utils.py
+- https://github.com/facebookresearch/detr/blob/master/util/misc.py#L406
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import os
+import random
+import numpy as np 
+import atexit
+
+import torch
+import torch.nn as nn 
+import torch.distributed
+import torch.backends.cudnn
+
+from torch.nn.parallel import DataParallel as DP
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+from torch.utils.data import DistributedSampler
+# from torch.utils.data.dataloader import DataLoader
+from ..data import DataLoader 
+
+
+def setup_distributed(print_rank: int=0, print_method: str='builtin', seed: int=None, ):
+    """
+    env setup; returns True when a process group was initialized, else False
+    args:
+        print_rank, rank that keeps printing
+        print_method, (builtin, rich)
+        seed, forwarded to setup_seed when not None
+    """
+    try:
+        # https://pytorch.org/docs/stable/elastic/run.html
+        # LOCAL_RANK is the index of this process on its own node, which is the
+        # right CUDA device index; the global rank is only a fallback.
+        LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))
+        
+        # torch.distributed.init_process_group(backend=backend, init_method='env://')
+        torch.distributed.init_process_group(init_method='env://')
+        torch.distributed.barrier()
+
+        rank = torch.distributed.get_rank()
+        torch.cuda.set_device(LOCAL_RANK if LOCAL_RANK >= 0 else rank)
+        torch.cuda.empty_cache()
+        enabled_dist = True
+        print('Initialized distributed mode...')
+
+    except Exception:  # not bare: KeyboardInterrupt / SystemExit must still propagate
+        enabled_dist = False
+        print('Not init distributed mode.')
+
+    setup_print(get_rank() == print_rank, method=print_method)
+    if seed is not None:
+        setup_seed(seed)
+
+    return enabled_dist
+
+
+def setup_print(is_main, method='builtin'):
+    """This function disables printing when not in master process
+    """
+    import builtins as __builtin__
+
+    if method == 'builtin':
+        builtin_print = __builtin__.print  # whatever `print` currently is, including an earlier wrapper
+
+    elif method == 'rich':
+        import rich  # imported only when this method is requested
+        builtin_print = rich.print
+
+    else:
+        raise AttributeError('')  # any other method name is rejected
+
+    def print(*args, **kwargs):
+        force = kwargs.pop('force', False)  # print(..., force=True) bypasses the is_main gate
+        if is_main or force:
+            builtin_print(*args, **kwargs)
+
+    __builtin__.print = print  # replaces print for the whole interpreter
+
+
+def is_dist_available_and_initialized():
+    # True only when torch.distributed is available and a process group has
+    # been initialized; the second check is skipped when the first one fails.
+    dist = torch.distributed
+    ready = dist.is_available() and dist.is_initialized()
+    return ready
+
+
+@atexit.register
+def cleanup():
+    """cleanup distributed environment
+    """
+    if is_dist_available_and_initialized():
+        torch.distributed.barrier()  # NOTE(review): presumably blocks at exit if another rank never reaches it — confirm
+        torch.distributed.destroy_process_group()
+
+
+def get_rank():  # rank of this process, or 0 when no process group is initialized
+    if not is_dist_available_and_initialized():
+        return 0
+    return torch.distributed.get_rank()
+
+
+def get_world_size():  # number of processes, or 1 when no process group is initialized
+    if not is_dist_available_and_initialized():
+        return 1
+    return torch.distributed.get_world_size()
+
+    
+def is_main_process():  # the main process is the one whose rank is 0
+    return get_rank() == 0
+
+
+def save_on_master(*args, **kwargs):  # torch.save with the given arguments, on rank 0 only
+    if is_main_process():
+        torch.save(*args, **kwargs)
+
+
+
+def warp_model(
+    model: torch.nn.Module,  # module to wrap
+    sync_bn: bool=False,  # convert BatchNorm layers to SyncBatchNorm first
+    dist_mode: str='ddp',  # 'dp' or 'ddp'
+    find_unused_parameters: bool=False,  # forwarded to DDP
+    compile: bool=False,  # wrap the result with torch.compile
+    compile_mode: str='reduce-overhead',  # forwarded to torch.compile
+    **kwargs
+):
+    if is_dist_available_and_initialized():
+        rank = int(os.getenv('LOCAL_RANK', get_rank()))  # node-local GPU index; global rank only as fallback
+        model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model 
+        if dist_mode == 'dp':
+            model = DP(model, device_ids=[rank], output_device=rank)
+        elif dist_mode == 'ddp':
+            model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters)
+        else:
+            raise AttributeError(f"unsupported dist_mode {dist_mode!r}; expected 'dp' or 'ddp'")
+
+    if compile:
+        model = torch.compile(model, mode=compile_mode)  # applied whether or not the model was wrapped above
+
+    return model
+
+def de_model(model):  # strip the compile wrapper first, then the DP/DDP wrapper
+    return de_parallel(de_complie(model))
+
+
+def warp_loader(loader, shuffle=False):  # rebuild `loader` around a DistributedSampler when distributed
+    if is_dist_available_and_initialized():
+        sampler = DistributedSampler(loader.dataset, shuffle=shuffle)
+        loader = DataLoader(loader.dataset,  # only the settings listed here are carried over from the old loader
+                            loader.batch_size, 
+                            sampler=sampler, 
+                            drop_last=loader.drop_last, 
+                            collate_fn=loader.collate_fn, 
+                            pin_memory=loader.pin_memory,
+                            num_workers=loader.num_workers, )
+    return loader
+
+
+
+def is_parallel(model) -> bool:
+    # Returns True if model is a DP or DDP wrapper; isinstance also accepts subclasses of either
+    return isinstance(model, (torch.nn.parallel.DataParallel, torch.nn.parallel.DistributedDataParallel))
+
+
+def de_parallel(model) -> nn.Module:
+    # De-parallelize a model: returns the wrapped `.module` if model is of type DP or DDP, else model itself
+    return model.module if is_parallel(model) else model
+
+
+def reduce_dict(data, avg=True):
+    """
+    Args 
+        data dict: input, {k: v, ...}
+        avg bool: true
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return data  # single process: the input dict is returned unchanged
+    
+    with torch.no_grad():
+        keys, values = [], []
+        for k in sorted(data.keys()):  # sorted so the stacking order does not depend on insertion order
+            keys.append(k)
+            values.append(data[k])
+
+        values = torch.stack(values, dim=0)  # requires the values to be stackable tensors
+        torch.distributed.all_reduce(values)  # called with the default reduce op
+
+        if avg is True:  # identity check: only the literal True triggers averaging
+            values /= world_size
+        
+        return {k: v for k, v in zip(keys, values)}
+        
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors)
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return [data]  # nothing to gather from
+    data_list = [None] * world_size  # one output slot per rank
+    torch.distributed.all_gather_object(data_list, data)
+    return data_list
+
+    
+import time 
+def sync_time():
+    """sync_time
+    """
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()  # called before the clock is read
+
+    return time.time()
+
+
+
+def setup_seed(seed: int, deterministic=False):
+    """setup_seed for reproducibility
+    torch.manual_seed(3407) is all you need. https://arxiv.org/abs/2109.08203
+    """
+    seed = seed + get_rank()  # offset by rank so each process gets a different seed
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+    # memory will be large when setting deterministic to True
+    if torch.backends.cudnn.is_available() and deterministic:
+        torch.backends.cudnn.deterministic = True  # only ever switched on here, never reset to False
+
+
+# for torch.compile
+def check_compile():  # returns True when the current GPU's compute capability is in the list below
+    import torch
+    import warnings
+    gpu_ok = False
+    if torch.cuda.is_available():
+        device_cap = torch.cuda.get_device_capability()
+        if device_cap in ((7, 0), (8, 0), (9, 0)):  # the capabilities the warning below names as V100, A100, H100
+            gpu_ok = True
+    if not gpu_ok:
+        warnings.warn(
+            "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower "
+            "than expected."
+        )
+    return gpu_ok
+
+def is_compile(model):  # exact type match against torch._dynamo.OptimizedModule
+    import torch._dynamo
+    return type(model) in (torch._dynamo.OptimizedModule, )
+
+def de_complie(model):  # returns the wrapped `_orig_mod` for a compiled model, else model itself
+    return model._orig_mod if is_compile(model) else model
diff --git a/rtdetrv2_pytorch/src/misc/lazy_loader.py b/rtdetrv2_pytorch/src/misc/lazy_loader.py
new file mode 100644
index 0000000..e99ce59
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/lazy_loader.py
@@ -0,0 +1,70 @@
+"""
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py
+"""
+
+
+import types
+import importlib
+
+class LazyLoader(types.ModuleType):
+  """Lazily import a module, mainly to avoid pulling in large dependencies.
+
+  `paddle`, and `ffmpeg` are examples of modules that are large and not always
+  needed, and this allows them to only be loaded when they are used.
+  """
+
+  # The lint error here is incorrect.
+  def __init__(self, local_name, parent_module_globals, name, warning=None):
+    self._local_name = local_name  # key under which the real module is stored in the parent's globals
+    self._parent_module_globals = parent_module_globals
+    self._warning = warning
+
+    # These members allow doctest to correctly process this module member without
+    # triggering self._load(). self._load() mutates parent_module_globals and
+    # triggers a dict mutated during iteration error from doctest.py.
+    # - for from_module()
+    self.__module__ = name.rsplit(".", 1)[0]
+    # - for is_routine()
+    self.__wrapped__ = None
+
+    super(LazyLoader, self).__init__(name)
+
+  def _load(self):
+    """Load the module and insert it into the parent's globals."""
+    # Import the target module and insert it into the parent's namespace
+    module = importlib.import_module(self.__name__)
+    self._parent_module_globals[self._local_name] = module
+
+    # Emit a warning if one was specified
+    if self._warning:
+      # logging.warning(self._warning)
+      # Make sure to only warn once.
+      self._warning = None  # the logging call above is commented out, so nothing is emitted
+
+    # Update this object's dict so that if someone keeps a reference to the
+    #   LazyLoader, lookups are efficient (__getattr__ is only called on lookups
+    #   that fail).
+    self.__dict__.update(module.__dict__)
+
+    return module
+
+  def __getattr__(self, item):  # reached only when normal attribute lookup fails
+    module = self._load()
+    return getattr(module, item)
+
+  def __repr__(self):
+    # Carefully to not trigger _load, since repr may be called in very
+    # sensitive places.
+    return f"<LazyLoader {self.__name__} as {self._local_name}>"
+
+  def __dir__(self):  # listing attributes triggers the real import
+    module = self._load()
+    return dir(module)
+
+
+# import paddle.nn as nn
+# nn = LazyLoader("nn", globals(), "paddle.nn")
+
+# class M(nn.Layer):
+#     def __init__(self) -> None:
+#       super().__init__()
diff --git a/rtdetrv2_pytorch/src/misc/logger.py b/rtdetrv2_pytorch/src/misc/logger.py
new file mode 100644
index 0000000..2ef0c27
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/logger.py
@@ -0,0 +1,239 @@
+"""
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+https://github.com/facebookresearch/detr/blob/main/util/misc.py
+Mostly copy-paste from torchvision references.
+"""
+
+import time
+import pickle
+import datetime
+from collections import defaultdict, deque
+from typing import Dict
+
+import torch
+import torch.distributed as tdist
+
+from .dist_utils import is_dist_available_and_initialized, get_world_size
+
+
+class SmoothedValue(object):
+    """Track a series of values and provide access to smoothed values over a
+    window or the global series average.
+    """
+
+    def __init__(self, window_size=20, fmt=None):
+        if fmt is None:
+            fmt = "{median:.4f} ({global_avg:.4f})"
+        self.deque = deque(maxlen=window_size)  # keeps only the most recent `window_size` values
+        self.total = 0.0  # running sum of value * n over every update
+        self.count = 0  # running sum of n over every update
+        self.fmt = fmt
+
+    def update(self, value, n=1):
+        self.deque.append(value)  # the window stores `value` once, whatever n is
+        self.count += n
+        self.total += value * n
+
+    def synchronize_between_processes(self):
+        """
+        Warning: does not synchronize the deque!
+        """
+        if not is_dist_available_and_initialized():
+            return
+        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')  # device is hard-coded to CUDA
+        tdist.barrier()
+        tdist.all_reduce(t)  # called with the default reduce op
+        t = t.tolist()
+        self.count = int(t[0])
+        self.total = t[1]
+
+    @property
+    def median(self):  # median of the current window
+        d = torch.tensor(list(self.deque))
+        return d.median().item()
+
+    @property
+    def avg(self):  # mean of the current window
+        d = torch.tensor(list(self.deque), dtype=torch.float32)
+        return d.mean().item()
+
+    @property
+    def global_avg(self):  # mean over every update; divides by count, so it needs at least one update
+        return self.total / self.count
+
+    @property
+    def max(self):  # largest value in the current window
+        return max(self.deque)
+
+    @property
+    def value(self):  # most recently appended value
+        return self.deque[-1]
+
+    def __str__(self):
+        return self.fmt.format(  # every statistic is computed, even those the format string omits
+            median=self.median,
+            avg=self.avg,
+            global_avg=self.global_avg,
+            max=self.max,
+            value=self.value)
+
+
+def all_gather(data):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors)
+    Args:
+        data: any picklable object
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return [data]
+
+    # serialized to a Tensor
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")  # device is hard-coded to CUDA throughout this function
+
+    # obtain Tensor size of each rank
+    local_size = torch.tensor([tensor.numel()], device="cuda")
+    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
+    tdist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
+    if local_size != max_size:
+        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")  # uninitialized filler, sliced off below
+        tensor = torch.cat((tensor, padding), dim=0)
+    tdist.all_gather(tensor_list, tensor)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]  # drop the padding before unpickling
+        data_list.append(pickle.loads(buffer))  # unpickles bytes received from the other ranks
+
+    return data_list
+
+
+def reduce_dict(input_dict, average=True) -> Dict[str, torch.Tensor]:
+    """
+    Args:
+        input_dict (dict): all the values will be reduced
+        average (bool): whether to do average or sum
+    Reduce the values in the dictionary from all processes so that all processes
+    have the averaged results. Returns a dict with the same fields as
+    input_dict, after reduction.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict  # single process: the input dict is returned unchanged
+    with torch.no_grad():
+        names = []
+        values = []
+        # sort the keys so that they are consistent across processes
+        for k in sorted(input_dict.keys()):
+            names.append(k)
+            values.append(input_dict[k])
+        values = torch.stack(values, dim=0)  # requires the values to be stackable tensors
+        tdist.all_reduce(values)  # called with the default reduce op
+        if average:
+            values /= world_size
+        reduced_dict = {k: v for k, v in zip(names, values)}
+    return reduced_dict
+
+
+class MetricLogger(object):  # named SmoothedValue meters plus a periodic progress printer
+    def __init__(self, delimiter="\t"):
+        self.meters = defaultdict(SmoothedValue)  # a meter is created the first time its name is updated
+        self.delimiter = delimiter
+
+    def update(self, **kwargs):
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                v = v.item()  # tensors are converted to a Python number first
+            assert isinstance(v, (float, int))
+            self.meters[k].update(v)
+
+    def __getattr__(self, attr):  # lets a meter be read as logger.<name>
+        if attr in self.meters:
+            return self.meters[attr]
+        if attr in self.__dict__:
+            return self.__dict__[attr]
+        raise AttributeError("'{}' object has no attribute '{}'".format(
+            type(self).__name__, attr))
+
+    def __str__(self):
+        loss_str = []
+        for name, meter in self.meters.items():
+            loss_str.append(
+                "{}: {}".format(name, str(meter))
+            )
+        return self.delimiter.join(loss_str)
+
+    def synchronize_between_processes(self):
+        for meter in self.meters.values():
+            meter.synchronize_between_processes()
+
+    def add_meter(self, name, meter):
+        self.meters[name] = meter
+
+    def log_every(self, iterable, print_freq, header=None):  # generator: yields each item, printing every print_freq steps
+        i = 0
+        if not header:
+            header = ''
+        start_time = time.time()
+        end = time.time()
+        iter_time = SmoothedValue(fmt='{avg:.4f}')
+        data_time = SmoothedValue(fmt='{avg:.4f}')
+        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'  # pad the step counter to the width of the total
+        if torch.cuda.is_available():
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}',
+                'max mem: {memory:.0f}'
+            ])
+        else:
+            log_msg = self.delimiter.join([
+                header,
+                '[{0' + space_fmt + '}/{1}]',
+                'eta: {eta}',
+                '{meters}',
+                'time: {time}',
+                'data: {data}'
+            ])
+        MB = 1024.0 * 1024.0
+        for obj in iterable:
+            data_time.update(time.time() - end)  # time spent waiting for the next item
+            yield obj
+            iter_time.update(time.time() - end)  # waiting time plus the caller's work on the item
+            if i % print_freq == 0 or i == len(iterable) - 1:
+                eta_seconds = iter_time.global_avg * (len(iterable) - i)
+                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+                if torch.cuda.is_available():
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time),
+                        memory=torch.cuda.max_memory_allocated() / MB))
+                else:
+                    print(log_msg.format(
+                        i, len(iterable), eta=eta_string,
+                        meters=str(self),
+                        time=str(iter_time), data=str(data_time)))
+            i += 1
+            end = time.time()
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('{} Total time: {} ({:.4f} s / it)'.format(
+            header, total_time_str, total_time / max(len(iterable), 1)))  # max(): an empty iterable must not divide by zero
+
diff --git a/rtdetrv2_pytorch/src/misc/profiler_utils.py b/rtdetrv2_pytorch/src/misc/profiler_utils.py
new file mode 100644
index 0000000..b63dcba
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/profiler_utils.py
@@ -0,0 +1,65 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import re
+import torch
+import torch.nn as nn
+from torch import Tensor 
+
+from typing import List
+
+def stats(
+    model: nn.Module,  # module to profile; its train/eval mode is restored before returning
+    data: Tensor=None,  # input passed to the model; random data of `input_shape` when None
+    input_shape: List=[1, 3, 640, 640],  # only read, never mutated
+    device: str='cpu', 
+    verbose=False) -> dict:  # returns {'n_parameters', 'n_flops', 'info'}
+    
+    is_training = model.training
+
+    model.train()
+    num_params = sum([p.numel() for p in model.parameters() if p.requires_grad])  # trainable parameters only
+
+    model.eval()
+    model = model.to(device)
+
+    if data is None:
+        data = torch.rand(*input_shape, device=device)
+        
+    def trace_handler(prof):  # not wired in: on_trace_ready is commented out below
+        print(prof.key_averages().table(
+            sort_by="self_cuda_time_total", row_limit=-1))
+
+    num_active = 2
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA,
+        ],
+        schedule=torch.profiler.schedule(
+            wait=1,
+            warmup=1,
+            active=num_active,
+            repeat=1
+        ),
+        # on_trace_ready=trace_handler,
+        # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
+        # with_modules=True,
+        with_flops=True,
+    ) as p:
+        for _ in range(5):
+            _ = model(data)
+            p.step()
+
+    if is_training:
+        model.train()
+    
+    info = p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)
+    num_flops = sum([float(v.strip()) for v in re.findall(r'(\d+.?\d+ *\n)', info)]) / num_active  # raw string: '\d' is an invalid escape otherwise
+
+    if verbose:
+        # print(info)
+        print(f'Total number of trainable parameters: {num_params}')
+        print(f'Total number of flops: {int(num_flops)}M with {input_shape}')
+
+    return {'n_parameters': num_params, 'n_flops': num_flops, 'info': info}
diff --git a/rtdetrv2_pytorch/src/misc/visualizer.py b/rtdetrv2_pytorch/src/misc/visualizer.py
new file mode 100644
index 0000000..b9bb7f8
--- /dev/null
+++ b/rtdetrv2_pytorch/src/misc/visualizer.py
@@ -0,0 +1,34 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.utils.data
+
+import torchvision
+torchvision.disable_beta_transforms_warning()
+
+import PIL 
+
+__all__ = ['show_sample']
+
+def show_sample(sample):
+    """Display one (image, target) sample (coco dataset/dataloader style) with
+    its target boxes drawn in yellow."""
+    import matplotlib.pyplot as plt
+    from torchvision.transforms.v2 import functional as F
+    from torchvision.utils import draw_bounding_boxes
+
+    img, tgt = sample
+    if isinstance(img, PIL.Image.Image):
+        img = F.to_image_tensor(img)
+    img = F.convert_dtype(img, torch.uint8)
+
+    # permute(1, 2, 0) moves the leading axis last before handing the array to imshow.
+    canvas = draw_bounding_boxes(img, tgt["boxes"], colors="yellow", width=3)
+    figure, axes = plt.subplots()
+    axes.imshow(canvas.permute(1, 2, 0).numpy())
+    axes.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
+    figure.tight_layout()
+    figure.show()
+    plt.show()
+
diff --git a/rtdetrv2_pytorch/src/nn/__init__.py b/rtdetrv2_pytorch/src/nn/__init__.py
new file mode 100644
index 0000000..37d12fc
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/__init__.py
@@ -0,0 +1,17 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+from .arch import *
+from .criterion import *
+from .postprocessor import *
+
+# 
+from .backbone import *
+
+
+from .backbone import (
+    get_activation, 
+    FrozenBatchNorm2d,
+    freeze_batch_norm2d,
+)
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/nn/arch/__init__.py b/rtdetrv2_pytorch/src/nn/arch/__init__.py
new file mode 100644
index 0000000..57774ad
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/arch/__init__.py
@@ -0,0 +1,6 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+from .classification import Classification, ClassHead
+from .yolo import YOLO
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/nn/arch/classification.py b/rtdetrv2_pytorch/src/nn/arch/classification.py
new file mode 100644
index 0000000..7b47b0f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/arch/classification.py
@@ -0,0 +1,45 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch 
+import torch.nn as nn
+
+from ...core import register
+
+
+__all__ = ['Classification', 'ClassHead']
+
+
+@register()
+class Classification(torch.nn.Module):
+    """Backbone followed by an optional head."""
+    __inject__ = ['backbone', 'head']
+
+    def __init__(self, backbone: nn.Module, head: nn.Module=None):
+        super().__init__()
+        self.backbone = backbone
+        self.head = head
+
+    def forward(self, x):
+        """Return head(backbone(x)), or the raw backbone output when there is no head."""
+        feats = self.backbone(x)
+        if self.head is None:
+            return feats
+
+        return self.head(feats)
+
+
+@register()
+class ClassHead(nn.Module):
+    """Global average pool followed by a linear projection to `num_classes` outputs."""
+    def __init__(self, hidden_dim, num_classes):
+        super().__init__()
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.proj = nn.Linear(hidden_dim, num_classes)
+
+    def forward(self, x):
+        if isinstance(x, (list, tuple)):
+            x = x[0]  # only the first entry of a list/tuple input is used
+        pooled = self.pool(x)
+        return self.proj(pooled.reshape(pooled.shape[0], -1))
diff --git a/rtdetrv2_pytorch/src/nn/arch/yolo.py b/rtdetrv2_pytorch/src/nn/arch/yolo.py
new file mode 100644
index 0000000..c3c1fae
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/arch/yolo.py
@@ -0,0 +1,33 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+
+from ...core import register
+
+
+__all__ = ['YOLO', ]
+
+
+@register()
+class YOLO(torch.nn.Module):
+    """Detector assembled as backbone -> neck -> head."""
+    __inject__ = ['backbone', 'neck', 'head', ]
+
+    def __init__(self, backbone: torch.nn.Module, neck, head):
+        super().__init__()
+        self.backbone = backbone
+        self.neck = neck
+        self.head = head
+
+    def forward(self, x, **kwargs):
+        # Extra keyword arguments are accepted but not used.
+        return self.head(self.neck(self.backbone(x)))
+
+    def deploy(self, ):
+        """Switch to eval mode and call `deploy()` on every submodule that has one."""
+        self.eval()
+        for sub in self.modules():
+            if sub is not self and hasattr(sub, 'deploy'):
+                sub.deploy()
+        return self
diff --git a/rtdetrv2_pytorch/src/nn/backbone/__init__.py b/rtdetrv2_pytorch/src/nn/backbone/__init__.py
new file mode 100644
index 0000000..b001c3f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/__init__.py
@@ -0,0 +1,18 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from .common import (
+    get_activation, 
+    FrozenBatchNorm2d,
+    freeze_batch_norm2d,
+)
+from .presnet import PResNet
+from .test_resnet import MResNet
+
+from .timm_model import TimmModel
+from .torchvision_model import TorchVisionModel
+
+from .csp_resnet import CSPResNet
+from .csp_darknet import CSPDarkNet, CSPPAN
+
+from .hgnetv2 import HGNetv2
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/nn/backbone/common.py b/rtdetrv2_pytorch/src/nn/backbone/common.py
new file mode 100644
index 0000000..1a6604e
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/common.py
@@ -0,0 +1,97 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn
+
+
+class FrozenBatchNorm2d(nn.Module):
+    """BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copied and modified from
+    https://github.com/facebookresearch/detr/blob/master/models/backbone.py
+    (a copy-paste from torchvision.misc.ops) with eps added before rsqrt,
+    without which any other models than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+
+    def __init__(self, num_features, eps=1e-5):
+        super(FrozenBatchNorm2d, self).__init__()
+        # Registered as buffers, so they appear in the state dict but not in parameters().
+        self.register_buffer("weight", torch.ones(num_features))
+        self.register_buffer("bias", torch.zeros(num_features))
+        self.register_buffer("running_mean", torch.zeros(num_features))
+        self.register_buffer("running_var", torch.ones(num_features))
+        self.eps = eps
+        self.num_features = num_features
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        # This module has no `num_batches_tracked` buffer, so drop that key
+        # before the default loader sees it.
+        state_dict.pop(prefix + 'num_batches_tracked', None)
+
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def forward(self, x):
+        # Reshape first (fuser-friendly), then fold the normalisation into a
+        # single per-channel scale and shift.
+        shape = (1, -1, 1, 1)
+        scale = self.weight.reshape(shape) * (self.running_var.reshape(shape) + self.eps).rsqrt()
+        shift = self.bias.reshape(shape) - self.running_mean.reshape(shape) * scale
+        return x * scale + shift
+
+    def extra_repr(self):
+        return (
+            "{num_features}, eps={eps}".format(**self.__dict__)
+        )
+
+def freeze_batch_norm2d(module: nn.Module) -> nn.Module:
+    """Recursively replace BatchNorm2d with FrozenBatchNorm2d, keeping eps and learned state."""
+    if isinstance(module, nn.BatchNorm2d):
+        frozen = FrozenBatchNorm2d(module.num_features, module.eps)
+        frozen.load_state_dict(module.state_dict(), strict=False)  # absent keys keep defaults
+        return frozen
+    for name, child in module.named_children():
+        setattr(module, name, freeze_batch_norm2d(child))
+    return module
+
+
+def get_activation(act: str, inplace: bool=True):
+    """Build an activation module from its name.
+
+    Args:
+        act: None (-> nn.Identity), an nn.Module (returned unchanged), or one
+            of 'silu', 'swish', 'relu', 'leaky_relu', 'gelu', 'hardsigmoid'
+            (case-insensitive).
+        inplace: value assigned to the module's `inplace` attribute, for
+            activations that have one.
+
+    Raises:
+        RuntimeError: if `act` is not a recognised activation name.
+    """
+    if act is None:
+        return nn.Identity()
+
+    if isinstance(act, nn.Module):
+        return act
+
+    # Name -> module class; 'swish' is an alias of 'silu'.
+    factories = {
+        'silu': nn.SiLU,
+        'swish': nn.SiLU,
+        'relu': nn.ReLU,
+        'leaky_relu': nn.LeakyReLU,
+        'gelu': nn.GELU,
+        'hardsigmoid': nn.Hardsigmoid,
+    }
+    name = act.lower()
+    if name not in factories:
+        raise RuntimeError(f'Unsupported activation {act!r}; expected one of {sorted(factories)}')
+
+    m = factories[name]()
+    if hasattr(m, 'inplace'):
+        m.inplace = inplace
+    return m
diff --git a/rtdetrv2_pytorch/src/nn/backbone/csp_darknet.py b/rtdetrv2_pytorch/src/nn/backbone/csp_darknet.py
new file mode 100644
index 0000000..bb89947
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/csp_darknet.py
@@ -0,0 +1,177 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+import math
+import warnings
+
+from .common import get_activation
+from ...core import register
+
+
+def autopad(k, p=None):
+    # Explicit padding wins; otherwise default to half the kernel size,
+    # computed per element when `k` is not an int.
+    return p if p is not None else (k // 2 if isinstance(k, int) else [x // 2 for x in k])
+
+def make_divisible(c, d):
+    return d * math.ceil(c / d)  # ceil(c / d) whole multiples of d
+    
+
+class Conv(nn.Module):  # bias-free Conv2d -> BatchNorm2d -> activation
+    def __init__(self, cin, cout, k=1, s=1, p=None, g=1, act='silu') -> None:
+        super().__init__()
+        self.conv = nn.Conv2d(cin, cout, kernel_size=k, stride=s, padding=autopad(k, p), groups=g, bias=False)
+        self.bn = nn.BatchNorm2d(cout)
+        self.act = get_activation(act, inplace=True)
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+
+class Bottleneck(nn.Module):  # standard bottleneck: 1x1 Conv -> 3x3 Conv, optional residual add
+    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, act='silu'):
+        super().__init__()
+        hidden = int(c2 * e)  # hidden channels
+        self.cv1 = Conv(c1, hidden, 1, 1, act=act)
+        self.cv2 = Conv(hidden, c2, 3, 1, g=g, act=act)
+        self.add = shortcut and c1 == c2  # the residual needs matching channel counts
+
+    def forward(self, x):
+        y = self.cv2(self.cv1(x))
+        return x + y if self.add else y
+
+
+class C3(nn.Module):
+    """CSP bottleneck with 3 convolutions: two 1x1 branches (one through `n` Bottlenecks), concat, 1x1 fuse."""
+    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act='silu'):  # ch_in, ch_out, number, shortcut, groups, expansion
+        super().__init__()
+        hidden = int(c2 * e)
+        self.cv1 = Conv(c1, hidden, 1, 1, act=act)
+        self.cv2 = Conv(c1, hidden, 1, 1, act=act)
+        self.m = nn.Sequential(*[Bottleneck(hidden, hidden, shortcut, g, e=1.0, act=act) for _ in range(n)])
+        self.cv3 = Conv(2 * hidden, c2, 1, act=act)
+
+    def forward(self, x):
+        return self.cv3(torch.cat([self.m(self.cv1(x)), self.cv2(x)], dim=1))
+
+
+class SPPF(nn.Module):
+    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""
+    def __init__(self, c1, c2, k=5, act='silu'):  # equivalent to SPP(k=(5, 9, 13))
+        super().__init__()
+        hidden = c1 // 2
+        self.cv1 = Conv(c1, hidden, 1, 1, act=act)
+        self.cv2 = Conv(hidden * 4, c2, 1, 1, act=act)  # 4 = cv1 output + three pooled maps
+        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
+
+    def forward(self, x):
+        pyramid = [self.cv1(x)]
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
+            for _ in range(3):  # each pass pools the previous level again
+                pyramid.append(self.m(pyramid[-1]))
+            return self.cv2(torch.cat(pyramid, 1))
+
+
+@register()
+class CSPDarkNet(nn.Module):
+    """CSP-Darknet backbone: stem conv, four downsampling C3 stages, then SPPF."""
+    __share__ = ['depth_multi', 'width_multi']
+
+    def __init__(self, in_channels=3, width_multi=1.0, depth_multi=1.0, return_idx=[2, 3, -1], act='silu', ) -> None:
+        super().__init__()
+
+        # Scale the base widths (rounded up to a multiple of 8) and depths (at least 1).
+        channels = [make_divisible(c * width_multi, 8) for c in (64, 128, 256, 512, 1024)]
+        depths = [max(round(d * depth_multi), 1) for d in (3, 6, 9, 3)]
+
+        stem = Conv(in_channels, channels[0], 6, 2, 2, act=act)
+        self.layers = nn.ModuleList([stem])
+        for cin, cout, depth in zip(channels[:-1], channels[1:], depths):
+            down = Conv(cin, cout, 3, 2, act=act)
+            self.layers.append(nn.Sequential(down, C3(cout, cout, n=depth, act=act)))
+
+        self.layers.append(SPPF(channels[-1], channels[-1], k=5, act=act))
+
+        self.return_idx = return_idx
+        self.out_channels = [channels[i] for i in self.return_idx]
+        self.strides = [[2, 4, 8, 16, 32][i] for i in self.return_idx]
+        self.depths = depths
+        self.act = act
+
+    def forward(self, x):
+        # Run every layer, keep each intermediate output, return those in return_idx.
+        outputs = []
+        for layer in self.layers:
+            x = layer(x)
+            outputs.append(x)
+        return [outputs[i] for i in self.return_idx]
+
+
+@register()
+class CSPPAN(nn.Module):
+    """Path-aggregation neck: a top-down pass followed by a bottom-up pass.
+
+    P5 ---> 1x1  ---------------------------------> concat --> c3 --> det
+             | up                                     | conv /2
+    P4 ---> concat ---> c3 ---> 1x1  -->  concat ---> c3 -----------> det
+                                 | up       | conv /2
+    P3 -----------------------> concat ---> c3 ---------------------> det
+    """
+    __share__ = ['depth_multi', ]
+
+    def __init__(self, in_channels=[256, 512, 1024], depth_multi=1., act='silu') -> None:
+        super().__init__()
+        depth = max(round(3 * depth_multi), 1)
+
+        self.out_channels = in_channels
+        # Top-down path: (cin, cout) pairs walk from the last level to the first.
+        top_down = list(zip(in_channels[::-1], in_channels[::-1][1:]))
+        self.fpn_stems = nn.ModuleList([Conv(cin, cout, 1, 1, act=act) for cin, cout in top_down])
+        self.fpn_csps = nn.ModuleList([C3(cin, cout, depth, False, act=act) for cin, cout in top_down])
+
+        self.pan_stems = nn.ModuleList([Conv(c, c, 3, 2, act=act) for c in in_channels[:-1]])
+        self.pan_csps = nn.ModuleList([C3(c, c, depth, False, act=act) for c in in_channels[1:]])
+
+    def forward(self, feats):
+        # Top-down pass, last feature first: upsample, concat, c3, then 1x1 stem.
+        fpn_feats = []
+        for i, feat in enumerate(feats[::-1]):
+            if i > 0:
+                upsampled = F.interpolate(fpn_feats[-1], scale_factor=2, mode='nearest')
+                feat = torch.concat([upsampled, feat], dim=1)
+                feat = self.fpn_csps[i-1](feat)
+            if i == 0 or i < len(self.fpn_stems):
+                feat = self.fpn_stems[i](feat)
+            fpn_feats.append(feat)
+
+        # Bottom-up pass, first feature first: stride-2 conv, concat, c3.
+        pan_feats = []
+        for i, feat in enumerate(fpn_feats[::-1]):
+            if i > 0:
+                downsampled = self.pan_stems[i-1](pan_feats[-1])
+                feat = torch.concat([downsampled, feat], dim=1)
+                feat = self.pan_csps[i-1](feat)
+            pan_feats.append(feat)
+
+        return pan_feats
+
+
+if __name__ == '__main__':
+    # Smoke test: print the feature-map shapes of the backbone and of the neck.
+    data = torch.rand(1, 3, 320, 640)
+
+    width_multi = 0.75
+    depth_multi = 0.33
+
+    backbone = CSPDarkNet(3, width_multi=width_multi, depth_multi=depth_multi, act='silu')
+    feats = backbone(data)
+    print([f.shape for f in feats])
+
+    neck = CSPPAN(in_channels=backbone.out_channels, depth_multi=depth_multi, act='silu')
+    feats = neck(feats)
+    print([f.shape for f in feats])
diff --git a/rtdetrv2_pytorch/src/nn/backbone/csp_resnet.py b/rtdetrv2_pytorch/src/nn/backbone/csp_resnet.py
new file mode 100644
index 0000000..ee3c493
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/csp_resnet.py
@@ -0,0 +1,277 @@
+"""
+https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.6/ppdet/modeling/backbones/cspresnet.py
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+from collections import OrderedDict
+
+from .common import get_activation
+
+from ...core import register
+
+__all__ = ['CSPResNet']
+
+
+donwload_url = {
+    's': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_s_pretrained_from_paddle.pth',
+    'm': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_m_pretrained_from_paddle.pth',
+    'l': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_l_pretrained_from_paddle.pth',
+    'x': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/CSPResNetb_x_pretrained_from_paddle.pth',
+}
+
+
+class ConvBNLayer(nn.Module):
+    """Bias-free Conv2d, BatchNorm2d and an activation resolved by get_activation."""
+    def __init__(self, ch_in, ch_out, filter_size=3, stride=1, groups=1, padding=0, act=None):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            ch_in, ch_out, filter_size, stride, padding, groups=groups, bias=False)
+        self.bn = nn.BatchNorm2d(ch_out)
+        self.act = get_activation(act)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        normed = self.bn(self.conv(x))
+        return self.act(normed)
+
+class RepVggBlock(nn.Module):
+    """RepVGG block: parallel 3x3 and 1x1 conv-BN branches summed before the
+    activation, optionally weighting the 1x1 branch by a learnable `alpha`.
+    `convert_to_deploy` folds both branches into a single 3x3 conv."""
+    def __init__(self, ch_in, ch_out, act='relu', alpha: bool=False):
+        super().__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=None)
+        self.conv2 = ConvBNLayer(
+            ch_in, ch_out, 1, stride=1, padding=0, act=None)
+        self.act = get_activation(act)
+
+        if alpha:
+            self.alpha = nn.Parameter(torch.ones(1, ))
+        else:
+            self.alpha = None
+
+    def forward(self, x):
+        if hasattr(self, 'conv'):
+            # Deploy mode: the fused conv replaces both branches.
+            y = self.conv(x)
+        elif self.alpha is not None:
+            # Test for None, not truthiness: a learned alpha of 0 must still scale the branch.
+            y = self.conv1(x) + self.alpha * self.conv2(x)
+        else:
+            y = self.conv1(x) + self.conv2(x)
+        return self.act(y)
+
+    def convert_to_deploy(self):
+        """Create (once) the fused 3x3 conv and load the equivalent kernel and bias into it."""
+        # Compute the fused tensors first so a failure cannot leave an
+        # uninitialised `conv` behind for forward() to pick up.
+        kernel, bias = self.get_equivalent_kernel_bias()
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
+        self.conv.weight.data = kernel
+        self.conv.bias.data = bias
+
+    def get_equivalent_kernel_bias(self):
+        """Return the (kernel, bias) of the single 3x3 conv equivalent to both branches."""
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        kernel1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)
+
+        if self.alpha is not None:
+            return kernel3x3 + self.alpha * kernel1x1, bias3x3 + self.alpha * bias1x1
+        return kernel3x3 + kernel1x1, bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        # Zero-pad a 1x1 kernel to 3x3; None stands for a missing branch.
+        if kernel1x1 is None:
+            return 0
+        return F.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch: ConvBNLayer):
+        """Fold the branch's BatchNorm (running statistics) into its conv weight and a bias."""
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        bn = branch.bn  # ConvBNLayer names its BatchNorm `bn`; it has no `norm` attribute
+        std = (bn.running_var + bn.eps).sqrt()
+        t = (bn.weight / std).reshape(-1, 1, 1, 1)
+        return kernel * t, bn.bias - bn.running_mean * bn.weight / std
+
+
+class BasicBlock(nn.Module):
+    """3x3 ConvBNLayer followed by a RepVggBlock, with an optional identity shortcut.
+
+    Args:
+        ch_in, ch_out: channel counts; asserted to be equal.
+        act: activation passed to both sub-layers.
+        shortcut: add the input to the output when True.
+        use_alpha: forwarded to RepVggBlock as `alpha`.
+    """
+
+    def __init__(self, ch_in, ch_out, act='relu', shortcut=True, use_alpha=False):
+        super().__init__()
+        assert ch_in == ch_out
+        self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        return x + y if self.shortcut else y
+
+
+class EffectiveSELayer(nn.Module):
+    """Effective Squeeze-Excitation.
+
+    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
+    """
+
+    def __init__(self, channels, act='hardsigmoid'):
+        super().__init__()
+        self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0)
+        self.act = get_activation(act)
+
+    def forward(self, x: torch.Tensor):
+        # Gate = act(fc(mean over dims 2 and 3)); the input is multiplied by it.
+        gate = self.act(self.fc(x.mean((2, 3), keepdim=True)))
+        return x * gate
+
+
+class CSPResStage(nn.Module):
+    """CSP stage built from `block_fn` blocks.
+
+    Optional stride-2 conv, then two 1x1 branches (one of them through `n`
+    blocks), channel concat, optional EffectiveSELayer and a 1x1 output conv.
+    """
+
+    def __init__(self,
+                 block_fn,
+                 ch_in,
+                 ch_out,
+                 n,
+                 stride,
+                 act='relu',
+                 attn='eca',
+                 use_alpha=False):
+        super().__init__()
+        ch_mid = (ch_in + ch_out) // 2
+        half = ch_mid // 2
+
+        # Only stride == 2 adds a downsampling conv; any other value adds none.
+        self.conv_down = ConvBNLayer(
+            ch_in, ch_mid, 3, stride=2, padding=1, act=act) if stride == 2 else None
+        self.conv1 = ConvBNLayer(ch_mid, half, 1, act=act)
+        self.conv2 = ConvBNLayer(ch_mid, half, 1, act=act)
+        blocks = [
+            block_fn(half, half, act=act, shortcut=True, use_alpha=use_alpha)
+            for _ in range(n)
+        ]
+        self.blocks = nn.Sequential(*blocks)
+
+        # Any truthy `attn` selects EffectiveSELayer; the value itself is not inspected.
+        self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') if attn else None
+
+        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
+
+    def forward(self, x):
+        if self.conv_down is not None:
+            x = self.conv_down(x)
+        branches = [self.conv1(x), self.blocks(self.conv2(x))]
+        y = torch.concat(branches, dim=1)
+        if self.attn is not None:
+            y = self.attn(y)
+        return self.conv3(y)
+
+
+@register()
+class CSPResNet(nn.Module):
+    """CSPResNet backbone in sizes 's', 'm', 'l' and 'x'.
+
+    Args:
+        name: key of `model_cfg` selecting the depth/width multipliers.
+        act: activation name or module, resolved by get_activation.
+        return_idx: indices of the stages whose outputs forward() returns.
+        use_large_stem: use a three-conv stem instead of a two-conv one.
+        use_alpha: forwarded to every CSPResStage.
+        pretrained: False for random init; True to download the weights for
+            `name`; a URL (contains 'http') to download from it; any other
+            string is treated as a local checkpoint path.
+    """
+    # Base depth of the four stages and base widths (stem + four stages), scaled per size.
+    layers = [3, 6, 6, 3]
+    channels = [64, 128, 256, 512, 1024]
+    model_cfg = {
+        's': {'depth_mult': 0.33, 'width_mult': 0.50, },
+        'm': {'depth_mult': 0.67, 'width_mult': 0.75, },
+        'l': {'depth_mult': 1.00, 'width_mult': 1.00, },
+        'x': {'depth_mult': 1.33, 'width_mult': 1.25, },
+    }
+
+    def __init__(self,
+                 name: str,
+                 act='silu',
+                 return_idx=[1, 2, 3],
+                 use_large_stem=True,
+                 use_alpha=False,
+                 pretrained=False):
+
+        super().__init__()
+        depth_mult = self.model_cfg[name]['depth_mult']
+        width_mult = self.model_cfg[name]['width_mult']
+
+        channels = [max(round(c * width_mult), 1) for c in self.channels]
+        layers = [max(round(l * depth_mult), 1) for l in self.layers]
+        act = get_activation(act)
+
+        # (name, in, out, stride) of each 3x3 stem conv; only the first one has stride 2.
+        half = channels[0] // 2
+        if use_large_stem:
+            stem_cfg = [('conv1', 3, half, 2), ('conv2', half, half, 1), ('conv3', half, channels[0], 1)]
+        else:
+            stem_cfg = [('conv1', 3, half, 2), ('conv2', half, channels[0], 1)]
+        self.stem = nn.Sequential(OrderedDict([
+            (key, ConvBNLayer(cin, cout, 3, stride=stride, padding=1, act=act))
+            for key, cin, cout, stride in stem_cfg]))
+
+        # One stride-2 CSPResStage per consecutive pair of channel widths.
+        n = len(channels) - 1
+        self.stages = nn.Sequential(OrderedDict([(str(i), CSPResStage(
+            BasicBlock,
+            channels[i],
+            channels[i + 1],
+            layers[i],
+            2,
+            act=act,
+            use_alpha=use_alpha)) for i in range(n)]))
+
+        # Channels and cumulative strides of every stage output.
+        self._out_channels = channels[1:]
+        self._out_strides = [4 * 2**i for i in range(n)]
+        self.return_idx = return_idx
+
+        if pretrained:
+            if isinstance(pretrained, bool):
+                state = torch.hub.load_state_dict_from_url(donwload_url[name], map_location='cpu')
+            elif 'http' in pretrained:
+                # Honour a caller-supplied URL instead of silently using the default one.
+                state = torch.hub.load_state_dict_from_url(pretrained, map_location='cpu')
+            else:
+                state = torch.load(pretrained, map_location='cpu')
+            self.load_state_dict(state)
+            print(f'Load CSPResNet_{name} state_dict')
+
+    def forward(self, x):
+        """Return the outputs of the stages listed in `return_idx`, in stage order."""
+        x = self.stem(x)
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+
+        return outs
diff --git a/rtdetrv2_pytorch/src/nn/backbone/hgnetv2.py b/rtdetrv2_pytorch/src/nn/backbone/hgnetv2.py
new file mode 100644
index 0000000..31cabbb
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/hgnetv2.py
@@ -0,0 +1,428 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+
+https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.init as init
+import torch.nn.functional as F
+
+from torch import Tensor
+from typing import List, Tuple
+
+from .common import FrozenBatchNorm2d
+from ...core import register
+
+
+__all__ = ['HGNetv2']
+
+
+class LearnableAffineBlock(nn.Module):  # y = scale * x + bias, with two learnable 1-element tensors
+    def __init__(self, scale_value=1.0, bias_value=0.0):
+        super().__init__()
+        scale, bias = torch.tensor([scale_value]), torch.tensor([bias_value])
+        self.scale, self.bias = nn.Parameter(scale), nn.Parameter(bias)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.scale * x + self.bias
+
+
+class ConvBNAct(nn.Module):
+    """Bias-free Conv2d + BatchNorm2d, optionally followed by ReLU and a
+    LearnableAffineBlock.
+
+    Args:
+        in_channels, out_channels, kernel_size, stride, groups: passed to Conv2d.
+        padding: 'same' zero-pads the input by one pixel on the right and
+            bottom and leaves the conv unpadded; any other value is ignored
+            and (kernel_size - 1) // 2 is used as the conv padding.
+        use_act: append a ReLU.
+        use_lab: append a LearnableAffineBlock (only when `use_act` is set).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=0,
+                 groups=1,
+                 use_act=True,
+                 use_lab=False):
+        super().__init__()
+        self.use_act = use_act
+        self.use_lab = use_lab
+
+        if padding == 'same':
+            pad = nn.ZeroPad2d([0, 1, 0, 1])
+            conv = nn.Conv2d(
+                in_channels, out_channels, kernel_size, stride, groups=groups, bias=False)
+            self.conv = nn.Sequential(pad, conv)
+        else:
+            self.conv = nn.Conv2d(
+                in_channels, out_channels, kernel_size, stride,
+                padding=(kernel_size - 1) // 2, groups=groups, bias=False)
+
+        self.bn = nn.BatchNorm2d(out_channels)
+        if self.use_act:
+            self.act = nn.ReLU()
+            if self.use_lab:
+                self.lab = LearnableAffineBlock()
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.bn(self.conv(x))
+        if not self.use_act:
+            return x
+        # The affine block exists only downstream of the activation.
+        x = self.act(x)
+        return self.lab(x) if self.use_lab else x
+
+
+class LightConvBNAct(nn.Module):
+    """Pointwise 1x1 ConvBNAct (no activation) followed by a depthwise
+    ConvBNAct (groups == out_channels, with activation).
+
+    Args:
+        in_channels: input channels of the pointwise conv.
+        out_channels: output channels of both convs.
+        kernel_size: kernel size of the depthwise conv.
+        stride: accepted but not used; it is not passed to either ConvBNAct.
+        groups: accepted but not used; the depthwise conv uses out_channels groups.
+        use_lab: forwarded to both ConvBNAct layers.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 use_lab=False):
+        super().__init__()
+        self.conv1 = ConvBNAct(
+            in_channels, out_channels, kernel_size=1, use_act=False, use_lab=use_lab)
+        self.conv2 = ConvBNAct(
+            out_channels, out_channels, kernel_size=kernel_size, groups=out_channels,
+            use_act=True, use_lab=use_lab)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.conv2(self.conv1(x))
+
+
+class StemBlock(nn.Module):
+    """HGNetv2 stem.
+
+    stem1 (3x3, stride 2) feeds two parallel paths -- two 2x2 'same' convs
+    (stem2a, stem2b) and a padded 2x2 max-pool -- whose outputs are
+    concatenated, reduced by stem3 (3x3, stride 2) and projected by stem4 (1x1).
+
+    Args:
+        in_channels: channels of the input.
+        mid_channels: width used inside the stem.
+        out_channels: channels produced by stem4.
+        use_lab: forwarded to every ConvBNAct.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 use_lab=False):
+        super().__init__()
+        half = mid_channels // 2
+
+        self.stem1 = ConvBNAct(
+            in_channels, mid_channels,
+            kernel_size=3, stride=2, use_lab=use_lab,
+        )
+        self.stem2a = ConvBNAct(
+            mid_channels, half,
+            kernel_size=2, stride=1, padding='same', use_lab=use_lab,
+        )
+        self.stem2b = ConvBNAct(
+            half, mid_channels,
+            kernel_size=2, stride=1, padding='same', use_lab=use_lab,
+        )
+
+        # stem3 takes the pooled and the convolved path concatenated: 2 * mid_channels.
+        self.stem3 = ConvBNAct(
+            mid_channels * 2, mid_channels,
+            kernel_size=3, stride=2, use_lab=use_lab,
+        )
+        self.stem4 = ConvBNAct(
+            mid_channels, out_channels,
+            kernel_size=1, stride=1, use_lab=use_lab,
+        )
+
+        # Pad right/bottom by one pixel so that the stride-1 2x2 pool returns
+        # a map of the same spatial size as its input.
+        self.pool = nn.Sequential(
+            nn.ZeroPad2d([0, 1, 0, 1]),
+            nn.MaxPool2d(2, 1, ceil_mode=True)
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.stem1(x)
+        # Conv path first, then the pool path, both on the stem1 output.
+        conv_path = self.stem2b(self.stem2a(x))
+        pool_path = self.pool(x)
+
+        merged = torch.concat([pool_path, conv_path], dim=1)
+        return self.stem4(self.stem3(merged))
+
+
+class HG_Block(nn.Module):
+    """HGNetv2 block: a chain of conv layers whose input and every
+    intermediate output are concatenated, then squeezed and expanded by two
+    1x1 convs.
+
+    Args:
+        in_channels: channels of the block input.
+        mid_channels: output channels of each layer in the chain.
+        out_channels: channels of the block output.
+        kernel_size: kernel size of the chained layers.
+        layer_num: number of chained layers.
+        identity: add the block input to the output (needs matching shapes).
+        light_block: chain LightConvBNAct layers instead of ConvBNAct.
+        use_lab: forwarded to every ConvBNAct / LightConvBNAct.
+    """
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 kernel_size=3,
+                 layer_num=6,
+                 identity=False,
+                 light_block=True,
+                 use_lab=False):
+        super().__init__()
+        self.identity = identity
+
+        # Pick the class directly; eval() on a class name is unnecessary and unsafe.
+        block_cls = LightConvBNAct if light_block else ConvBNAct
+        self.layers = nn.ModuleList([
+            block_cls(in_channels=in_channels if i == 0 else mid_channels,
+                      out_channels=mid_channels,
+                      stride=1,
+                      kernel_size=kernel_size,
+                      use_lab=use_lab)
+            for i in range(layer_num)])
+
+        # Feature aggregation over the input plus every layer output.
+        total_channels = in_channels + layer_num * mid_channels
+        self.aggregation_squeeze_conv = ConvBNAct(
+            total_channels, out_channels // 2, kernel_size=1, stride=1, use_lab=use_lab)
+        self.aggregation_excitation_conv = ConvBNAct(
+            out_channels // 2, out_channels, kernel_size=1, stride=1, use_lab=use_lab)
+
+    def forward(self, x):
+        output = [x]
+        for layer in self.layers:
+            output.append(layer(output[-1]))
+        y = self.aggregation_excitation_conv(
+            self.aggregation_squeeze_conv(torch.concat(output, dim=1)))
+        return y + x if self.identity else y
+
+
+class HG_Stage(nn.Module):
+    """HGNetv2 stage: optional depthwise stride-2 downsample, then `block_num` HG_Blocks.
+
+    The first block maps in_channels -> out_channels without a shortcut; the
+    remaining blocks map out_channels -> out_channels with the identity shortcut.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 block_num,
+                 layer_num=6,
+                 downsample=True,
+                 light_block=True,
+                 kernel_size=3,
+                 use_lab=False):
+        super().__init__()
+        # Holds the falsy flag when disabled, otherwise the downsampling module.
+        self.downsample = downsample
+        if downsample:
+            self.downsample = ConvBNAct(
+                in_channels, in_channels, kernel_size=3, stride=2,
+                groups=in_channels, use_act=False, use_lab=use_lab)
+
+        self.blocks = nn.Sequential(*[
+            HG_Block(
+                in_channels=out_channels if i else in_channels,
+                mid_channels=mid_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                layer_num=layer_num,
+                identity=i != 0,
+                light_block=light_block,
+                use_lab=use_lab)
+            for i in range(block_num)])
+
+    def forward(self, x):
+        if self.downsample:
+            x = self.downsample(x)
+
+        return self.blocks(x)
+
+
+@register()
+class HGNetv2(nn.Module):
+    """
+    Args:
+        name: str. Key of `arch_configs` selecting the variant ('L', 'X' or 'H').
+        use_lab: boolean. Whether to use LearnableAffineBlock in network.
+        return_idx: list. Indices of the stages whose outputs are returned by `forward`.
+        freeze_at: int. If >= 0, freeze the stem and the first `freeze_at` stages.
+        freeze_norm: boolean. Whether to replace BatchNorm2d layers with FrozenBatchNorm2d.
+        pretrained: bool or str. Falsy to skip loading; otherwise selects the state dict to load.
+    """
+
+    arch_configs = {
+        'L': {
+            'stem_channels': [3, 32, 48],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [48, 48, 128, 1, False, False, 3, 6],
+                "stage2": [128, 96, 512, 1, True, False, 3, 6],
+                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
+                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
+            },
+            'url': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/PPHGNetV2_L_ssld_pretrained_from_paddle.pth',
+
+        },
+        'X': {
+            'stem_channels': [3, 32, 64],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [64, 64, 128, 1, False, False, 3, 6],
+                "stage2": [128, 128, 512, 2, True, False, 3, 6],
+                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
+                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
+            },
+            'url': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/PPHGNetV2_X_ssld_pretrained_from_paddle.pth',
+
+        },
+        'H': {
+            'stem_channels': [3, 48, 96],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [96, 96, 192, 2, False, False, 3, 6],
+                "stage2": [192, 192, 512, 3, True, False, 3, 6],
+                "stage3": [512, 384, 1024, 6, True, True, 5, 6],
+                "stage4": [1024, 768, 2048, 3, True, True, 5, 6],
+            },
+            'url': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/PPHGNetV2_H_ssld_pretrained_from_paddle.pth',
+        }
+    }
+
+    def __init__(self,
+                 name,
+                 use_lab=False,
+                 return_idx=[1, 2, 3],
+                 freeze_at=-1,
+                 freeze_norm=False,
+                 pretrained=False):
+        """Build the HGNetv2 variant `name` (a key of `arch_configs`).
+
+        `return_idx` lists the stage indices whose outputs `forward` returns.
+        `freeze_at >= 0` freezes the stem plus the first `freeze_at` stages.
+        `pretrained` may be True (default URL), an http(s) URL, or a local path.
+        """
+        super().__init__()
+        self.use_lab = use_lab
+        # copy so instances never alias the shared default list
+        self.return_idx = list(return_idx)
+
+        cfg = self.arch_configs[name]
+        stem_channels = cfg['stem_channels']
+        stage_config = cfg['stage_config']
+
+        self._out_strides = [4, 8, 16, 32]
+        self._out_channels = [stage_config[k][2] for k in stage_config]
+
+        # stem
+        self.stem = StemBlock(
+            in_channels=stem_channels[0],
+            mid_channels=stem_channels[1],
+            out_channels=stem_channels[2],
+            use_lab=use_lab)
+
+        # stages; each config row is ordered as
+        # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+        self.stages = nn.ModuleList()
+        for in_c, mid_c, out_c, block_num, downsample, light_block, kernel_size, layer_num in stage_config.values():
+            self.stages.append(
+                HG_Stage(in_c, mid_c, out_c, block_num, layer_num,
+                         downsample, light_block, kernel_size, use_lab))
+
+        self._init_weights()
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.stem)
+            for i in range(min(freeze_at, len(self.stages))):
+                self._freeze_parameters(self.stages[i])
+
+        if freeze_norm:
+            self._freeze_norm(self)
+
+        if pretrained:
+            if isinstance(pretrained, bool):
+                state = torch.hub.load_state_dict_from_url(cfg['url'], map_location='cpu')
+            elif 'http' in pretrained:
+                # a URL passed by the caller is honoured instead of the default one
+                state = torch.hub.load_state_dict_from_url(pretrained, map_location='cpu')
+            else:
+                state = torch.load(pretrained, map_location='cpu')
+            self.load_state_dict(state)
+            print(f'Load HGNetv2_{name} state_dict')
+        
+
+    def _init_weights(self):
+        for module in self.modules():  # walks every submodule, nested ones included
+            if isinstance(module, nn.Linear):
+                init.constant_(module.bias, 0)
+            elif isinstance(module, nn.BatchNorm2d):
+                init.constant_(module.weight, 1)
+                init.constant_(module.bias, 0)
+            elif isinstance(module, nn.Conv2d):
+                init.kaiming_normal_(module.weight)
+
+    def _freeze_parameters(self, m: nn.Module):
+        for param in m.parameters():  # exclude every parameter of `m` from gradient updates
+            param.requires_grad_(False)
+
+    def _freeze_norm(self, m: nn.Module):
+        # recursively swap each BatchNorm2d for a FrozenBatchNorm2d with the same feature count
+        if isinstance(m, nn.BatchNorm2d):
+            return FrozenBatchNorm2d(m.num_features)
+        for name, child in m.named_children():
+            replacement = self._freeze_norm(child)
+            if replacement is not child:
+                setattr(m, name, replacement)
+        return m
+
+
+    def forward(self, x: Tensor) -> List[Tensor]:
+        """Run stem and stages; collect outputs of the stages listed in `self.return_idx`."""
+        feat, picked = self.stem(x), []
+        for idx, stage in enumerate(self.stages):
+            feat = stage(feat)
+            if idx in self.return_idx:
+                picked.append(feat)
+        return picked
+
+
+
+if __name__ == '__main__':
+    # smoke test: build the 'X' variant and push one random (1, 3, 640, 640) tensor through it
+    model = HGNetv2(name='X', pretrained=False, freeze_at=-1, freeze_norm=False)
+    dummy = torch.randn(1, 3, 640, 640)
+
+    feats = model(dummy)
+    print([f.shape for f in feats])
+    # make sure a backward pass from the first returned feature map runs
+    feats[0].mean().backward()
diff --git a/rtdetrv2_pytorch/src/nn/backbone/presnet.py b/rtdetrv2_pytorch/src/nn/backbone/presnet.py
new file mode 100644
index 0000000..baf8594
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/presnet.py
@@ -0,0 +1,245 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+import torch
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+from collections import OrderedDict
+
+from .common import get_activation, FrozenBatchNorm2d
+
+from ...core import register
+
+
+__all__ = ['PResNet']
+
+
+ResNet_cfg = {
+    18: [2, 2, 2, 2],
+    34: [3, 4, 6, 3],
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    # 152: [3, 8, 36, 3],
+}
+
+
+donwload_url = {
+    18: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet18_vd_pretrained_from_paddle.pth',
+    34: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet34_vd_pretrained_from_paddle.pth',
+    50: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet50_vd_ssld_v2_pretrained_from_paddle.pth',
+    101: 'https://github.com/lyuwenyu/storage/releases/download/v0.1/ResNet101_vd_ssld_pretrained_from_paddle.pth',
+}
+
+
+class ConvNormLayer(nn.Module):
+    """Conv2d -> BatchNorm2d -> activation; padding defaults to (kernel_size - 1) // 2."""
+    def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
+        super().__init__()
+        if padding is None:
+            padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(ch_in, ch_out, kernel_size, stride, padding=padding, bias=bias)
+        self.norm = nn.BatchNorm2d(ch_out)
+        self.act = get_activation(act)
+
+    def forward(self, x):
+        y = self.conv(x)
+        y = self.norm(y)
+        y = self.act(y)
+        return y
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 ConvNormLayers (output channels = ch_out)."""
+    expansion = 1
+
+    def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'):
+        super().__init__()
+
+        # True -> the skip path is the raw input; False -> it goes through `self.short`
+        self.shortcut = shortcut
+
+        if not shortcut:
+            if variant != 'd' or stride != 2:
+                self.short = ConvNormLayer(ch_in, ch_out, 1, stride)
+            else:
+                # variant 'd' with stride 2: average-pool, then a stride-1 1x1 projection
+                pool = nn.AvgPool2d(2, 2, 0, ceil_mode=True)
+                proj = ConvNormLayer(ch_in, ch_out, 1, 1)
+                self.short = nn.Sequential(OrderedDict([('pool', pool), ('conv', proj)]))
+
+        self.branch2a = ConvNormLayer(ch_in, ch_out, 3, stride, act=act)
+        self.branch2b = ConvNormLayer(ch_out, ch_out, 3, 1, act=None)
+        if act is None:
+            self.act = nn.Identity()
+        else:
+            self.act = get_activation(act)
+
+    def forward(self, x):
+        out = self.branch2b(self.branch2a(x))
+
+        skip = x if self.shortcut else self.short(x)
+
+        # residual sum followed by the block activation
+        out = self.act(out + skip)
+        return out
+
+
+class BottleNeck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 residual block whose output has ch_out * expansion channels."""
+    expansion = 4
+
+    def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'):
+        super().__init__()
+
+        # variant 'a' strides in the first 1x1 conv, every other variant in the 3x3 conv
+        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)
+
+        out_dim = ch_out * self.expansion
+
+        self.branch2a = ConvNormLayer(ch_in, ch_out, 1, stride1, act=act)
+        self.branch2b = ConvNormLayer(ch_out, ch_out, 3, stride2, act=act)
+        self.branch2c = ConvNormLayer(ch_out, out_dim, 1, 1)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant != 'd' or stride != 2:
+                self.short = ConvNormLayer(ch_in, out_dim, 1, stride)
+            else:
+                # variant 'd' with stride 2: average-pool, then a stride-1 1x1 projection
+                pool = nn.AvgPool2d(2, 2, 0, ceil_mode=True)
+                proj = ConvNormLayer(ch_in, out_dim, 1, 1)
+                self.short = nn.Sequential(OrderedDict([('pool', pool), ('conv', proj)]))
+
+        if act is None:
+            self.act = nn.Identity()
+        else:
+            self.act = get_activation(act)
+
+    def forward(self, x):
+        out = self.branch2a(x)
+        out = self.branch2b(out)
+        out = self.branch2c(out)
+
+        # skip path: raw input when `shortcut` is set, projection otherwise
+        skip = x if self.shortcut else self.short(x)
+
+        out = out + skip
+        out = self.act(out)
+
+        return out
+
+
+class Blocks(nn.Module):
+    """A stage of `count` residual blocks applied one after another."""
+    def __init__(self, block, ch_in, ch_out, count, stage_num, act='relu', variant='b'):
+        super().__init__()
+
+        self.blocks = nn.ModuleList()
+        for i in range(count):
+            is_first = i == 0
+            # only the first block strides (never in stage 2) and is built with shortcut=False
+            layer = block(
+                ch_in,
+                ch_out,
+                stride=2 if is_first and stage_num != 2 else 1,
+                shortcut=not is_first,
+                variant=variant,
+                act=act)
+            self.blocks.append(layer)
+            if is_first:
+                ch_in = ch_out * block.expansion
+
+    def forward(self, x):
+        for layer in self.blocks:
+            x = layer(x)
+        return x
+
+
+@register()
+class PResNet(nn.Module):
+    def __init__(
+        self,
+        depth,
+        variant='d',
+        num_stages=4,
+        return_idx=[0, 1, 2, 3],
+        act='relu',
+        freeze_at=-1,
+        freeze_norm=True,
+        pretrained=False):
+        """Build a ResNet backbone of `depth` (a key of `ResNet_cfg`).
+
+        `pretrained` may be True (default URL), an http(s) URL, or a local checkpoint path.
+        """
+        super().__init__()
+        block_nums = ResNet_cfg[depth]
+        ch_in = 64
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, "conv1_1"]]
+        self.conv1 = nn.Sequential(OrderedDict([
+            (name, ConvNormLayer(cin, cout, k, s, act=act)) for cin, cout, k, s, name in conv_def
+        ]))
+
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+        _out_channels = [block.expansion * v for v in ch_out_list]
+        _out_strides = [4, 8, 16, 32]
+
+        self.res_layers = nn.ModuleList()
+        for i in range(num_stages):
+            stage_num = i + 2
+            self.res_layers.append(
+                Blocks(block, ch_in, ch_out_list[i], block_nums[i], stage_num, act=act, variant=variant))
+            ch_in = _out_channels[i]
+
+        self.return_idx = list(return_idx)  # copy: never alias the shared default list
+        self.out_channels = [_out_channels[_i] for _i in return_idx]
+        self.out_strides = [_out_strides[_i] for _i in return_idx]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            for i in range(min(freeze_at, num_stages)):
+                self._freeze_parameters(self.res_layers[i])
+        if freeze_norm:
+            self._freeze_norm(self)
+
+        if pretrained:
+            if isinstance(pretrained, bool) or 'http' in pretrained:
+                url = donwload_url[depth] if isinstance(pretrained, bool) else pretrained
+                state = torch.hub.load_state_dict_from_url(url, map_location='cpu')
+            else:
+                state = torch.load(pretrained, map_location='cpu')
+            self.load_state_dict(state)
+            print(f'Load PResNet{depth} state_dict')
+
+    def _freeze_parameters(self, m: nn.Module):
+        for param in m.parameters():  # stop gradient updates for everything inside `m`
+            param.requires_grad_(False)
+
+    def _freeze_norm(self, m: nn.Module):
+        # a BatchNorm2d is replaced outright; any other module is searched child by child
+        if isinstance(m, nn.BatchNorm2d):
+            return FrozenBatchNorm2d(m.num_features)
+        for child_name, child in m.named_children():
+            frozen = self._freeze_norm(child)
+            if frozen is not child:
+                setattr(m, child_name, frozen)
+        return m
+
+    def forward(self, x):
+        """Stem convs, 3x3 stride-2 max-pool, then the stages; returns the maps picked by `self.return_idx`."""
+        feat = F.max_pool2d(self.conv1(x), kernel_size=3, stride=2, padding=1)
+        picked = []
+        for idx, stage in enumerate(self.res_layers):
+            feat = stage(feat)
+            if idx in self.return_idx:
+                picked.append(feat)
+        return picked
+
+
diff --git a/rtdetrv2_pytorch/src/nn/backbone/test_resnet.py b/rtdetrv2_pytorch/src/nn/backbone/test_resnet.py
new file mode 100644
index 0000000..72740d4
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/test_resnet.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+from collections import OrderedDict
+
+
+from ...core import register
+
+
+class BasicBlock(nn.Module):
+    """Plain two-conv residual block; the skip path is projected when the shape changes."""
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super().__init__()
+        out_planes = self.expansion * planes
+        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        # an empty Sequential acts as the identity skip path
+        projection = []
+        if stride != 1 or in_planes != out_planes:
+            projection.append(nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False))
+            projection.append(nn.BatchNorm2d(out_planes))
+        self.shortcut = nn.Sequential(*projection)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += self.shortcut(x)
+        return F.relu(out)
+
+
+
+class _ResNet(nn.Module):
+    """Small ResNet classifier: 3x3 stem, four residual stages, avg-pool and a linear head."""
+    def __init__(self, block, num_blocks, num_classes=10):
+        super().__init__()
+        # running input width, advanced by `_make_layer` as stages are built
+        self.in_planes = 64
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+
+        # (width, first-block stride) per stage; only the first stage uses stride 1
+        stage_specs = [(64, 1), (128, 2), (256, 2), (512, 2)]
+        for i, (planes, stride) in enumerate(stage_specs):
+            setattr(self, f'layer{i + 1}', self._make_layer(block, planes, num_blocks[i], stride=stride))
+
+        self.linear = nn.Linear(512 * block.expansion, num_classes)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        layers = []
+        # only the first block of a stage uses the requested stride
+        for block_stride in [stride] + [1] * (num_blocks - 1):
+            layers.append(block(self.in_planes, planes, block_stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            out = stage(out)
+        # 4x4 average pooling, flatten per sample, then classify
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        return self.linear(out)
+        
+
+@register()
+class MResNet(nn.Module):
+    """Registered wrapper that delegates to a `_ResNet` built from `BasicBlock`s."""
+    def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None:
+        super().__init__()
+        self.model = _ResNet(block=BasicBlock, num_blocks=num_blocks, num_classes=num_classes)
+    def forward(self, x):
+        return self.model(x)
+
diff --git a/rtdetrv2_pytorch/src/nn/backbone/timm_model.py b/rtdetrv2_pytorch/src/nn/backbone/timm_model.py
new file mode 100644
index 0000000..7fa19c0
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/timm_model.py
@@ -0,0 +1,70 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+
+https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055#0583
+"""
+
+import torch
+from torchvision.models.feature_extraction import get_graph_node_names, create_feature_extractor
+
+from .utils import IntermediateLayerGetter
+from ...core import register
+
+
+@register()
+class TimmModel(torch.nn.Module):
+    """Backbone wrapper around a `timm` model that returns selected intermediate features."""
+    def __init__(self,
+        name,
+        return_layers,
+        pretrained=False,
+        exportable=True,
+        features_only=True,
+        **kwargs) -> None:
+
+        super().__init__()
+
+        # imported here, so timm is only required when a TimmModel is constructed
+        import timm
+        model = timm.create_model(
+            name,
+            pretrained=pretrained,
+            exportable=exportable,
+            features_only=features_only,
+            **kwargs
+        )
+
+        module_names = model.feature_info.module_name()
+        assert set(return_layers).issubset(module_names), \
+            f'return_layers should be a subset of {module_names}'
+
+        self.model = IntermediateLayerGetter(model, return_layers)
+
+        # positions of the requested layers inside timm's feature_info tables
+        return_idx = [module_names.index(layer) for layer in return_layers]
+        reductions = model.feature_info.reduction()
+        widths = model.feature_info.channels()
+        self.strides = [reductions[i] for i in return_idx]
+        self.channels = [widths[i] for i in return_idx]
+        self.return_idx = return_idx
+        self.return_layers = return_layers
+
+    def forward(self, x: torch.Tensor):
+        # whatever the wrapped IntermediateLayerGetter collects for `return_layers`
+        return self.model(x)
+
+
+if __name__ == '__main__':
+    # manual smoke test: build a resnet34 feature extractor and print the tapped shapes
+    model = TimmModel(name='resnet34', return_layers=['layer2', 'layer3'])
+    data = torch.rand(1, 3, 640, 640)
+    outputs = model(data)
+    
+    for output in outputs:
+        print(output.shape)
+    # the bare string below is an example YAML config; it has no effect at runtime
+    """
+    model:
+        type: TimmModel
+        name: resnet34
+        return_layers: ['layer2', 'layer4']
+    """
diff --git a/rtdetrv2_pytorch/src/nn/backbone/torchvision_model.py b/rtdetrv2_pytorch/src/nn/backbone/torchvision_model.py
new file mode 100644
index 0000000..de3294f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/torchvision_model.py
@@ -0,0 +1,49 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torchvision 
+
+from ...core import register
+from .utils import IntermediateLayerGetter
+
+__all__ = ['TorchVisionModel']
+
+@register()
+class TorchVisionModel(torch.nn.Module):
+    """Backbone wrapper exposing intermediate outputs of a torchvision model."""
+    def __init__(self, name, return_layers, weights=None, **kwargs) -> None:
+        super().__init__()
+
+        # resolve a weights name such as 'DEFAULT' on the model's weights enum
+        if weights is not None:
+            weights = getattr(torchvision.models.get_model_weights(name), weights)
+
+        model = torchvision.models.get_model(name, weights=weights, **kwargs)
+
+        # TODO hard code.
+        # models exposing a `features` attribute are tapped inside that attribute
+        trunk = model.features if hasattr(model, 'features') else model
+        self.model = IntermediateLayerGetter(trunk, return_layers)
+
+    def forward(self, x):
+        # list of the tapped layer outputs
+        return self.model(x)
+
+
+# TorchVisionModel('swin_t', return_layers=['5', '7'])
+# TorchVisionModel('resnet34', return_layers=['layer2','layer3', 'layer4'])
+
+"""
+TorchVisionModel:
+    name: swin_t
+    return_layers: ['5', '7']
+    weights: DEFAULT
+
+
+model:
+    type: TorchVisionModel
+    name: resnet34
+    return_layers: ['layer2','layer3', 'layer4']
+    weights: DEFAULT
+"""
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/nn/backbone/utils.py b/rtdetrv2_pytorch/src/nn/backbone/utils.py
new file mode 100644
index 0000000..3ec4ef7
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/backbone/utils.py
@@ -0,0 +1,55 @@
+"""
+https://github.com/pytorch/vision/blob/main/torchvision/models/_utils.py
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from collections import OrderedDict
+from typing import Dict, List
+
+
+import torch.nn as nn 
+
+
+class IntermediateLayerGetter(nn.ModuleDict):
+    """
+    Module wrapper that returns intermediate layers from a model.
+
+    It has a strong assumption that the modules have been registered
+    into the model in the same order as they are used.
+    This means that one should **not** reuse the same nn.Module
+    twice in the forward if you want this to work.
+
+    Additionally, it is only able to query submodules that are directly
+    assigned to the model. So if `model` is passed, `model.feature1` can
+    be returned, but not `model.feature1.layer2`.
+    """
+
+    _version = 3
+
+    def __init__(self, model: nn.Module, return_layers: List[str]) -> None:
+        child_names = [name for name, _ in model.named_children()]
+        if not set(return_layers).issubset(child_names):
+            raise ValueError("return_layers are not present in model. {}".format(child_names))
+
+        # keep children in registration order up to the last requested layer
+        pending = {str(k) for k in return_layers}
+        layers = OrderedDict()
+        for name, module in model.named_children():
+            layers[name] = module
+            pending.discard(name)
+            if not pending:
+                break
+
+        super().__init__(layers)
+        self.return_layers = return_layers
+
+    def forward(self, x):
+        # run the kept children in order, collecting the outputs of the requested ones
+        outputs = []
+        for name, module in self.items():
+            x = module(x)
+            if name in self.return_layers:
+                outputs.append(x)
+        return outputs
+
diff --git a/rtdetrv2_pytorch/src/nn/criterion/__init__.py b/rtdetrv2_pytorch/src/nn/criterion/__init__.py
new file mode 100644
index 0000000..485d636
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/criterion/__init__.py
@@ -0,0 +1,10 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch.nn as nn 
+from ...core import register
+
+from .det_criterion import DetCriterion
+
+CrossEntropyLoss = register()(nn.CrossEntropyLoss)
diff --git a/rtdetrv2_pytorch/src/nn/criterion/det_criterion.py b/rtdetrv2_pytorch/src/nn/criterion/det_criterion.py
new file mode 100644
index 0000000..3d87982
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/criterion/det_criterion.py
@@ -0,0 +1,171 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.nn.functional as F 
+import torch.distributed
+import torchvision
+
+from ...misc import box_ops
+from ...misc import dist_utils
+from ...core import register
+
+
+@register()
+class DetCriterion(torch.nn.Module):
+    """Default Detection Criterion
+    """
+    __share__ = ['num_classes']
+    __inject__ = ['matcher']
+
+    def __init__(self,
+                losses,
+                weight_dict,
+                num_classes=80,
+                alpha=0.75,
+                gamma=2.0,
+                box_fmt='cxcywh',
+                matcher=None):
+        """
+        Args:
+            losses (list[str]): requested losses, support ['boxes', 'giou', 'vfl', 'focal']
+            weight_dict (dict[str, float]): corresponding losses weight, including
+                ['loss_bbox', 'loss_giou', 'loss_vfl', 'loss_focal']
+            alpha (float), gamma (float): weighting factors used by the focal / vfl losses
+            box_fmt (str): in box format, 'cxcywh' or 'xyxy'
+            matcher (Matcher): matcher used to match source to target
+        """
+        super().__init__()
+        self.losses = losses
+        self.weight_dict = weight_dict
+        self.alpha, self.gamma = alpha, gamma
+        self.num_classes = num_classes
+        self.box_fmt = box_fmt
+        assert matcher is not None, 'DetCriterion requires a matcher'
+        self.matcher = matcher
+
+    def forward(self, outputs, targets, **kwargs):
+        """
+        Args:
+            outputs: Dict[Tensor], 'pred_boxes', 'pred_logits', 'meta'.
+            targets, List[Dict[str, Tensor]], len(targets) == batch_size.
+            kwargs, store other information such as current epoch id (currently unused).
+        Return:
+            losses, Dict[str, Tensor], each term already scaled by `weight_dict`;
+            terms whose key is missing from `weight_dict` are dropped.
+        """
+        # only the matched indices are needed here
+        indices = self.matcher(outputs, targets)['indices']
+        num_boxes = self._get_positive_nums(indices)
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
+            # keep and weight only the terms that have an entry in weight_dict
+            losses.update({k: v * self.weight_dict[k] for k, v in l_dict.items() if k in self.weight_dict})
+        return losses
+
+    def _get_src_permutation_idx(self, indices):
+        # flatten per-image prediction indices into a (batch_idx, src_idx) pair
+        src_parts = [src for (src, _) in indices]
+        batch_idx = torch.cat([torch.full_like(src, i) for i, src in enumerate(src_parts)])
+        return batch_idx, torch.cat(src_parts)
+
+    def _get_tgt_permutation_idx(self, indices):
+        # flatten per-image target indices into a (batch_idx, tgt_idx) pair
+        tgt_parts = [tgt for (_, tgt) in indices]
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, tgt in enumerate(tgt_parts)])
+        return batch_idx, torch.cat(tgt_parts)
+
+    def _get_positive_nums(self, indices):
+        # matched-pair count, all-reduced when distributed, divided by world size, floored at 1
+        device = indices[0][0].device
+        total = torch.as_tensor([sum(len(src) for (src, _) in indices)], dtype=torch.float32, device=device)
+        if dist_utils.is_dist_available_and_initialized():
+            torch.distributed.all_reduce(total)
+        mean_pos = total / dist_utils.get_world_size()
+        return torch.clamp(mean_pos, min=1).item()
+
+    def loss_labels_focal(self, outputs, targets, indices, num_boxes):
+        """Sigmoid focal loss over every prediction, summed and divided by `num_boxes`."""
+        assert 'pred_logits' in outputs
+        logits = outputs['pred_logits']
+
+        # unmatched predictions get index `num_classes`, whose one-hot column is dropped below
+        labels = torch.full(logits.shape[:2], self.num_classes,
+                            dtype=torch.int64, device=logits.device)
+        matched_labels = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
+        labels[self._get_src_permutation_idx(indices)] = matched_labels
+
+        one_hot = F.one_hot(labels, num_classes=self.num_classes + 1)[..., :-1].to(logits.dtype)
+        loss = torchvision.ops.sigmoid_focal_loss(logits, one_hot, self.alpha, self.gamma, reduction='none')
+        return {'loss_focal': loss.sum() / num_boxes}
+
+    def loss_labels_vfl(self, outputs, targets, indices, num_boxes):
+        """Classification BCE whose positive targets are the IoUs of the matched boxes."""
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+
+        # iou of each matched prediction with its target; predictions are detached first
+        pred_xyxy = torchvision.ops.box_convert(outputs['pred_boxes'][idx], in_fmt=self.box_fmt, out_fmt='xyxy')
+        gt_boxes = torch.cat([t['boxes'][j] for t, (_, j) in zip(targets, indices)], dim=0)
+        gt_xyxy = torchvision.ops.box_convert(gt_boxes, in_fmt=self.box_fmt, out_fmt='xyxy')
+        iou, _ = box_ops.elementwise_box_iou(pred_xyxy.detach(), gt_xyxy)
+
+        src_logits: torch.Tensor = outputs['pred_logits']
+        # unmatched predictions get index `num_classes`, whose one-hot column is dropped
+        labels = torch.full(src_logits.shape[:2], self.num_classes,
+                            dtype=torch.int64, device=src_logits.device)
+        labels[idx] = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
+        target = F.one_hot(labels, num_classes=self.num_classes + 1)[..., :-1]
+
+        # IoU-valued soft target on the matched class, zero elsewhere
+        iou_map = torch.zeros_like(labels, dtype=src_logits.dtype)
+        iou_map[idx] = iou.to(src_logits.dtype)
+        target_score = iou_map.unsqueeze(-1) * target
+
+        # negatives are weighted by alpha * p^gamma, positives by their soft target
+        src_score = F.sigmoid(src_logits.detach())
+        weight = self.alpha * src_score.pow(self.gamma) * (1 - target) + target_score
+        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none')
+        return {'loss_vfl': loss.sum() / num_boxes}
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """L1 and GIoU losses on the matched boxes, each summed and divided by `num_boxes`."""
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+        pred = outputs['pred_boxes'][idx]
+        gt = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        # L1 is taken on the boxes as given, before any format conversion
+        l1 = F.l1_loss(pred, gt, reduction='none').sum() / num_boxes
+
+        # the GIoU helper is fed corner-format (xyxy) boxes
+        pred_xyxy = torchvision.ops.box_convert(pred, in_fmt=self.box_fmt, out_fmt='xyxy')
+        gt_xyxy = torchvision.ops.box_convert(gt, in_fmt=self.box_fmt, out_fmt='xyxy')
+        giou = box_ops.elementwise_generalized_box_iou(pred_xyxy, gt_xyxy)
+        return {'loss_bbox': l1, 'loss_giou': (1 - giou).sum() / num_boxes}
+
+    def loss_boxes_giou(self, outputs, targets, indices, num_boxes):
+        """GIoU loss only (no L1 term) on the matched boxes."""
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+        pred = outputs['pred_boxes'][idx]
+        gt = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        # the GIoU helper is fed corner-format (xyxy) boxes
+        pred_xyxy = torchvision.ops.box_convert(pred, in_fmt=self.box_fmt, out_fmt='xyxy')
+        gt_xyxy = torchvision.ops.box_convert(gt, in_fmt=self.box_fmt, out_fmt='xyxy')
+        giou = box_ops.elementwise_generalized_box_iou(pred_xyxy, gt_xyxy)
+        return {'loss_giou': (1 - giou).sum() / num_boxes}
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        # dispatch table: loss name -> bound method computing it
+        handlers = dict(
+            boxes=self.loss_boxes,
+            giou=self.loss_boxes_giou,
+            vfl=self.loss_labels_vfl,
+            focal=self.loss_labels_focal)
+        assert loss in handlers, f'do you really want to compute {loss} loss?'
+        return handlers[loss](outputs, targets, indices, num_boxes, **kwargs)
diff --git a/rtdetrv2_pytorch/src/nn/postprocessor/__init__.py b/rtdetrv2_pytorch/src/nn/postprocessor/__init__.py
new file mode 100644
index 0000000..147c3d4
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/postprocessor/__init__.py
@@ -0,0 +1,5 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+from .nms_postprocessor import DetNMSPostProcessor
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/nn/postprocessor/box_revert.py b/rtdetrv2_pytorch/src/nn/postprocessor/box_revert.py
new file mode 100644
index 0000000..0f1a14f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/postprocessor/box_revert.py
@@ -0,0 +1,80 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torchvision
+from torch import Tensor
+from enum import Enum
+
+
+class BoxProcessFormat(Enum):
+    """Box process format 
+
+    Available formats are
+    * ``RESIZE``
+    * ``RESIZE_KEEP_RATIO``
+    * ``RESIZE_KEEP_RATIO_PADDING``
+    """
+    RESIZE = 1
+    RESIZE_KEEP_RATIO = 2
+    RESIZE_KEEP_RATIO_PADDING = 3
+
+
+def box_revert(
+    boxes: Tensor, 
+    orig_sizes: Tensor=None, 
+    eval_sizes: Tensor=None,
+    inpt_sizes: Tensor=None,
+    inpt_padding: Tensor=None,
+    normalized: bool=True,
+    in_fmt: str='cxcywh', 
+    out_fmt: str='xyxy',
+    process_fmt=BoxProcessFormat.RESIZE,
+) -> Tensor:
+    """Rescale, un-pad and convert predicted boxes.
+
+    The ``boxes`` argument is never modified in place.
+
+    Args:
+        boxes(Tensor), [N, :, 4], pred boxes laid out as ``in_fmt``.
+        orig_sizes(Tensor), [N, 2], (w, h). origin sizes.
+        eval_sizes(Tensor), [N, 2], (w, h). Multiplied into ``boxes`` when
+            ``normalized`` is True.
+        inpt_sizes(Tensor), [N, 2], (w, h). input sizes.
+        inpt_padding (Tensor), [N, 2], (w_pad, h_pad, ...). Only the first
+            two columns are read.
+        (inpt_sizes + inpt_padding) == eval_sizes
+        normalized(bool), scale ``boxes`` by ``eval_sizes`` when given.
+        in_fmt(str), layout of ``boxes``: 'cxcywh' or 'xyxy'.
+        out_fmt(str), layout of the returned boxes.
+        process_fmt, accepted but not used by the current implementation.
+
+    Returns:
+        Tensor, boxes converted to ``out_fmt``.
+    """
+    assert in_fmt in ('cxcywh', 'xyxy'), f'unsupported in_fmt: {in_fmt}'
+
+    if normalized and eval_sizes is not None:
+        boxes = boxes * eval_sizes.repeat(1, 2).unsqueeze(1)
+
+    if inpt_padding is not None:
+        # out-of-place arithmetic keeps the caller's tensor untouched
+        pad_xy = inpt_padding[:, :2]
+        if in_fmt == 'xyxy':
+            # both corners are shifted by the padding offset
+            boxes = boxes - pad_xy.repeat(1, 2).unsqueeze(1)
+        elif in_fmt == 'cxcywh':
+            # only the center is shifted; width / height stay as is
+            center_shift = torch.cat([pad_xy, torch.zeros_like(pad_xy)], dim=-1)
+            boxes = boxes - center_shift.unsqueeze(1)
+
+    if orig_sizes is not None:
+        orig_sizes = orig_sizes.repeat(1, 2).unsqueeze(1)
+        if inpt_sizes is not None:
+            inpt_sizes = inpt_sizes.repeat(1, 2).unsqueeze(1)
+            boxes = boxes * (orig_sizes / inpt_sizes)
+        else:
+            boxes = boxes * orig_sizes
+
+    boxes = torchvision.ops.box_convert(boxes, in_fmt=in_fmt, out_fmt=out_fmt)
+    return boxes
diff --git a/rtdetrv2_pytorch/src/nn/postprocessor/detr_postprocessor.py b/rtdetrv2_pytorch/src/nn/postprocessor/detr_postprocessor.py
new file mode 100644
index 0000000..48f5f2b
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/postprocessor/detr_postprocessor.py
@@ -0,0 +1,100 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+import torchvision
+
+
+__all__ = ['DetDETRPostProcessor']
+
+from .box_revert import box_revert
+from .box_revert import BoxProcessFormat
+
+def mod(a, b):
+    """Return ``a - a // b * b`` (modulo written without the ``%`` operator)."""
+    out = a - a // b * b
+    return out
+
+class DetDETRPostProcessor(nn.Module):
+    """Pick the top-scoring predictions and revert their boxes to xyxy."""
+
+    def __init__(
+        self, 
+        num_classes=80, 
+        use_focal_loss=True, 
+        num_top_queries=300, 
+        box_process_format=BoxProcessFormat.RESIZE,
+    ) -> None:
+        super().__init__()
+        self.use_focal_loss = use_focal_loss
+        self.num_top_queries = num_top_queries
+        self.num_classes = int(num_classes)
+        self.box_process_format = box_process_format
+        self.deploy_mode = False 
+
+    def extra_repr(self) -> str:
+        return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'
+    
+    def forward(self, outputs, **kwargs):
+        """Score the predictions, keep the best ones and revert their boxes.
+
+        Args:
+            outputs: mapping with 'pred_logits' and 'pred_boxes'; the indexing
+                below treats logits as 3-D with classes on the last axis.
+            **kwargs: forwarded unchanged to ``box_revert``.
+
+        Returns:
+            ``(labels, boxes, scores)`` in deploy mode, otherwise a list with
+            one ``dict(labels=..., boxes=..., scores=...)`` per batch item.
+        """
+        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
+
+        if self.use_focal_loss:
+            scores = torch.sigmoid(logits)
+            # top-k over the flattened (query, class) scores of each item
+            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
+            labels = index % self.num_classes
+            # labels = mod(index, self.num_classes) # for tensorrt
+            index = index // self.num_classes
+            boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+            
+        else:
+            # dim must be explicit: the implicit choice for 3-D input is dim 0,
+            # i.e. the batch axis. The last class column is dropped afterwards.
+            scores = F.softmax(logits, dim=-1)[:, :, :-1]
+            scores, labels = scores.max(dim=-1)
+            if scores.shape[1] > self.num_top_queries:
+                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
+                labels = torch.gather(labels, dim=1, index=index)
+                boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+
+        # ``kwargs`` is always a dict, so the revert runs unconditionally; with
+        # no extra arguments it only converts cxcywh to xyxy.
+        boxes = box_revert(
+            boxes, 
+            in_fmt='cxcywh',
+            out_fmt='xyxy',
+            process_fmt=self.box_process_format,
+            normalized=True,
+            **kwargs,
+        )
+
+        # TODO for onnx export
+        if self.deploy_mode:
+            return labels, boxes, scores
+
+        results = []
+        for lab, box, sco in zip(labels, boxes, scores):
+            result = dict(labels=lab, boxes=box, scores=sco)
+            results.append(result)
+        
+        return results
+        
+    def deploy(self, ):
+        """Switch to eval mode and make ``forward`` return raw tensors."""
+        self.eval()
+        self.deploy_mode = True
+        return self 
diff --git a/rtdetrv2_pytorch/src/nn/postprocessor/nms_postprocessor.py b/rtdetrv2_pytorch/src/nn/postprocessor/nms_postprocessor.py
new file mode 100644
index 0000000..b094594
--- /dev/null
+++ b/rtdetrv2_pytorch/src/nn/postprocessor/nms_postprocessor.py
@@ -0,0 +1,100 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.nn.functional as F 
+import torch.distributed
+import torchvision
+from torch import Tensor 
+
+from ...core import register
+
+from typing import Dict 
+
+
+__all__ = ['DetNMSPostProcessor', ]
+
+
+@register()
+class DetNMSPostProcessor(torch.nn.Module):
+    """Score thresholding followed by class-wise NMS on predicted boxes."""
+
+    def __init__(self, \
+                iou_threshold=0.7, 
+                score_threshold=0.01, 
+                keep_topk=300, 
+                box_fmt='cxcywh',
+                logit_fmt='sigmoid') -> None:
+        super().__init__()
+        self.iou_threshold = iou_threshold
+        self.score_threshold = score_threshold
+        self.keep_topk = keep_topk
+        self.box_fmt = box_fmt.lower()
+        self.logit_fmt = logit_fmt.lower()
+        # activation looked up by name in torch.nn.functional; None when absent
+        self.logit_func = getattr(F, self.logit_fmt, None)
+        self.deploy_mode = False 
+    
+    def forward(self, outputs: Dict[str, Tensor], orig_target_sizes: Tensor):
+        """Convert raw outputs into per-image detections.
+
+        Args:
+            outputs: mapping with 'pred_logits' and 'pred_boxes'.
+            orig_target_sizes: presumably [N, 2] original image sizes; repeated
+                twice along the last axis and multiplied into the xyxy boxes.
+
+        Returns:
+            In deploy mode a dict with 'pred_labels', 'pred_boxes' and
+            'pred_scores' (no thresholding, no NMS). Otherwise a list with one
+            dict of 'labels', 'boxes' and 'scores' per image.
+        """
+        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
+        pred_boxes = torchvision.ops.box_convert(boxes, in_fmt=self.box_fmt, out_fmt='xyxy')
+        # out-of-place so outputs['pred_boxes'] can never be written to
+        pred_boxes = pred_boxes * orig_target_sizes.repeat(1, 2).unsqueeze(1)
+
+        # Activate over the class axis *before* the max: softmax-style
+        # functions need every class logit, and applying them to the
+        # already-reduced maxima would normalise across queries instead.
+        if self.logit_func is None:
+            class_scores = logits
+        elif self.logit_fmt in ('softmax', 'log_softmax'):
+            class_scores = self.logit_func(logits, dim=-1)
+        else:
+            class_scores = self.logit_func(logits)
+        pred_scores, pred_labels = torch.max(class_scores, dim=-1)
+
+        # TODO for onnx export
+        if self.deploy_mode:
+            blobs = {
+                'pred_labels': pred_labels, 
+                'pred_boxes': pred_boxes,
+                'pred_scores': pred_scores
+            }
+            return blobs
+
+        results = []
+        for i in range(logits.shape[0]):
+            score_keep = pred_scores[i] > self.score_threshold
+            pred_box = pred_boxes[i][score_keep]
+            pred_label = pred_labels[i][score_keep]
+            pred_score = pred_scores[i][score_keep]
+
+            keep = torchvision.ops.batched_nms(pred_box, pred_score, pred_label, self.iou_threshold)            
+            keep = keep[:self.keep_topk]
+
+            blob = {
+                'labels': pred_label[keep],
+                'boxes': pred_box[keep],
+                'scores': pred_score[keep],
+            }
+
+            results.append(blob)
+            
+        return results
+
+    def deploy(self, ):
+        """Switch to eval mode and make ``forward`` return raw tensors."""
+        self.eval()
+        self.deploy_mode = True
+        return self 
diff --git a/rtdetrv2_pytorch/src/optim/__init__.py b/rtdetrv2_pytorch/src/optim/__init__.py
new file mode 100644
index 0000000..2b04f32
--- /dev/null
+++ b/rtdetrv2_pytorch/src/optim/__init__.py
@@ -0,0 +1,7 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from .ema import *
+from .optim import *
+from .amp import *
+from .warmup import *
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/optim/amp.py b/rtdetrv2_pytorch/src/optim/amp.py
new file mode 100644
index 0000000..4e69b59
--- /dev/null
+++ b/rtdetrv2_pytorch/src/optim/amp.py
@@ -0,0 +1,12 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch.cuda.amp as amp
+
+from ..core import register
+
+
+__all__ = ['GradScaler']
+
+GradScaler = register()(amp.grad_scaler.GradScaler)  # torch's AMP GradScaler passed through the project's register() decorator
diff --git a/rtdetrv2_pytorch/src/optim/ema.py b/rtdetrv2_pytorch/src/optim/ema.py
new file mode 100644
index 0000000..e42a0a8
--- /dev/null
+++ b/rtdetrv2_pytorch/src/optim/ema.py
@@ -0,0 +1,110 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch
+import torch.nn as nn 
+
+import math
+from copy import deepcopy
+
+from ..core import register
+from ..misc import dist_utils
+
+__all__ = ['ModelEMA']
+
+
+@register()
+class ModelEMA(object):
+    """
+    Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
+    Keep a moving average of everything in the model state_dict (parameters and buffers).
+    This is intended to allow functionality like
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+    A smoothed version of the weights is necessary for some training schemes to perform well.
+    This class is sensitive where it is initialized in the sequence of model init,
+    GPU assignment and distributed training wrappers.
+    """
+    def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=2000, ):
+        super().__init__()
+
+        self.module = deepcopy(dist_utils.de_parallel(model)).eval() 
+        # if next(model.parameters()).device.type != 'cpu':
+        #     self.module.half()  # FP16 EMA
+        
+        self.decay = decay 
+        self.warmups = warmups
+        self.updates = 0  # number of EMA updates
+        if warmups > 0:
+            # decay exponential ramp (to help early epochs)
+            self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups))
+        else:
+            # no ramp requested: constant decay instead of dividing by zero
+            self.decay_fn = lambda x: decay
+        
+        for p in self.module.parameters():
+            p.requires_grad_(False)
+
+
+    def update(self, model: nn.Module):
+        """Blend the current weights of ``model`` into the EMA copy.
+
+        Only floating-point entries of the state dict are averaged; other
+        entries keep the value they had when the copy was made or loaded.
+        """
+        # Update EMA parameters
+        with torch.no_grad():
+            self.updates += 1
+            d = self.decay_fn(self.updates)
+            msd = dist_utils.de_parallel(model).state_dict()
+            for k, v in self.module.state_dict().items():
+                if v.dtype.is_floating_point:
+                    v *= d
+                    v += (1 - d) * msd[k].detach()
+            
+    def to(self, *args, **kwargs):
+        self.module = self.module.to(*args, **kwargs)
+        return self
+
+    def state_dict(self, ):
+        return dict(module=self.module.state_dict(), updates=self.updates)
+    
+    def load_state_dict(self, state, strict=True):
+        self.module.load_state_dict(state['module'], strict=strict) 
+        if 'updates' in state:
+            self.updates = state['updates']
+
+    def forward(self, ):
+        """Always raises ``RuntimeError``."""
+        raise RuntimeError('ema...')
+
+    # original (misspelled) name kept so existing callers keep working
+    forwad = forward
+
+    def extra_repr(self) -> str:
+        return f'decay={self.decay}, warmups={self.warmups}'
+
+
+
+class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
+    """Maintains moving averages of model parameters using an exponential decay.
+    ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``
+    `torch.optim.swa_utils.AveragedModel <https://pytorch.org/docs/stable/optim.html#custom-averaging-strategies>`_
+    is used to compute the EMA.
+    """
+    def __init__(self, model, decay, device="cpu", use_buffers=True):
+
+        def decay_fn(x):
+            # same exponential ramp as ModelEMA, with a fixed 2000-step scale
+            return decay * (1 - math.exp(-x / 2000))
+
+        def ema_avg(avg_model_param, model_param, num_averaged):
+            d = self.decay_fn(num_averaged)
+            return d * avg_model_param + (1 - d) * model_param
+
+        super().__init__(model, device, ema_avg, use_buffers=use_buffers)
+        # plain attribute, assigned once nn.Module.__init__ has run
+        self.decay_fn = decay_fn
+
+
+
diff --git a/rtdetrv2_pytorch/src/optim/optim.py b/rtdetrv2_pytorch/src/optim/optim.py
new file mode 100644
index 0000000..843f900
--- /dev/null
+++ b/rtdetrv2_pytorch/src/optim/optim.py
@@ -0,0 +1,23 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch.optim as optim
+import torch.optim.lr_scheduler as lr_scheduler
+
+from ..core import register
+
+
+__all__ = ['AdamW', 'SGD', 'Adam', 'MultiStepLR', 'CosineAnnealingLR', 'OneCycleLR', 'LambdaLR']
+
+
+
+SGD = register()(optim.SGD)  # torch optimizers passed through the project's register() decorator
+Adam = register()(optim.Adam)
+AdamW = register()(optim.AdamW)
+
+# torch learning-rate schedulers, passed through register() the same way
+MultiStepLR = register()(lr_scheduler.MultiStepLR)
+CosineAnnealingLR = register()(lr_scheduler.CosineAnnealingLR)
+OneCycleLR = register()(lr_scheduler.OneCycleLR)
+LambdaLR = register()(lr_scheduler.LambdaLR)
diff --git a/rtdetrv2_pytorch/src/optim/warmup.py b/rtdetrv2_pytorch/src/optim/warmup.py
new file mode 100644
index 0000000..b2634f9
--- /dev/null
+++ b/rtdetrv2_pytorch/src/optim/warmup.py
@@ -0,0 +1,60 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from torch.optim.lr_scheduler import LRScheduler
+
+from ..core import register
+
+
+class Warmup(object):
+    """Scale the optimizer's learning rates while warmup is in progress.
+
+    The learning rates found in the optimizer at construction time are the
+    warmup targets; subclasses supply the per-step factor through
+    ``get_warmup_factor``.
+    """
+    def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None:
+        optimizer = lr_scheduler.optimizer
+        self.lr_scheduler = lr_scheduler
+        self.warmup_end_values = [group['lr'] for group in optimizer.param_groups]
+        self.last_step = last_step
+        self.warmup_duration = warmup_duration
+        # apply the factor of the very first step right away
+        self.step()
+
+    def state_dict(self):
+        """Return every attribute except the wrapped ``lr_scheduler``."""
+        state = dict(self.__dict__)
+        state.pop('lr_scheduler', None)
+        return state
+
+    def load_state_dict(self, state_dict):
+        """Overwrite attributes with the entries of ``state_dict``."""
+        vars(self).update(state_dict)
+
+    def get_warmup_factor(self, step, **kwargs):
+        raise NotImplementedError
+
+    def step(self, ):
+        """Advance one step and rescale the learning rates if still warming up."""
+        self.last_step += 1
+        if self.last_step < self.warmup_duration:
+            scale = self.get_warmup_factor(self.last_step)
+            for idx, group in enumerate(self.lr_scheduler.optimizer.param_groups):
+                group['lr'] = scale * self.warmup_end_values[idx]
+    
+    def finished(self, ):
+        """Tell whether ``warmup_duration`` steps have been taken."""
+        return self.last_step >= self.warmup_duration
+
+
+@register()
+class LinearWarmup(Warmup):
+    """Warmup whose factor is ``(step + 1) / warmup_duration``, capped at 1."""
+    def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None:
+        super().__init__(lr_scheduler, warmup_duration, last_step)
+
+    def get_warmup_factor(self, step):
+        ratio = (step + 1) / self.warmup_duration
+        return min(1.0, ratio)
+
diff --git a/rtdetrv2_pytorch/src/solver/__init__.py b/rtdetrv2_pytorch/src/solver/__init__.py
new file mode 100644
index 0000000..de1611e
--- /dev/null
+++ b/rtdetrv2_pytorch/src/solver/__init__.py
@@ -0,0 +1,15 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+from ._solver import BaseSolver
+from .clas_solver import ClasSolver
+from .det_solver import DetSolver
+
+
+
+from typing import Dict 
+
+TASKS :Dict[str, BaseSolver] = {  # task name -> solver class (the values are classes, not instances)
+    'classification': ClasSolver,
+    'detection': DetSolver,
+}
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/solver/_solver.py b/rtdetrv2_pytorch/src/solver/_solver.py
new file mode 100644
index 0000000..51e9bef
--- /dev/null
+++ b/rtdetrv2_pytorch/src/solver/_solver.py
@@ -0,0 +1,209 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+
+from datetime import datetime
+from pathlib import Path 
+from typing import Dict
+import atexit
+
+from ..misc import dist_utils
+from ..core import BaseConfig
+
+
+def to(m: nn.Module, device: str):
+    """Move ``m`` to ``device``; ``None`` is passed through unchanged."""
+    if m is None:
+        return None 
+    return m.to(device) 
+
+
+class BaseSolver(object):
+    """Shared setup and checkpoint handling; subclasses implement fit/val."""
+    def __init__(self, cfg: BaseConfig) -> None:
+        self.cfg = cfg 
+
+    def _setup(self, ):
+        """Avoid instantiating unnecessary classes 
+        """
+        cfg = self.cfg
+        if cfg.device:
+            device = torch.device(cfg.device)
+        else:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        self.model = cfg.model
+        
+        # NOTE (lyuwenyu): must load_tuning_state before ema instance building
+        if self.cfg.tuning:
+            print(f'tuning checkpoint from {self.cfg.tuning}')
+            self.load_tuning_state(self.cfg.tuning)
+
+        self.model = dist_utils.warp_model(self.model.to(device), sync_bn=cfg.sync_bn, \
+            find_unused_parameters=cfg.find_unused_parameters)
+
+        self.criterion = to(cfg.criterion, device)
+        self.postprocessor = to(cfg.postprocessor, device)
+
+        self.ema = to(cfg.ema, device)
+        self.scaler = cfg.scaler
+
+        self.device = device
+        self.last_epoch = self.cfg.last_epoch
+        
+        self.output_dir = Path(cfg.output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.writer = cfg.writer
+
+        if self.writer:
+            # close the writer at interpreter exit even if cleanup() is never called
+            atexit.register(self.writer.close)
+            if dist_utils.is_main_process():
+                self.writer.add_text(f'config', '{:s}'.format(cfg.__repr__()), 0)
+
+    def cleanup(self, ):
+        """Close the summary writer, if there is one."""
+        # Close right away; registering another atexit hook here would only
+        # postpone the close to interpreter exit, which _setup already covers.
+        writer = getattr(self, 'writer', None)
+        if writer:
+            writer.close()
+
+    def train(self, ):
+        """Run ``_setup`` and pull the training-time objects from the config."""
+        self._setup()
+        self.optimizer = self.cfg.optimizer
+        self.lr_scheduler = self.cfg.lr_scheduler
+        self.lr_warmup_scheduler = self.cfg.lr_warmup_scheduler
+
+        self.train_dataloader = dist_utils.warp_loader(self.cfg.train_dataloader, \
+            shuffle=self.cfg.train_dataloader.shuffle)
+        self.val_dataloader = dist_utils.warp_loader(self.cfg.val_dataloader, \
+            shuffle=self.cfg.val_dataloader.shuffle)
+
+        self.evaluator = self.cfg.evaluator
+
+        # NOTE instantiating order
+        if self.cfg.resume:
+            print(f'Resume checkpoint from {self.cfg.resume}')
+            self.load_resume_state(self.cfg.resume)
+
+    def eval(self, ):
+        """Run ``_setup`` and pull the evaluation-time objects from the config."""
+        self._setup()
+
+        self.val_dataloader = dist_utils.warp_loader(self.cfg.val_dataloader, \
+            shuffle=self.cfg.val_dataloader.shuffle)
+
+        self.evaluator = self.cfg.evaluator
+        
+        if self.cfg.resume:
+            print(f'Resume checkpoint from {self.cfg.resume}')
+            self.load_resume_state(self.cfg.resume)
+
+    def to(self, device):
+        """Call ``.to(device)`` on every attribute that has such a method."""
+        for k, v in self.__dict__.items():
+            if hasattr(v, 'to'):
+                v.to(device)
+
+    def state_dict(self):
+        """state dict, train/eval
+        """
+        state = {}
+        state['date'] = datetime.now().isoformat()
+        
+        # TODO for resume
+        state['last_epoch'] = self.last_epoch
+
+        for k, v in self.__dict__.items():
+            if hasattr(v, 'state_dict'):
+                v = dist_utils.de_parallel(v)
+                state[k] = v.state_dict() 
+
+        return state
+
+
+    def load_state_dict(self, state):
+        """load state dict, train/eval
+        """
+        # TODO
+        if 'last_epoch' in state:
+            self.last_epoch = state['last_epoch']
+            print('Load last_epoch')
+
+        for k, v in self.__dict__.items():
+            if not hasattr(v, 'load_state_dict'):
+                continue
+            if k in state:
+                v = dist_utils.de_parallel(v)
+                v.load_state_dict(state[k])
+                print(f'Load {k}.state_dict')
+            else:
+                print(f'Not load {k}.state_dict')
+
+
+    def load_resume_state(self, path: str):
+        """load resume
+        """
+        # for cuda:0 memory
+        if path.startswith('http'):
+            state = torch.hub.load_state_dict_from_url(path, map_location='cpu')
+        else:
+            # NOTE: torch.load unpickles the file; only load trusted checkpoints
+            state = torch.load(path, map_location='cpu')
+
+        self.load_state_dict(state)
+
+    
+    def load_tuning_state(self, path: str,):
+        """only load model for tuning and skip missed/dismatched keys
+        """
+        if path.startswith('http'):
+            state = torch.hub.load_state_dict_from_url(path, map_location='cpu')
+        else:
+            # NOTE: torch.load unpickles the file; only load trusted checkpoints
+            state = torch.load(path, map_location='cpu')
+
+        module = dist_utils.de_parallel(self.model)
+        
+        # TODO hard code
+        if 'ema' in state:
+            stat, infos = self._matched_state(module.state_dict(), state['ema']['module'])
+        else:
+            stat, infos = self._matched_state(module.state_dict(), state['model'])
+
+        module.load_state_dict(stat, strict=False)
+        print(f'Load model.state_dict, {infos}')
+
+
+    @staticmethod
+    def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tensor]):
+        """Keep the entries of ``params`` whose key and shape match ``state``.
+
+        Returns the matched dict plus the keys of ``state`` that are absent
+        from ``params`` ('missed') or differ in shape ('unmatched').
+        """
+        missed_list = []
+        unmatched_list = []
+        matched_state = {}
+        for k, v in state.items():
+            if k in params:
+                if v.shape == params[k].shape:
+                    matched_state[k] = params[k]
+                else:
+                    unmatched_list.append(k)
+            else:
+                missed_list.append(k)
+
+        return matched_state, {'missed': missed_list, 'unmatched': unmatched_list}
+
+
+    def fit(self, ):
+        raise NotImplementedError('')
+
+
+    def val(self, ):
+        raise NotImplementedError('')
diff --git a/rtdetrv2_pytorch/src/solver/clas_engine.py b/rtdetrv2_pytorch/src/solver/clas_engine.py
new file mode 100644
index 0000000..ad24077
--- /dev/null
+++ b/rtdetrv2_pytorch/src/solver/clas_engine.py
@@ -0,0 +1,73 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.nn as nn 
+
+from ..misc import (MetricLogger, SmoothedValue, reduce_dict)
+
+
+def train_one_epoch(model: nn.Module, criterion: nn.Module, dataloader, optimizer, ema, epoch, device):
+    """Train ``model`` for one pass over ``dataloader``.
+
+    Returns a dict mapping each meter name to its ``global_avg``.
+    """
+    model.train()
+
+    logger = MetricLogger(delimiter="  ")
+    logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    log_interval = 100
+    title = f'Epoch: [{epoch}]'
+
+    for imgs, labels in logger.log_every(dataloader, log_interval, title):
+        imgs, labels = imgs.to(device), labels.to(device)
+
+        loss: torch.Tensor = criterion(model(imgs), labels)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        if ema is not None:
+            ema.update(model)
+
+        reduced = reduce_dict({'loss': loss})
+        logger.update(**{name: value.item() for name, value in reduced.items()})
+        logger.update(lr=optimizer.param_groups[0]["lr"])
+
+    logger.synchronize_between_processes()
+    print("Averaged stats:", logger)
+
+    return {name: meter.global_avg for name, meter in logger.meters.items()}
+
+
+
+@torch.no_grad()
+def evaluate(model, criterion, dataloader, device):
+    """Evaluate ``model`` on ``dataloader``, tracking accuracy and loss.
+
+    Returns a dict mapping each meter name to its ``global_avg``.
+    """
+    model.eval()
+
+    logger = MetricLogger(delimiter="  ")
+    for meter_name in ('acc', 'loss'):
+        logger.add_meter(meter_name, SmoothedValue(window_size=1))
+
+    for imgs, labels in logger.log_every(dataloader, 10, 'Test:'):
+        imgs = imgs.to(device)
+        labels = labels.to(device)
+        preds = model(imgs)
+
+        n_correct = (preds.argmax(dim=-1) == labels).sum()
+        batch_stats = {'acc': n_correct / preds.shape[0], 'loss': criterion(preds, labels)}
+
+        reduced = reduce_dict(batch_stats)
+        logger.update(**{name: value.item() for name, value in reduced.items()})
+
+    logger.synchronize_between_processes()
+    print("Averaged stats:", logger)
+
+    return {name: meter.global_avg for name, meter in logger.meters.items()}
+
+
diff --git a/rtdetrv2_pytorch/src/solver/clas_solver.py b/rtdetrv2_pytorch/src/solver/clas_solver.py
new file mode 100644
index 0000000..dc0860c
--- /dev/null
+++ b/rtdetrv2_pytorch/src/solver/clas_solver.py
@@ -0,0 +1,81 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import time 
+import json
+import datetime
+from pathlib import Path
+
+import torch 
+import torch.nn as nn 
+
+from ..misc import dist_utils
+from ._solver import BaseSolver
+from .clas_engine import train_one_epoch, evaluate
+
+
+class ClasSolver(BaseSolver):
+    """Solver that trains and evaluates a classification model."""
+
+    def fit(self, ):
+        """Train from ``last_epoch + 1`` up to ``cfg.epoches``.
+
+        After every epoch the scheduler is stepped, checkpoints are saved,
+        the model (its EMA copy when one exists) is evaluated, and one JSON
+        line of statistics is appended to ``log.txt`` on the main process.
+        """
+        print("Start training")
+        self.train()
+        args = self.cfg 
+
+        n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+        print('Number of params:', n_parameters)
+
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        start_time = time.time()
+        start_epoch = self.last_epoch + 1
+        for epoch in range(start_epoch, args.epoches):
+
+            if dist_utils.is_dist_available_and_initialized():
+                self.train_dataloader.sampler.set_epoch(epoch)
+            
+            train_stats = train_one_epoch(self.model, 
+                                        self.criterion, 
+                                        self.train_dataloader, 
+                                        self.optimizer, 
+                                        self.ema, 
+                                        epoch=epoch, 
+                                        device=self.device)
+            self.lr_scheduler.step()
+            self.last_epoch += 1
+
+            if output_dir:
+                checkpoint_paths = [output_dir / 'checkpoint.pth']
+                # extra checkpoint before LR drop and every 100 epochs
+                if (epoch + 1) % args.checkpoint_freq == 0:
+                    checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
+                # state_dict() takes no arguments; the epoch is already
+                # recorded through self.last_epoch
+                state = self.state_dict()
+                for checkpoint_path in checkpoint_paths:
+                    dist_utils.save_on_master(state, checkpoint_path)
+
+            module = self.ema.module if self.ema else self.model
+            test_stats = evaluate(module, self.criterion, self.val_dataloader, self.device)
+
+            log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
+                         **{f'test_{k}': v for k, v in test_stats.items()},
+                         'epoch': epoch,
+                         'n_parameters': n_parameters}
+            
+            if output_dir and dist_utils.is_main_process():
+                with (output_dir / "log.txt").open("a") as f:
+                    f.write(json.dumps(log_stats) + "\n")
+
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('Training time {}'.format(total_time_str))
+
+
diff --git a/rtdetrv2_pytorch/src/solver/det_engine.py b/rtdetrv2_pytorch/src/solver/det_engine.py
new file mode 100644
index 0000000..441ef39
--- /dev/null
+++ b/rtdetrv2_pytorch/src/solver/det_engine.py
@@ -0,0 +1,175 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+https://github.com/facebookresearch/detr/blob/main/engine.py
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import sys
+import math
+from typing import Iterable
+
+import torch
+import torch.amp 
+from torch.utils.tensorboard import SummaryWriter
+from torch.cuda.amp.grad_scaler import GradScaler
+
+from ..optim import ModelEMA, Warmup
+from ..data import CocoEvaluator
+from ..misc import MetricLogger, SmoothedValue, dist_utils
+
+
+def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
+                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
+                    device: torch.device, epoch: int, max_norm: float = 0, **kwargs):
+    """Train for one epoch and return the averaged meters.
+
+    Optional keyword arguments: ``print_freq``, ``writer``, ``ema``,
+    ``scaler`` (enables the autocast / GradScaler path when not None) and
+    ``lr_warmup_scheduler``.
+    """
+    model.train()
+    criterion.train()
+    metric_logger = MetricLogger(delimiter="  ")
+    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
+    header = 'Epoch: [{}]'.format(epoch)
+    
+    print_freq = kwargs.get('print_freq', 10)
+    writer :SummaryWriter = kwargs.get('writer', None)
+
+    ema :ModelEMA = kwargs.get('ema', None)
+    scaler :GradScaler = kwargs.get('scaler', None)
+    lr_warmup_scheduler :Warmup = kwargs.get('lr_warmup_scheduler', None)
+
+    # autocast wants the bare device type ('cuda'), not an indexed name such
+    # as 'cuda:0', which is what str(device) yields for an indexed device
+    device_type = torch.device(device).type
+
+    for i, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
+        samples = samples.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+        global_step = epoch * len(data_loader) + i
+        metas = dict(epoch=epoch, step=i, global_step=global_step)
+
+        if scaler is not None:
+            with torch.autocast(device_type=device_type, cache_enabled=True):
+                outputs = model(samples, targets=targets)
+            
+            with torch.autocast(device_type=device_type, enabled=False):
+                loss_dict = criterion(outputs, targets, **metas)
+
+            loss = sum(loss_dict.values())
+            scaler.scale(loss).backward()
+            
+            if max_norm > 0:
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+
+        else:
+            outputs = model(samples, targets=targets)
+            loss_dict = criterion(outputs, targets, **metas)
+            
+            loss : torch.Tensor = sum(loss_dict.values())
+            optimizer.zero_grad()
+            loss.backward()
+            
+            if max_norm > 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+
+            optimizer.step()
+        
+        # ema 
+        if ema is not None:
+            ema.update(model)
+
+        if lr_warmup_scheduler is not None:
+            lr_warmup_scheduler.step()
+
+        loss_dict_reduced = dist_utils.reduce_dict(loss_dict)
+        loss_value = sum(loss_dict_reduced.values())
+
+        if not math.isfinite(loss_value):
+            print("Loss is {}, stopping training".format(loss_value))
+            print(loss_dict_reduced)
+            sys.exit(1)
+
+        metric_logger.update(loss=loss_value, **loss_dict_reduced)
+        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
+
+        if writer and dist_utils.is_main_process():
+            writer.add_scalar('Loss/total', loss_value.item(), global_step)
+            for j, pg in enumerate(optimizer.param_groups):
+                writer.add_scalar(f'Lr/pg_{j}', pg['lr'], global_step)
+            for k, v in loss_dict_reduced.items():
+                writer.add_scalar(f'Loss/{k}', v.item(), global_step)
+                
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+
+
+@torch.no_grad()
+def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessor, data_loader, coco_evaluator: CocoEvaluator, device):
+    """Run the model over ``data_loader`` and feed results to the evaluator.
+
+    Returns ``(stats, coco_evaluator)``; ``stats`` is empty when
+    ``coco_evaluator`` is None.
+    """
+    model.eval()
+    criterion.eval()
+    # every later use of the evaluator is None-guarded; guard these too
+    iou_types = ()
+    if coco_evaluator is not None:
+        coco_evaluator.cleanup()
+        iou_types = coco_evaluator.iou_types
+
+    metric_logger = MetricLogger(delimiter="  ")
+    header = 'Test:'
+    
+    for samples, targets in metric_logger.log_every(data_loader, 10, header):
+        samples = samples.to(device)
+        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
+
+        outputs = model(samples)
+
+        # TODO (lyuwenyu), fix dataset converted using `convert_to_coco_api`?
+        orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
+        
+        results = postprocessor(outputs, orig_target_sizes)
+
+        # if 'segm' in postprocessor.keys():
+        #     target_sizes = torch.stack([t["size"] for t in targets], dim=0)
+        #     results = postprocessor['segm'](results, outputs, orig_target_sizes, target_sizes)
+
+        res = {target['image_id'].item(): output for target, output in zip(targets, results)}
+        if coco_evaluator is not None:
+            coco_evaluator.update(res)
+
+    # gather the stats from all processes
+    metric_logger.synchronize_between_processes()
+    print("Averaged stats:", metric_logger)
+    if coco_evaluator is not None:
+        coco_evaluator.synchronize_between_processes()
+
+    # accumulate predictions from all images
+    if coco_evaluator is not None:
+        coco_evaluator.accumulate()
+        coco_evaluator.summarize()
+
+    stats = {}
+    # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
+    if coco_evaluator is not None:
+        if 'bbox' in iou_types:
+            stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()
+        if 'segm' in iou_types:
+            stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist()
+            
+    return stats, coco_evaluator
+
+
+
diff --git a/rtdetrv2_pytorch/src/solver/det_solver.py b/rtdetrv2_pytorch/src/solver/det_solver.py
new file mode 100644
index 0000000..af81989
--- /dev/null
+++ b/rtdetrv2_pytorch/src/solver/det_solver.py
@@ -0,0 +1,131 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import time 
+import json
+import datetime
+
+import torch 
+
+from ..misc import dist_utils, profiler_utils
+
+from ._solver import BaseSolver
+from .det_engine import train_one_epoch, evaluate
+
+
+class DetSolver(BaseSolver):
+    """Detection solver: `fit` runs the train / checkpoint / evaluate loop, `val` runs one evaluation."""
+    def fit(self, ):
+        print("Start training")
+        self.train()
+        args = self.cfg
+        # Count only the parameters that require gradients.
+        n_parameters = sum([p.numel() for p in self.model.parameters() if p.requires_grad])
+        print(f'number of trainable parameters: {n_parameters}')
+        # Best first-entry value seen per metric key, plus the last epoch at which one improved.
+        best_stat = {'epoch': -1, }
+        # Start the wall clock; training continues from the epoch after `last_epoch`.
+        start_time = time.time()
+        start_epcoch = self.last_epoch + 1
+        
+        for epoch in range(start_epcoch, args.epoches):
+            # Hand the epoch to the loader, and to its sampler when running distributed.
+            self.train_dataloader.set_epoch(epoch)
+            # self.train_dataloader.dataset.set_epoch(epoch)
+            if dist_utils.is_dist_available_and_initialized():
+                self.train_dataloader.sampler.set_epoch(epoch)
+            
+            train_stats = train_one_epoch(
+                self.model, 
+                self.criterion, 
+                self.train_dataloader, 
+                self.optimizer, 
+                self.device, 
+                epoch, 
+                max_norm=args.clip_max_norm, 
+                print_freq=args.print_freq, 
+                ema=self.ema, 
+                scaler=self.scaler, 
+                lr_warmup_scheduler=self.lr_warmup_scheduler,
+                writer=self.writer
+            )
+            # Step the main LR schedule only when there is no warmup scheduler or it has finished.
+            if self.lr_warmup_scheduler is None or self.lr_warmup_scheduler.finished():
+                self.lr_scheduler.step()
+            
+            self.last_epoch += 1
+            # Overwrite last.pth after every epoch.
+            if self.output_dir:
+                checkpoint_paths = [self.output_dir / 'last.pth']
+                # additionally keep a numbered checkpoint every `args.checkpoint_freq` epochs
+                if (epoch + 1) % args.checkpoint_freq == 0:
+                    checkpoint_paths.append(self.output_dir / f'checkpoint{epoch:04}.pth')
+                for checkpoint_path in checkpoint_paths:
+                    dist_utils.save_on_master(self.state_dict(), checkpoint_path)
+            # Evaluate the EMA weights when EMA is enabled, otherwise the raw model.
+            module = self.ema.module if self.ema else self.model
+            test_stats, coco_evaluator = evaluate(
+                module, 
+                self.criterion, 
+                self.postprocessor, 
+                self.val_dataloader, 
+                self.evaluator, 
+                self.device
+            )
+            # Log every entry of each test stat and track the best first entry per metric key.
+            # NOTE(review): best_stat['epoch'] is shared by all metric keys, not tracked per key.
+            for k in test_stats:
+                if self.writer and dist_utils.is_main_process():
+                    for i, v in enumerate(test_stats[k]):
+                        self.writer.add_scalar(f'Test/{k}_{i}'.format(k), v, epoch)
+                # NOTE(review): `.format(k)` above is applied to an already-formatted f-string.
+                if k in best_stat:
+                    best_stat['epoch'] = epoch if test_stats[k][0] > best_stat[k] else best_stat['epoch']
+                    best_stat[k] = max(best_stat[k], test_stats[k][0])
+                else:
+                    best_stat['epoch'] = epoch
+                    best_stat[k] = test_stats[k][0]
+                # Save best.pth whenever the current epoch is recorded as the best one.
+                if best_stat['epoch'] == epoch and self.output_dir:
+                    dist_utils.save_on_master(self.state_dict(), self.output_dir / 'best.pth')
+
+            print(f'best_stat: {best_stat}')
+
+            log_stats = {
+                **{f'train_{k}': v for k, v in train_stats.items()},
+                **{f'test_{k}': v for k, v in test_stats.items()},
+                'epoch': epoch,
+                'n_parameters': n_parameters
+            }
+            # Append one JSON line per epoch to log.txt, on the main process only.
+            if self.output_dir and dist_utils.is_main_process():
+                with (self.output_dir / "log.txt").open("a") as f:
+                    f.write(json.dumps(log_stats) + "\n")
+
+                # store the bbox evaluator's `.eval` attribute: always latest.pth, plus a numbered copy every 50 epochs
+                if coco_evaluator is not None:
+                    (self.output_dir / 'eval').mkdir(exist_ok=True)
+                    if "bbox" in coco_evaluator.coco_eval:
+                        filenames = ['latest.pth']
+                        if epoch % 50 == 0:
+                            filenames.append(f'{epoch:03}.pth')
+                        for name in filenames:
+                            torch.save(coco_evaluator.coco_eval["bbox"].eval,
+                                    self.output_dir / "eval" / name)
+
+        total_time = time.time() - start_time
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('Training time {}'.format(total_time_str))
+
+    # Evaluation-only entry point: one `evaluate` pass, bbox results stored as eval.pth.
+    def val(self, ):
+        self.eval()
+        # Same model selection as in fit(): EMA weights when available.
+        module = self.ema.module if self.ema else self.model
+        test_stats, coco_evaluator = evaluate(module, self.criterion, self.postprocessor,
+                self.val_dataloader, self.evaluator, self.device)
+        # NOTE(review): assumes the returned evaluator is not None and has a "bbox" entry.
+        if self.output_dir:
+            dist_utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, self.output_dir / "eval.pth")
+        
+        return
diff --git a/rtdetrv2_pytorch/src/zoo/__init__.py b/rtdetrv2_pytorch/src/zoo/__init__.py
new file mode 100644
index 0000000..b1bf6c5
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/__init__.py
@@ -0,0 +1,5 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+from . import rtdetr
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/__init__.py b/rtdetrv2_pytorch/src/zoo/rtdetr/__init__.py
new file mode 100644
index 0000000..6addf4f
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/__init__.py
@@ -0,0 +1,14 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+from .rtdetr import RTDETR
+from .matcher import HungarianMatcher
+from .hybrid_encoder import HybridEncoder
+from .rtdetr_decoder import RTDETRTransformer
+from .rtdetr_criterion import RTDETRCriterion
+from .rtdetr_postprocessor import RTDETRPostProcessor
+
+# v2
+from .rtdetrv2_decoder import RTDETRTransformerv2
+from .rtdetrv2_criterion import RTDETRCriterionv2
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/box_ops.py b/rtdetrv2_pytorch/src/zoo/rtdetr/box_ops.py
new file mode 100644
index 0000000..9c52c2b
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/box_ops.py
@@ -0,0 +1,90 @@
+"""
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+https://github.com/facebookresearch/detr/blob/main/util/box_ops.py
+"""
+
+import torch
+from torch import Tensor
+from torchvision.ops.boxes import box_area
+
+
+def box_cxcywh_to_xyxy(x: Tensor) -> Tensor:
+    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) along the last dimension."""
+    cx, cy, width, height = x.unbind(-1)
+    half_w, half_h = 0.5 * width, 0.5 * height
+    return torch.stack((cx - half_w, cy - half_h, cx + half_w, cy + half_h), dim=-1)
+
+
+def box_xyxy_to_cxcywh(x: Tensor) -> Tensor:
+    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) along the last dimension."""
+    left, top, right, bottom = x.unbind(-1)
+    centre_and_size = ((left + right) / 2, (top + bottom) / 2, right - left, bottom - top)
+    return torch.stack(centre_and_size, dim=-1)
+
+
+# modified from torchvision to also return the union
+def box_iou(boxes1: Tensor, boxes2: Tensor):
+    """Return the pairwise IoU and the union area, each of shape [N, M], for xyxy boxes."""
+    # Intersection rectangle: largest top-left corner, smallest bottom-right corner.
+    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    # Non-overlapping pairs give negative extents; clamp them so their area is zero.
+    overlap = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    inter = overlap[..., 0] * overlap[..., 1]  # [N,M]
+
+    # union = area(boxes1) + area(boxes2) - intersection
+    union = box_area(boxes1)[:, None] + box_area(boxes2) - inter
+
+    return inter / union, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """Compute the pairwise Generalized IoU (https://giou.stanford.edu/).
+
+    Both inputs are expected in [x0, y0, x1, y1] format.
+
+    Returns:
+        A [N, M] matrix with N = len(boxes1) and M = len(boxes2).
+    """
+    # Reject degenerate boxes (x1 < x0 or y1 < y0) up front: they would
+    # otherwise surface later as inf / nan values.
+    for boxes in (boxes1, boxes2):
+        assert (boxes[:, 2:] >= boxes[:, :2]).all()
+    iou, union = box_iou(boxes1, boxes2)
+
+    # Smallest axis-aligned box enclosing each pair.
+    enclose_lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    enclose_rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+    enclose_wh = (enclose_rb - enclose_lt).clamp(min=0)  # [N,M,2]
+    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
+
+    # GIoU = IoU - (part of the enclosing box not covered by the union) / enclosing area
+    return iou - (enclose_area - union) / enclose_area
+
+
+def masks_to_boxes(masks):
+    """Compute the tight bounding box around each mask.
+
+    Args:
+        masks: [N, H, W] tensor of N masks over an (H, W) grid.
+    Returns: a [N, 4] tensor of xyxy boxes (a [0, 4] tensor when `masks` is empty).
+    """
+    if masks.numel() == 0:
+        return torch.zeros((0, 4), device=masks.device)
+
+    h, w = masks.shape[-2:]
+    # Build the coordinate grids on the masks' device so the products below never mix devices.
+    y = torch.arange(0, h, dtype=torch.float, device=masks.device)
+    x = torch.arange(0, w, dtype=torch.float, device=masks.device)
+    y, x = torch.meshgrid(y, x, indexing='ij')  # explicit 'ij' keeps the [H, W] grid layout
+    # max over (mask * coord) is the far edge; filling unset pixels with 1e8 before min gives the near edge
+    x_mask = (masks * x.unsqueeze(0))
+    x_max = x_mask.flatten(1).max(-1)[0]
+    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    y_mask = (masks * y.unsqueeze(0))
+    y_max = y_mask.flatten(1).max(-1)[0]
+    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+    return torch.stack([x_min, y_min, x_max, y_max], 1)
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/conver_params.py b/rtdetrv2_pytorch/src/zoo/rtdetr/conver_params.py
new file mode 100644
index 0000000..e93366a
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/conver_params.py
@@ -0,0 +1,72 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+
+def main(args) -> None:
+    """Convert a Paddle RT-DETR checkpoint (`args.pdparams`) into a PyTorch one (`args.output_file`).
+
+    Parameters are paired purely by position, so the model built from `args.config` must enumerate
+    its state in the same order as the Paddle checkpoint. Raises ValueError for an unknown `args.version`.
+    """
+    if args.version not in (1, 2):
+        raise ValueError(f'unsupported version: {args.version} (expected 1 or 2)')
+
+    import cvperception
+    from cvperception.core import load_config, merge_config, create
+    cfg = load_config(args.config)
+    model: torch.nn.Module = create(cfg['model'], merge_config(cfg))
+
+    state = model.state_dict()
+    keys = [k for k in state.keys() if 'num_batches_tracked' not in k]
+    if args.version == 2:
+        # v2-only entries that are skipped when pairing with the Paddle checkpoint
+        ignore_keys = ['anchors', 'valid_mask', 'num_points_scale']
+        keys = [k for k in keys if not any(x in k for x in ignore_keys)]
+
+    import paddle
+    p_state = paddle.load(args.pdparams)
+    pkeys = list(p_state.keys())
+    assert len(keys) == len(pkeys), f'{len(keys)}, {len(pkeys)}'
+
+    new_state = {}
+    for k, pk in zip(keys, pkeys):
+        pp = torch.tensor(p_state[pk].numpy())
+        if 'denoising_class_embed' in k:
+            # append one all-zero row to the Paddle embedding table
+            new_state[k] = torch.concat([pp, torch.zeros(1, pp.shape[-1])], dim=0)
+            continue
+
+        tp = state[k]
+        if len(tp.shape) == 2:
+            # 2-D tensors are transposed -- presumably Paddle stores Linear weights as [in, out]; TODO confirm
+            new_state[k] = pp.T
+        elif len(tp.shape) == 1:
+            new_state[k] = pp
+        else:
+            assert tp.shape == pp.shape, f'{k}, {pp.shape}, {tp.shape}'
+            new_state[k] = pp
+    assert len(new_state) == len(p_state), ''
+
+    # Load into the model first so entries absent from `new_state` keep the model's own values.
+    model.load_state_dict(new_state, strict=False)
+    checkpoint = {'ema': {'module': model.state_dict(), }}
+    torch.save(checkpoint, args.output_file)
+
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, )
+    parser.add_argument('-p', '--pdparams', type=str, )
+    parser.add_argument('-o', '--output_file', type=str, )
+    parser.add_argument('-v', '--version', type=int, default=1)
+    # -c: model config, -p: Paddle .pdparams input, -o: output checkpoint path, -v: 1 or 2 (see main)
+    args = parser.parse_args()
+    main(args)
+    
+    # python ./src/cvperception/zoo/rtdetr/conver_params.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -p rtdetr_r18vd_dec3_6x_coco.pdparams -o rtdetr_r18vd_dec3_6x_coco_new.pth
+    # python ./src/cvperception/zoo/rtdetr/conver_params.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -p rtdetr_r18vd_5x_coco_objects365.pdparams -o rtdetr_r18vd_5x_coco_objects365_new.pth
+    # python ./src/cvperception/zoo/rtdetr/conver_params.py -c configs/rtdetrv2/rtdetrv2_r50vd_120e_coco.yml -p rtdetr_r50vd_1x_objects365.pdparams -o rtdetrv2_r50vd_1x_objects365_new.pth -v 2
+    
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/denoising.py b/rtdetrv2_pytorch/src/zoo/rtdetr/denoising.py
new file mode 100644
index 0000000..4723b67
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/denoising.py
@@ -0,0 +1,104 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+
+from .utils import inverse_sigmoid
+from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
+
+
+
+def get_contrastive_denoising_training_group(targets,
+                                             num_classes,
+                                             num_queries,
+                                             class_embed,
+                                             num_denoising=100,
+                                             label_noise_ratio=0.5,
+                                             box_noise_scale=1.0,):
+    """Build contrastive denoising (CDN) queries, their attention mask and metadata for a batch.
+
+    Args:
+        targets: per-image dicts with 'labels' and 'boxes'; boxes are treated as (cx, cy, w, h) in [0, 1].
+        num_classes: number of classes; also the label written into padded slots.
+        num_queries: number of matching queries (only used to size the attention mask).
+        class_embed: callable applied to the noised label tensor to produce the query embeddings.
+        num_denoising: denoising budget; there are `num_denoising // max_gt_num` groups (at least 1).
+        label_noise_ratio: each GT label is re-drawn at random with probability `label_noise_ratio * 0.5`.
+        box_noise_scale: scale of the box jitter; values <= 0 leave the boxes un-noised.
+
+    Returns:
+        (input_query_logits, input_query_bbox_unact, attn_mask, dn_meta), or four `None`s when
+        `num_denoising <= 0` or the batch holds no ground-truth boxes.
+    """
+    if num_denoising <= 0:
+        return None, None, None, None
+
+    num_gts = [len(t['labels']) for t in targets]
+    device = targets[0]['labels'].device
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_group = max(num_denoising // max_gt_num, 1)
+    # pad gt to max_num of a batch
+    bs = len(num_gts)
+    input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device)
+    input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device)
+    pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device)
+    for i in range(bs):
+        num_gt = num_gts[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets[i]['labels']
+            input_query_bbox[i, :num_gt] = targets[i]['boxes']
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_group])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
+    # positive and negative mask
+    negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device)
+    negative_gt_mask[:, max_gt_num:] = 1
+    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
+    dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(max_gt_num * 2 * num_group)
+
+    if label_noise_ratio > 0:
+        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
+        # randomly put a new one here
+        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
+
+    if box_noise_scale > 0:
+        known_bbox = box_cxcywh_to_xyxy(input_query_bbox)
+        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale
+        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
+        rand_part = torch.rand_like(input_query_bbox)
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)
+        known_bbox += (rand_sign * rand_part * diff)
+        known_bbox = torch.clip(known_bbox, min=0.0, max=1.0)
+        input_query_bbox = box_xyxy_to_cxcywh(known_bbox)
+    # Computed unconditionally so the returned name is bound even when no box noise is applied.
+    input_query_bbox_unact = inverse_sigmoid(input_query_bbox)
+    input_query_logits = class_embed(input_query_class)
+
+    tgt_size = num_denoising + num_queries
+    attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device)
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    # reconstruct cannot see each other: every group is blind to all other denoising groups
+    for i in range(num_group):
+        start, end = max_gt_num * 2 * i, max_gt_num * 2 * (i + 1)
+        attn_mask[start:end, :start] = True
+        attn_mask[start:end, end:num_denoising] = True
+
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+    return input_query_logits, input_query_bbox_unact, attn_mask, dn_meta
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/hybrid_encoder.py b/rtdetrv2_pytorch/src/zoo/rtdetr/hybrid_encoder.py
new file mode 100644
index 0000000..e8c22cc
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/hybrid_encoder.py
@@ -0,0 +1,330 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import copy
+from collections import OrderedDict
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+from .utils import get_activation
+
+from ...core import register
+
+
+__all__ = ['HybridEncoder']
+
+
+
+class ConvNormLayer(nn.Module):
+    """Conv2d -> BatchNorm2d -> optional activation."""
+
+    def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
+        super().__init__()
+        if padding is None:
+            # default padding derived from the kernel size
+            padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv2d(ch_in, ch_out, kernel_size, stride, padding=padding, bias=bias)
+        self.norm = nn.BatchNorm2d(ch_out)
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        y = self.norm(self.conv(x))
+        return self.act(y)
+
+
+class RepVggBlock(nn.Module):
+    """Re-parameterizable conv block.
+
+    Training form: a 3x3 and a 1x1 conv+BN branch are summed, then activated.
+    Deploy form: `convert_to_deploy` fuses both branches into one 3x3 conv (`self.conv`).
+    """
+
+    def __init__(self, ch_in, ch_out, act='relu'):
+        super().__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None)
+        self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None)
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        # Once `convert_to_deploy` has run, the fused conv replaces both branches.
+        if hasattr(self, 'conv'):
+            return self.act(self.conv(x))
+        return self.act(self.conv1(x) + self.conv2(x))
+
+    def convert_to_deploy(self):
+        """Create (if needed) the fused 3x3 conv and load it with the equivalent kernel and bias."""
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)
+        # conv1 / conv2 are kept; forward() simply prefers `self.conv` once it exists.
+        fused_kernel, fused_bias = self.get_equivalent_kernel_bias()
+        self.conv.weight.data = fused_kernel
+        self.conv.bias.data = fused_bias
+
+    def get_equivalent_kernel_bias(self):
+        """Return the (kernel, bias) of a single 3x3 conv equivalent to the sum of both branches."""
+        k3, b3 = self._fuse_bn_tensor(self.conv1)
+        k1, b1 = self._fuse_bn_tensor(self.conv2)
+        return k3 + self._pad_1x1_to_3x3_tensor(k1), b3 + b1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        """Zero-pad a 1x1 kernel to 3x3; a missing kernel contributes 0."""
+        return 0 if kernel1x1 is None else F.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch: ConvNormLayer):
+        """Fold the branch's BatchNorm running statistics and affine terms into its conv kernel."""
+        if branch is None:
+            return 0, 0
+        bn = branch.norm
+        # BN(conv(x)) = conv(x) * gamma / std + (beta - mean * gamma / std)
+        std = (bn.running_var + bn.eps).sqrt()
+        scale = (bn.weight / std).reshape(-1, 1, 1, 1)
+        return branch.conv.weight * scale, bn.bias - bn.running_mean * bn.weight / std
+
+
+class CSPRepLayer(nn.Module):
+    """Two 1x1-conv paths over the same input, one followed by RepVgg blocks, summed and projected."""
+
+    def __init__(self, in_channels, out_channels, num_blocks=3, expansion=1.0, bias=None, act="silu"):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+
+        def pointwise(c_in, c_out):
+            # 1x1 ConvNormLayer sharing this layer's bias / activation settings
+            return ConvNormLayer(c_in, c_out, 1, 1, bias=bias, act=act)
+
+        # Creation order matters for seeded initialisation: conv1, conv2, bottlenecks, conv3.
+        self.conv1 = pointwise(in_channels, hidden_channels)
+        self.conv2 = pointwise(in_channels, hidden_channels)
+        blocks = [RepVggBlock(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks)]
+        self.bottlenecks = nn.Sequential(*blocks)
+        # The output projection is only needed when `expansion` changes the channel count.
+        needs_proj = hidden_channels != out_channels
+        self.conv3 = pointwise(hidden_channels, out_channels) if needs_proj else nn.Identity()
+
+    def forward(self, x):
+        # both paths read the same input; their sum goes through conv3
+        main_path = self.bottlenecks(self.conv1(x))
+        skip_path = self.conv2(x)
+        return self.conv3(main_path + skip_path)
+
+
+# transformer
+class TransformerEncoderLayer(nn.Module):
+    """Single transformer encoder layer (self-attention + feed-forward) with pre- or post-norm."""
+
+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
+                 activation="relu", normalize_before=False):
+        super().__init__()
+        self.normalize_before = normalize_before
+
+        # attention block
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True)
+        # feed-forward block
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        # per-sub-layer normalisation and residual dropout
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.activation = get_activation(activation)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        """Add `pos_embed` to `tensor`; return `tensor` unchanged when it is None."""
+        if pos_embed is None:
+            return tensor
+        return tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+        """Run the layer on `src`; positions are added to the queries/keys only, not to the values."""
+        pre_norm = self.normalize_before
+
+        # --- self-attention sub-layer ---
+        attn_in = self.norm1(src) if pre_norm else src
+        query = key = self.with_pos_embed(attn_in, pos_embed)
+        attn_out, _ = self.self_attn(query, key, value=attn_in, attn_mask=src_mask)
+        src = src + self.dropout1(attn_out)
+        if not pre_norm:
+            src = self.norm1(src)
+
+        # --- feed-forward sub-layer ---
+        ffn_in = self.norm2(src) if pre_norm else src
+        ffn_out = self.linear2(self.dropout(self.activation(self.linear1(ffn_in))))
+        src = src + self.dropout2(ffn_out)
+        return src if pre_norm else self.norm2(src)
+
+
+class TransformerEncoder(nn.Module):
+    """Stack of `num_layers` independent deep copies of `encoder_layer`, with an optional final norm."""
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        clones = [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+        self.layers = nn.ModuleList(clones)
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor:
+        """Feed `src` through every layer in order, then through `norm` when one was given."""
+        hidden = src
+        for block in self.layers:
+            hidden = block(hidden, src_mask=src_mask, pos_embed=pos_embed)
+        return hidden if self.norm is None else self.norm(hidden)
+
+
+@register()
+class HybridEncoder(nn.Module):
+    __share__ = ['eval_spatial_size', ]
+    # Encoder: 1x1 input projections, transformer layers on selected levels, then top-down FPN and bottom-up PAN.
+    def __init__(self,
+                 in_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 hidden_dim=256,
+                 nhead=8,
+                 dim_feedforward = 1024,
+                 dropout=0.0,
+                 enc_act='gelu',
+                 use_encoder_idx=[2],
+                 num_encoder_layers=1,
+                 pe_temperature=10000,
+                 expansion=1.0,
+                 depth_mult=1.0,
+                 act='silu',
+                 eval_spatial_size=None, 
+                 version='v2'):
+        super().__init__()
+        self.in_channels = in_channels
+        self.feat_strides = feat_strides
+        self.hidden_dim = hidden_dim
+        self.use_encoder_idx = use_encoder_idx
+        self.num_encoder_layers = num_encoder_layers
+        self.pe_temperature = pe_temperature
+        self.eval_spatial_size = eval_spatial_size        
+        self.out_channels = [hidden_dim for _ in range(len(in_channels))]
+        self.out_strides = feat_strides
+        
+        # channel projection: 1x1 conv + BN per input level; v1 and v2 differ only in submodule naming
+        self.input_proj = nn.ModuleList()
+        for in_channel in in_channels:
+            if version == 'v1':
+                proj = nn.Sequential(
+                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
+                    nn.BatchNorm2d(hidden_dim))
+            elif version == 'v2':
+                proj = nn.Sequential(OrderedDict([
+                    ('conv', nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)),
+                    ('norm', nn.BatchNorm2d(hidden_dim))
+                ]))
+            else:
+                raise AttributeError()
+                
+            self.input_proj.append(proj)
+
+        # encoder transformer
+        encoder_layer = TransformerEncoderLayer(
+            hidden_dim, 
+            nhead=nhead,
+            dim_feedforward=dim_feedforward, 
+            dropout=dropout,
+            activation=enc_act)
+        # one independent encoder stack per entry of use_encoder_idx
+        self.encoder = nn.ModuleList([
+            TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx))
+        ])
+
+        # top-down fpn
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_blocks = nn.ModuleList()
+        for _ in range(len(in_channels) - 1, 0, -1):
+            self.lateral_convs.append(ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act))
+            self.fpn_blocks.append(
+                CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
+            )
+
+        # bottom-up pan
+        self.downsample_convs = nn.ModuleList()
+        self.pan_blocks = nn.ModuleList()
+        for _ in range(len(in_channels) - 1):
+            self.downsample_convs.append(
+                ConvNormLayer(hidden_dim, hidden_dim, 3, 2, act=act)
+            )
+            self.pan_blocks.append(
+                CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion)
+            )
+
+        self._reset_parameters()
+    # Cache the eval-time position embedding of every encoded level when eval_spatial_size is set.
+    def _reset_parameters(self):
+        if self.eval_spatial_size:
+            for idx in self.use_encoder_idx:
+                stride = self.feat_strides[idx]
+                pos_embed = self.build_2d_sincos_position_embedding(
+                    self.eval_spatial_size[1] // stride, self.eval_spatial_size[0] // stride,
+                    self.hidden_dim, self.pe_temperature)
+                setattr(self, f'pos_embed{idx}', pos_embed)
+                # stored as a plain attribute (not a registered buffer); forward() moves it to the input device
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.):
+        """Return a [1, w*h, embed_dim] sin-cos position embedding for a w x h grid.
+        Channel layout: [sin(w), cos(w), sin(h), cos(h)], each embed_dim // 4 wide."""
+        grid_w = torch.arange(int(w), dtype=torch.float32)
+        grid_h = torch.arange(int(h), dtype=torch.float32)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij')
+        assert embed_dim % 4 == 0, \
+            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        pos_dim = embed_dim // 4
+        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+        omega = 1. / (temperature ** omega)
+        # NOTE(review): the grid is flattened w-major, whereas forward() flattens [H, W] h-major -- confirm intended.
+        out_w = grid_w.flatten()[..., None] @ omega[None]
+        out_h = grid_h.flatten()[..., None] @ omega[None]
+
+        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]
+    # feats: one feature map per in_channels entry; returns one fused map per level, in the same level order.
+    def forward(self, feats):
+        assert len(feats) == len(self.in_channels)
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        
+        # encoder
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.use_encoder_idx):
+                h, w = proj_feats[enc_ind].shape[2:]
+                # flatten [B, C, H, W] to [B, HxW, C]
+                src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1)
+                if self.training or self.eval_spatial_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device)
+                else:
+                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device)
+                # attend over the flattened tokens, then restore the [B, C, H, W] layout
+                memory :torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
+                proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous()
+
+        # top-down path: upsample the coarser feature x2 and fuse it with the next finer level
+        inner_outs = [proj_feats[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = proj_feats[idx - 1]
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
+            inner_outs[0] = feat_heigh
+            upsample_feat = F.interpolate(feat_heigh, scale_factor=2., mode='nearest')
+            inner_out = self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1))
+            inner_outs.insert(0, inner_out)
+        # bottom-up path: downsample the previous output and fuse it with the next top-down feature
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsample_convs[idx](feat_low)
+            out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1))
+            outs.append(out)
+
+        return outs
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/matcher.py b/rtdetrv2_pytorch/src/zoo/rtdetr/matcher.py
new file mode 100644
index 0000000..580ea91
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/matcher.py
@@ -0,0 +1,111 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+Modules to compute the matching cost and solve the corresponding LSAP.
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F 
+
+from scipy.optimize import linear_sum_assignment
+from typing import Dict 
+
+from .box_ops import box_cxcywh_to_xyxy, generalized_box_iou
+
+from ...core import register
+
+
+@register()
+class HungarianMatcher(nn.Module):
+    """Bipartite (one-to-one) matcher between the network predictions and the targets.
+
+    The targets carry no explicit no_object entry, so in general there are more
+    predictions than targets. The lowest-cost predictions are matched 1-to-1 and
+    all remaining predictions stay un-matched (and are thus treated as non-objects).
+    """
+
+    __share__ = ['use_focal_loss', ]
+
+    def __init__(self, weight_dict, use_focal_loss=False, alpha=0.25, gamma=2.0):
+        """Creates the matcher
+
+        Params:
+            weight_dict: relative weights of the matching cost terms, read from the keys
+                'cost_class' (classification), 'cost_bbox' (box L1) and 'cost_giou' (GIoU)
+            use_focal_loss, alpha, gamma: switch to / parametrise the focal-style class cost
+        """
+        super().__init__()
+        self.use_focal_loss = use_focal_loss
+        self.alpha = alpha
+        self.gamma = gamma
+
+        self.cost_class = weight_dict['cost_class']
+        self.cost_bbox = weight_dict['cost_bbox']
+        self.cost_giou = weight_dict['cost_giou']
+
+        assert any(w != 0 for w in (self.cost_class, self.cost_bbox, self.cost_giou)), "all costs cant be 0"
+
+    @torch.no_grad()
+    def forward(self, outputs: Dict[str, torch.Tensor], targets):
+        """ Performs the matching
+
+        Params:
+            outputs: This is a dict that contains at least these entries:
+                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
+
+            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
+                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
+                           objects in the target) containing the class labels
+                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
+
+        Returns:
+            A dict {'indices': matches}; matches is a list of size batch_size of (index_i, index_j) tuples where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        bs, num_queries = outputs["pred_logits"].shape[:2]
+
+        # The batch dimension is folded into the query dimension so that every cost
+        # term is computed for the whole batch with a single call.
+        logits = outputs["pred_logits"].flatten(0, 1)  # [batch_size * num_queries, num_classes]
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Targets of all images, concatenated along the box dimension
+        tgt_ids = torch.cat([t["labels"] for t in targets])
+        tgt_bbox = torch.cat([t["boxes"] for t in targets])
+
+        # Classification cost. Contrary to the loss, the softmax branch does not use
+        # the NLL but approximates it by 1 - proba[target class]; the constant 1 is
+        # omitted because it does not change the matching.
+        if self.use_focal_loss:
+            prob = F.sigmoid(logits)[:, tgt_ids]
+            neg_cost_class = (1 - self.alpha) * (prob ** self.gamma) * (-(1 - prob + 1e-8).log())
+            pos_cost_class = self.alpha * ((1 - prob) ** self.gamma) * (-(prob + 1e-8).log())
+            cost_class = pos_cost_class - neg_cost_class
+        else:
+            cost_class = -logits.softmax(-1)[:, tgt_ids]
+
+        # L1 distance between every predicted box and every target box
+        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
+
+        # Negated GIoU between the same pairs, boxes converted from cxcywh to xyxy
+        pred_xyxy = box_cxcywh_to_xyxy(out_bbox)
+        tgt_xyxy = box_cxcywh_to_xyxy(tgt_bbox)
+        cost_giou = -generalized_box_iou(pred_xyxy, tgt_xyxy)
+
+        # Weighted sum of the three terms, reshaped to [batch_size, num_queries, total targets]
+        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
+        C = C.view(bs, num_queries, -1).cpu()
+
+        # Every image is solved independently on its own slice of target columns
+        indices = []
+        for i, c in enumerate(C.split([len(t["boxes"]) for t in targets], -1)):
+            rows, cols = linear_sum_assignment(c[i])
+            indices.append((torch.as_tensor(rows, dtype=torch.int64), torch.as_tensor(cols, dtype=torch.int64)))
+        return {'indices': indices}
+        
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr.py
new file mode 100644
index 0000000..373f7bf
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr.py
@@ -0,0 +1,44 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+import random 
+import numpy as np 
+from typing import List 
+
+from ...core import register
+
+
+__all__ = ['RTDETR', ]
+
+
+@register()
+class RTDETR(nn.Module):
+    """End-to-end detector that chains ``backbone`` -> ``encoder`` -> ``decoder``."""
+    __inject__ = ['backbone', 'encoder', 'decoder', ]
+
+    def __init__(self, backbone: nn.Module, encoder: nn.Module, decoder: nn.Module):
+        super().__init__()
+        # Assignment order (backbone, decoder, encoder) is the order in which the
+        # sub-modules get registered, so it is left untouched.
+        self.backbone = backbone
+        self.decoder = decoder
+        self.encoder = encoder
+
+    def forward(self, x, targets=None):
+        """Run the three stages in sequence; only the decoder receives ``targets``."""
+        feats = self.backbone(x)
+        feats = self.encoder(feats)
+        return self.decoder(feats, targets)
+
+    def deploy(self, ):
+        """Switch to eval mode, call ``convert_to_deploy()`` wherever defined, return self."""
+        self.eval()
+        for module in self.modules():
+            if not hasattr(module, 'convert_to_deploy'):
+                continue
+            module.convert_to_deploy()
+        return self
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_criterion.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_criterion.py
new file mode 100644
index 0000000..ab269e8
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_criterion.py
@@ -0,0 +1,282 @@
+"""
+reference: 
+https://github.com/facebookresearch/detr/blob/main/models/detr.py
+
+Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+
+import torch 
+import torch.nn as nn 
+import torch.distributed
+import torch.nn.functional as F 
+import torchvision
+
+from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou
+from ...misc.dist_utils import get_world_size, is_dist_available_and_initialized
+from ...core import register
+
+
+
+@register()
+class RTDETRCriterion(nn.Module):
+    """ This class computes the loss for DETR.
+    The process happens in two steps:
+        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
+        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
+    """
+    __share__ = ['num_classes', ]
+    __inject__ = ['matcher', ]
+
+    def __init__(self, matcher, weight_dict, losses, alpha=0.2, gamma=2.0, eos_coef=1e-4, num_classes=80):
+        """ Create the criterion.
+        Parameters:
+            num_classes: number of object categories, omitting the special no-object category
+            matcher: module able to compute a matching between targets and proposals
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+            eos_coef: relative classification weight applied to the no-object category
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+            alpha, gamma: parameters of the focal and vfl classification losses
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.weight_dict = weight_dict
+        self.losses = losses
+
+        empty_weight = torch.ones(self.num_classes + 1)
+        empty_weight[-1] = eos_coef
+        self.register_buffer('empty_weight', empty_weight)
+
+        self.alpha = alpha
+        self.gamma = gamma
+
+    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
+        """Classification loss (NLL)
+        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
+        """
+        assert 'pred_logits' in outputs
+        src_logits = outputs['pred_logits']
+
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+
+        loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight)
+        losses = {'loss_ce': loss_ce}
+
+        if log:
+            # TODO this should probably be a separate loss, not hacked in this one here
+            losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]
+        return losses
+
+    def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True):
+        """Sigmoid focal classification loss; un-matched queries get an all-zero target."""
+        assert 'pred_logits' in outputs
+        src_logits = outputs['pred_logits']
+
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+
+        target = F.one_hot(target_classes, num_classes=self.num_classes+1)[..., :-1]
+        loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none')
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+
+        return {'loss_focal': loss}
+
+    def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True):
+        """Varifocal-style classification loss: the matched-box IoU is the soft target."""
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+
+        src_boxes = outputs['pred_boxes'][idx]
+        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+        ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes))
+        ious = torch.diag(ious).detach()
+
+        src_logits = outputs['pred_logits']
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+
+        target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype)
+        target_score_o[idx] = ious.to(target_score_o.dtype)
+        target_score = target_score_o.unsqueeze(-1) * target
+
+        pred_score = F.sigmoid(src_logits).detach()
+        weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score
+
+        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none')
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {'loss_vfl': loss}
+
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
+        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
+        """
+        pred_logits = outputs['pred_logits']
+        device = pred_logits.device
+        tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
+        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
+        losses = {'cardinality_error': card_err}
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
+           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
+           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+        src_boxes = outputs['pred_boxes'][idx]
+        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        losses = {}
+
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
+        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
+
+        loss_giou = 1 - torch.diag(generalized_box_iou(
+            box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)))
+        losses['loss_giou'] = loss_giou.sum() / num_boxes
+        return losses
+
+    def _get_src_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
+        src_idx = torch.cat([src for (src, _) in indices])
+        return batch_idx, src_idx
+
+    def _get_tgt_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
+        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
+        return batch_idx, tgt_idx
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        """Dispatch to the loss method registered under the name ``loss``."""
+        loss_map = {
+            'labels': self.loss_labels,
+            'boxes': self.loss_boxes,
+            'cardinality': self.loss_cardinality,
+            'focal': self.loss_labels_focal,
+            'vfl': self.loss_labels_vfl,
+        }
+        assert loss in loss_map, f'do you really want to compute {loss} loss?'
+        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
+
+    def forward(self, outputs, targets, **kwargs):
+        """ This performs the loss computation.
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size.
+                      The expected keys in each dict depends on the losses applied, see each loss' doc
+             kwargs: accepted for call-site compatibility; not forwarded to the individual losses
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        if is_dist_available_and_initialized():
+            torch.distributed.all_reduce(num_boxes)
+        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)['indices']
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
+            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+            losses.update(l_dict)
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if 'aux_outputs' in outputs:
+            for i, aux_outputs in enumerate(outputs['aux_outputs']):
+                indices = self.matcher(aux_outputs, targets)['indices']
+                for loss in self.losses:
+                    if loss == 'masks':
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    # Logging is enabled only for the last layer
+                    loss_kwargs = {'log': False} if loss == 'labels' else {}
+
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **loss_kwargs)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        # In case of cdn auxiliary losses. For rtdetr
+        if 'dn_aux_outputs' in outputs:
+            assert 'dn_meta' in outputs, ''
+            indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets)
+            dn_num_boxes = num_boxes * outputs['dn_meta']['dn_num_group']
+            for i, aux_outputs in enumerate(outputs['dn_aux_outputs']):
+                for loss in self.losses:
+                    if loss == 'masks':
+                        # Intermediate masks losses are too costly to compute, we ignore them.
+                        continue
+                    # Built per loss: a stale or caller-supplied dict must not leak in, only loss_labels takes `log`
+                    loss_kwargs = {'log': False} if loss == 'labels' else {}
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, dn_num_boxes, **loss_kwargs)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        return losses
+
+    @staticmethod
+    def get_cdn_matched_indices(dn_meta, targets):
+        """Pair the positive denoising queries of every image with its targets, the target
+        indices being repeated once per denoising group; empty pairs for images without targets."""
+        dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+        num_gts = [len(t['labels']) for t in targets]
+        device = targets[0]['labels'].device
+
+        dn_match_indices = []
+        for i, num_gt in enumerate(num_gts):
+            if num_gt > 0:
+                gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device)
+                gt_idx = gt_idx.tile(dn_num_group)
+                assert len(dn_positive_idx[i]) == len(gt_idx)
+                dn_match_indices.append((dn_positive_idx[i], gt_idx))
+            else:
+                dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device),
+                    torch.zeros(0, dtype=torch.int64,  device=device)))
+
+        return dn_match_indices
+
+
+@torch.no_grad()
+def accuracy(output, target, topk=(1,)):
+    """Return the precision@k, in percent, for every k in ``topk`` (zeros if ``target`` is empty)."""
+    if target.numel() == 0:
+        return [torch.zeros([], device=output.device) for _ in topk]
+    maxk = max(topk)
+    batch_size = target.size(0)
+    # pred: [maxk, N], class ids of the maxk highest scores of every sample
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        # reshape, not view: `correct` is not contiguous after `.t()`, so view fails for k > 1
+        correct_k = correct[:k].reshape(-1).float().sum(0)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+
+
+
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_decoder.py
new file mode 100644
index 0000000..536fbf8
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_decoder.py
@@ -0,0 +1,583 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import math 
+import copy 
+from collections import OrderedDict
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+import torch.nn.init as init 
+
+from .denoising import get_contrastive_denoising_training_group
+from .utils import deformable_attention_core_func, get_activation, inverse_sigmoid
+from .utils import bias_init_with_prob
+
+
+from ...core import register
+
+
+__all__ = ['RTDETRTransformer']
+
+
+
+class MLP(nn.Module):
+    """Stack of Linear layers with the activation applied after all but the last one."""
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'):
+        super().__init__()
+        self.num_layers = num_layers
+        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(nn.Linear(n_in, n_out) for n_in, n_out in zip(dims[:-1], dims[1:]))
+        self.act = get_activation(act) if act is not None else nn.Identity()
+
+    def forward(self, x):
+        for depth, layer in enumerate(self.layers, start=1):
+            x = self.act(layer(x)) if depth < self.num_layers else layer(x)
+        return x
+
+
+class MSDeformableAttention(nn.Module):
+    def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4,):
+        """Multi-Scale Deformable Attention Module.
+        Each head samples `num_points` locations on each of the `num_levels` feature maps.
+        """
+        super(MSDeformableAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_levels = num_levels
+        self.num_points = num_points
+        self.total_points = num_heads * num_levels * num_points
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2,)
+        self.attention_weights = nn.Linear(embed_dim, self.total_points)
+        self.value_proj = nn.Linear(embed_dim, embed_dim)
+        self.output_proj = nn.Linear(embed_dim, embed_dim)
+
+        self.ms_deformable_attn_core = deformable_attention_core_func
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        # sampling_offsets: zero weight; the bias gives each head its own direction, the k-th point k steps along it
+        init.constant_(self.sampling_offsets.weight, 0)
+        thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values
+        grid_init = grid_init.reshape(self.num_heads, 1, 1, 2).tile([1, self.num_levels, self.num_points, 1])
+        scaling = torch.arange(1, self.num_points + 1, dtype=torch.float32).reshape(1, 1, -1, 1)
+        grid_init *= scaling
+        self.sampling_offsets.bias.data[...] = grid_init.flatten()
+
+        # attention_weights: all zero, i.e. uniform weights after the softmax
+        init.constant_(self.attention_weights.weight, 0)
+        init.constant_(self.attention_weights.bias, 0)
+
+        # proj
+        init.xavier_uniform_(self.value_proj.weight)
+        init.constant_(self.value_proj.bias, 0)
+        init.xavier_uniform_(self.output_proj.weight)
+        init.constant_(self.output_proj.bias, 0)
+
+    def forward(self,
+                query,
+                reference_points,
+                value,
+                value_spatial_shapes,
+                value_mask=None):
+        """
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_mask (Tensor, optional): [bs, value_length], True for non-padding elements, False for
+                padding elements; the projected value is zeroed at the padding positions
+
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            # torch tensors have no `.astype`; `.to` casts the mask, and the product is taken out of place
+            value_mask = value_mask.to(value.dtype).unsqueeze(-1)
+            value = value * value_mask
+        value = value.reshape(bs, Len_v, self.num_heads, self.head_dim)
+
+        sampling_offsets = self.sampling_offsets(query).reshape(
+            bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2)
+        attention_weights = self.attention_weights(query).reshape(
+            bs, Len_q, self.num_heads, self.num_levels * self.num_points)
+        attention_weights = F.softmax(attention_weights, dim=-1).reshape(
+            bs, Len_q, self.num_heads, self.num_levels, self.num_points)
+
+        if reference_points.shape[-1] == 2:
+            # built on the offsets' device; flipped so that (H, W) lines up with the (x, y) offsets
+            offset_normalizer = torch.tensor(value_spatial_shapes, device=sampling_offsets.device)
+            offset_normalizer = offset_normalizer.flip([1]).reshape(
+                1, 1, 1, self.num_levels, 1, 2)
+            sampling_locations = reference_points.reshape(
+                bs, Len_q, 1, self.num_levels, 1, 2
+            ) + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2] + sampling_offsets /
+                self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5)
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".
+                format(reference_points.shape[-1]))
+
+        output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights)
+
+        output = self.output_proj(output)
+
+        return output
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Decoder layer: query self-attention, deformable cross-attention, then an FFN.
+
+    Each of the three sub-blocks is a residual add followed by a LayerNorm.
+    """
+
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,):
+        super(TransformerDecoderLayer, self).__init__()
+
+        # query-to-query self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # query-to-memory multi-scale deformable attention
+        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # feed-forward network; `activation` names a function of torch.nn.functional
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.activation = getattr(F, activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+
+    def with_pos_embed(self, tensor, pos):
+        """Add the positional embedding ``pos`` to ``tensor``; ``pos=None`` is a no-op."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward_ffn(self, tgt):
+        """linear1 -> activation -> dropout3 -> linear2."""
+        hidden = self.activation(self.linear1(tgt))
+        return self.linear2(self.dropout3(hidden))
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                attn_mask=None,
+                memory_mask=None,
+                query_pos_embed=None):
+        """Run the layer on the queries ``tgt`` and return the updated queries.
+
+        ``query_pos_embed`` is added to the queries/keys of the self attention and to
+        the queries of the cross attention, never to the values.
+        ``memory_level_start_index`` is accepted but not used by this layer.
+        """
+        # self attention: positional embedding on q and k only
+        qk = self.with_pos_embed(tgt, query_pos_embed)
+        attended, _ = self.self_attn(qk, qk, value=tgt, attn_mask=attn_mask)
+        tgt = self.norm1(tgt + self.dropout1(attended))
+
+        # cross attention over the memory
+        sampled = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos_embed),
+            reference_points,
+            memory,
+            memory_spatial_shapes,
+            memory_mask)
+        tgt = self.norm2(tgt + self.dropout2(sampled))
+
+        # ffn
+        ffn_out = self.forward_ffn(tgt)
+        tgt = tgt + self.dropout4(ffn_out)
+        # 65504 is the largest finite float16 magnitude; values are clamped to it before the last norm
+        tgt = self.norm3(tgt.clamp(min=-65504, max=65504))
+
+        return tgt
+
+
+class TransformerDecoder(nn.Module):
+    """Stack of decoder layers that refines the reference boxes after every layer."""
+    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+        super(TransformerDecoder, self).__init__()
+        self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)])
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        # a negative eval_idx counts back from the last layer
+        self.eval_idx = num_layers + eval_idx if eval_idx < 0 else eval_idx
+
+    def forward(self,
+                tgt,
+                ref_points_unact,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                bbox_head,
+                score_head,
+                query_pos_head,
+                attn_mask=None,
+                memory_mask=None):
+        """Return (boxes, logits), each stacked over the layers that emitted an output:
+        every layer in training; only layer ``eval_idx`` (where the loop stops) in eval.
+        """
+        output = tgt
+        boxes_per_layer, logits_per_layer = [], []
+        ref_detached = F.sigmoid(ref_points_unact)
+
+        for i, layer in enumerate(self.layers):
+            query_pos_embed = query_pos_head(ref_detached)
+            output = layer(output, ref_detached.unsqueeze(2), memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           attn_mask, memory_mask, query_pos_embed)
+
+            # the head output is added to `inverse_sigmoid` of the reference box, then squashed again
+            refined = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_detached))
+
+            if self.training:
+                logits_per_layer.append(score_head[i](output))
+                # from the second layer on, the supervised box builds on the previous layer's non-detached box
+                box = refined if i == 0 else F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))
+                boxes_per_layer.append(box)
+            elif i == self.eval_idx:
+                logits_per_layer.append(score_head[i](output))
+                boxes_per_layer.append(refined)
+                break
+
+            ref_points = refined
+            ref_detached = refined.detach() if self.training else refined
+
+        return torch.stack(boxes_per_layer), torch.stack(logits_per_layer)
+
+
+@register()
+class RTDETRTransformer(nn.Module):
+    __share__ = ['num_classes']
+    def __init__(self,
+                 num_classes=80,  # output width of every classification head
+                 hidden_dim=256,  # channel width shared by the projections, the decoder and the heads
+                 num_queries=300,  # number of top-scoring encoder proposals handed to the decoder
+                 position_embed_type='sine',  # validated below; not otherwise used in this constructor
+                 feat_channels=[512, 1024, 2048],  # channels of each incoming backbone feature map
+                 feat_strides=[8, 16, 32],  # stride of each feature map; extended by doubling up to num_levels
+                 num_levels=3,
+                 num_points=4,
+                 nhead=8,
+                 num_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 num_denoising=100,  # 0 disables the denoising branch
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learnt_init_query=False,  # True: learned query embeddings instead of gathered encoder features
+                 eval_spatial_size=None,  # (h, w); when set, anchors are precomputed once for evaluation
+                 eval_idx=-1,
+                 eps=1e-2,  # margin from 0 and 1 outside which an anchor is marked invalid
+                 aux_loss=True,  # emit auxiliary outputs from forward() while training
+                 version='v1'):  # 'v1' builds an unnamed enc_output Sequential, anything else a named one
+        """Build the input projections, decoder, denoising embedding and prediction heads."""
+        super(RTDETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(feat_channels) <= num_levels
+        assert len(feat_strides) == len(feat_channels)
+        # Build a new list instead of appending in place, so neither the caller's list nor the shared default grows.
+        feat_strides = [*feat_strides, *(feat_strides[-1] * 2 ** (k + 1) for k in range(num_levels - len(feat_strides)))]
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.feat_strides = feat_strides
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_layers = num_layers
+        self.eval_spatial_size = eval_spatial_size
+        self.aux_loss = aux_loss
+
+        # backbone feature projection
+        self._build_input_proj_layer(feat_channels)
+
+        # Transformer module
+        decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_points)
+        self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_layers, eval_idx)
+
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+        # denoising part
+        if num_denoising > 0:
+            # self.denoising_class_embed = nn.Embedding(num_classes, hidden_dim, padding_idx=num_classes-1) # TODO for load paddle weights
+            self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes)
+            init.normal_(self.denoising_class_embed.weight[:-1])  # the last row (padding index) is left untouched
+
+        # decoder embedding
+        self.learnt_init_query = learnt_init_query
+        if learnt_init_query:
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
+
+        # encoder head
+        if version == 'v1':
+            self.enc_output = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.LayerNorm(hidden_dim,)
+            )
+        else:
+            self.enc_output = nn.Sequential(OrderedDict([
+                ('proj', nn.Linear(hidden_dim, hidden_dim)),
+                ('norm', nn.LayerNorm(hidden_dim,)),
+            ]))
+
+        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+
+        # decoder head
+        self.dec_score_head = nn.ModuleList([
+            nn.Linear(hidden_dim, num_classes)
+            for _ in range(num_layers)
+        ])
+        self.dec_bbox_head = nn.ModuleList([
+            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+            for _ in range(num_layers)
+        ])
+
+        # init encoder output anchors and valid_mask
+        if self.eval_spatial_size:
+            self.anchors, self.valid_mask = self._generate_anchors()
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        bias = bias_init_with_prob(0.01)  # presumably the logit bias giving a 0.01 prior probability -- confirm in utils
+        # Encoder heads: constant bias on the classifier, all-zero final layer on the box regressor.
+        init.constant_(self.enc_score_head.bias, bias)
+        init.constant_(self.enc_bbox_head.layers[-1].weight, 0)
+        init.constant_(self.enc_bbox_head.layers[-1].bias, 0)
+        # Decoder heads: every (classifier, regressor) pair gets the same treatment as the encoder heads.
+        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
+            init.constant_(cls_.bias, bias)
+            init.constant_(reg_.layers[-1].weight, 0)
+            init.constant_(reg_.layers[-1].bias, 0)
+        # Xavier-uniform weights for the encoder output projection, the optional learned queries
+        # and both layers of the query-position MLP (index 0 of enc_output is its Linear layer).
+        init.xavier_uniform_(self.enc_output[0].weight)
+        if self.learnt_init_query:
+            init.xavier_uniform_(self.tgt_embed.weight)
+        init.xavier_uniform_(self.query_pos_head.layers[0].weight)
+        init.xavier_uniform_(self.query_pos_head.layers[1].weight)
+
+
+    def _build_input_proj_layer(self, feat_channels):
+        self.input_proj = nn.ModuleList()  # one conv + batch-norm projection per feature level
+        for in_channels in feat_channels:
+            self.input_proj.append(
+                nn.Sequential(OrderedDict([
+                    ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)),  # 1x1 conv to hidden_dim
+                    ('norm', nn.BatchNorm2d(self.hidden_dim,))])
+                )
+            )
+        # Levels beyond the supplied feature maps are produced by 3x3 stride-2 convolutions.
+        in_channels = feat_channels[-1]
+        # The first extra conv reads the last backbone map; later ones read hidden_dim channels.
+        for _ in range(self.num_levels - len(feat_channels)):
+            self.input_proj.append(
+                nn.Sequential(OrderedDict([
+                    ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)),
+                    ('norm', nn.BatchNorm2d(self.hidden_dim))])
+                )
+            )
+            in_channels = self.hidden_dim
+
+    def _get_encoder_input(self, feats):
+        # get projection features: one projection per incoming feature map
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        if self.num_levels > len(proj_feats):  # fewer maps than levels: synthesise the missing ones
+            len_srcs = len(proj_feats)
+            for i in range(len_srcs, self.num_levels):
+                if i == len_srcs:
+                    proj_feats.append(self.input_proj[i](feats[-1]))  # first extra level: from the raw last feature map
+                else:
+                    proj_feats.append(self.input_proj[i](proj_feats[-1]))  # later levels: from the previous projection
+
+        # get encoder inputs: flatten each level to tokens and record its [h, w] and start offset
+        feat_flatten = []
+        spatial_shapes = []
+        level_start_index = [0, ]
+        for i, feat in enumerate(proj_feats):
+            _, _, h, w = feat.shape
+            # [b, c, h, w] -> [b, h*w, c]
+            feat_flatten.append(feat.flatten(2).permute(0, 2, 1))
+            # [num_levels, 2]
+            spatial_shapes.append([h, w])
+            # [l], start index of each level
+            level_start_index.append(h * w + level_start_index[-1])
+
+        # [b, l, c] where l is the sum of h*w over all levels
+        feat_flatten = torch.concat(feat_flatten, 1)
+        level_start_index.pop()  # drop the trailing total so exactly one start offset per level remains
+        return (feat_flatten, spatial_shapes, level_start_index)
+
+    def _generate_anchors(self,
+                          spatial_shapes=None,  # list of [h, w] per level; None derives them from eval_spatial_size
+                          grid_size=0.05,  # anchor width/height at level 0; doubled at every following level
+                          dtype=torch.float32,
+                          device='cpu'):
+        if spatial_shapes is None:
+            spatial_shapes = [[int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)]
+                for s in self.feat_strides
+            ]
+        anchors = []
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            grid_y, grid_x = torch.meshgrid(\
+                torch.arange(end=h, dtype=dtype), \
+                torch.arange(end=w, dtype=dtype), indexing='ij')
+            grid_xy = torch.stack([grid_x, grid_y], -1)
+            valid_WH = torch.tensor([w, h]).to(dtype)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # cell centres divided by (w, h)
+            wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl)
+            anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, h * w, 4))  # [1, h*w, 4] as (cx, cy, w, h)
+        # Concatenate all levels along the anchor axis, then move the result to the requested device.
+        anchors = torch.concat(anchors, 1).to(device)
+        valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True)
+        anchors = torch.log(anchors / (1 - anchors))  # inverse sigmoid (logit) of every coordinate
+        # Anchors with any coordinate outside (eps, 1 - eps) are replaced by +inf below;
+        # valid_mask is boolean with shape [1, total_anchors, 1].
+        anchors = torch.where(valid_mask, anchors, torch.inf)
+
+        return anchors, valid_mask
+
+
+    def _get_decoder_input(self,
+                           memory,
+                           spatial_shapes,
+                           denoising_class=None,
+                           denoising_bbox_unact=None):
+        bs, _, _ = memory.shape
+        # prepare input for decoder: anchors are rebuilt from spatial_shapes unless cached eval anchors apply
+        if self.training or self.eval_spatial_size is None:
+            anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device)
+        else:
+            anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device)
+        # Zero out the tokens whose anchor is invalid (done by multiplication instead of torch.where).
+        # memory = torch.where(valid_mask, memory, 0)
+        memory = valid_mask.to(memory.dtype) * memory  # TODO fix type error for onnx export
+
+        output_memory = self.enc_output(memory)
+
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors
+        # Rank tokens by their highest class logit and keep the num_queries best per image.
+        _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1)
+        # Gather the unactivated boxes of the selected tokens (index repeated over the box coordinates).
+        reference_points_unact = enc_outputs_coord_unact.gather(dim=1, \
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_unact.shape[-1]))
+
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)  # activated boxes of the selected proposals only
+        if denoising_bbox_unact is not None:  # denoising boxes are placed in front of the selected ones
+            reference_points_unact = torch.concat(
+                [denoising_bbox_unact, reference_points_unact], 1)
+        # Class logits of the same selected tokens.
+        enc_topk_logits = enc_outputs_class.gather(dim=1, \
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]))
+
+        # extract region features: the initial decoder queries
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = output_memory.gather(dim=1, \
+                index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]))
+            target = target.detach()  # gathered queries are detached from the encoder graph
+
+        if denoising_class is not None:
+            target = torch.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits
+
+
+    def forward(self, feats, targets=None):
+        """Project feats, select queries and decode; returns {'pred_logits', 'pred_boxes'} plus aux/dn entries in training."""
+        # input projection and embedding
+        (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats)
+        
+        # prepare denoising training
+        if self.training and self.num_denoising > 0:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(targets, \
+                    self.num_classes, 
+                    self.num_queries, 
+                    self.denoising_class_embed, 
+                    num_denoising=self.num_denoising, 
+                    label_noise_ratio=self.label_noise_ratio, 
+                    box_noise_scale=self.box_noise_scale, )
+        else:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
+
+        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
+            self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact)
+
+        # decoder
+        out_bboxes, out_logits = self.decoder(
+            target,
+            init_ref_points_unact,
+            memory,
+            spatial_shapes,
+            level_start_index,
+            self.dec_bbox_head,
+            self.dec_score_head,
+            self.query_pos_head,
+            attn_mask=attn_mask)
+        # Split the stacked outputs along dim 2 (queries) into the denoising part and the regular part.
+        if self.training and dn_meta is not None:
+            dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2)
+            dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2)
+        # The last stacked entry provides the final predictions.
+        out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]}
+
+        if self.training and self.aux_loss:
+            out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1])
+            out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes]))  # encoder proposals too
+            # Denoising outputs are attached only when the denoising branch produced dn_meta this step.
+            if self.training and dn_meta is not None:
+                out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes)
+                out['dn_meta'] = dn_meta
+
+        return out
+
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        """Pair logits and boxes element-wise into a list of {'pred_logits', 'pred_boxes'} dicts.
+
+        Excluded from TorchScript, which cannot type a dict mixing Tensor and list values.
+        """
+        return [dict(pred_logits=cls_out, pred_boxes=box_out) for cls_out, box_out in zip(outputs_class, outputs_coord)]
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py
new file mode 100644
index 0000000..efe58fd
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py
@@ -0,0 +1,94 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+
+import torchvision
+
+from ...core import register
+
+
+__all__ = ['RTDETRPostProcessor']
+
+
+def mod(a, b):
+    """Return a - (a // b) * b, the remainder of a by b written with floor division only."""
+    return a - (a // b) * b
+
+
+@register()
+class RTDETRPostProcessor(nn.Module):
+    __share__ = [
+        'num_classes', 
+        'use_focal_loss', 
+        'num_top_queries', 
+        'remap_mscoco_category'
+    ]
+    # Turns raw 'pred_logits' / 'pred_boxes' into per-image labels, boxes and scores.
+    def __init__(
+        self, 
+        num_classes=80, 
+        use_focal_loss=True, 
+        num_top_queries=300, 
+        remap_mscoco_category=False
+    ) -> None:
+        super().__init__()
+        self.use_focal_loss = use_focal_loss
+        self.num_top_queries = num_top_queries
+        self.num_classes = int(num_classes)
+        self.remap_mscoco_category = remap_mscoco_category 
+        self.deploy_mode = False  # set by deploy(); makes forward() return a plain (labels, boxes, scores) tuple
+
+    def extra_repr(self) -> str:
+        return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'
+    
+    # orig_target_sizes: one size pair per image, presumably (width, height) to match the x/y box order -- confirm with caller
+    def forward(self, outputs, orig_target_sizes: torch.Tensor):
+        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
+        # Convert cxcywh boxes to xyxy, then scale them by each image's size pair repeated twice.
+
+        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
+        bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
+
+        if self.use_focal_loss:  # sigmoid scores: top-k over the flattened (query, class) pairs
+            scores = F.sigmoid(logits)
+            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
+            # TODO for older tensorrt
+            # labels = index % self.num_classes
+            labels = mod(index, self.num_classes)
+            index = index // self.num_classes  # flat index -> query index
+            boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))
+            
+        else:  # softmax scores: the last class column is dropped, then the best remaining class per query is kept
+            scores = F.softmax(logits, dim=-1)[:, :, :-1]
+            scores, labels = scores.max(dim=-1)
+            boxes = bbox_pred
+            if scores.shape[1] > self.num_top_queries:
+                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
+                labels = torch.gather(labels, dim=1, index=index)
+                boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+        
+        # TODO for onnx export
+        if self.deploy_mode:
+            return labels, boxes, scores
+
+        # TODO: per-element Python lookup; presumably maps label ids to MSCOCO category ids -- confirm in data.dataset
+        if self.remap_mscoco_category:
+            from ...data.dataset import mscoco_label2category
+            labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\
+                .to(boxes.device).reshape(labels.shape)
+
+        results = []
+        for lab, box, sco in zip(labels, boxes, scores):
+            result = dict(labels=lab, boxes=box, scores=sco)
+            results.append(result)
+        
+        return results
+        
+
+    def deploy(self, ):  # switch to eval mode and tuple outputs; returns self
+        self.eval()
+        self.deploy_mode = True
+        return self 
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_criterion.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_criterion.py
new file mode 100644
index 0000000..c69e368
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_criterion.py
@@ -0,0 +1,265 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import torch 
+import torch.nn as nn 
+import torch.distributed
+import torch.nn.functional as F 
+import torchvision
+
+import copy
+
+from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou
+from ...misc.dist_utils import get_world_size, is_dist_available_and_initialized
+from ...core import register
+
+
+@register()
+class RTDETRCriterionv2(nn.Module):
+    """ This class computes the loss for DETR.
+    The process happens in two steps:
+        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
+        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
+    """
+    __share__ = ['num_classes', ]
+    __inject__ = ['matcher', ]
+
+    def __init__(self, \
+        matcher, 
+        weight_dict, 
+        losses, 
+        alpha=0.2, 
+        gamma=2.0, 
+        num_classes=80, 
+        boxes_weight_format=None,
+        share_matched_indices=False):
+        """Create the criterion. Parameters:
+            matcher: module able to compute a matching between targets and proposals
+            weight_dict: dict containing as key the names of the losses and as values their relative weight.
+            losses: list of all the losses to be applied. See get_loss for list of available losses.
+            alpha, gamma: weighting factors used by the 'focal' and 'vfl' classification losses
+            num_classes: number of object categories, omitting the special no-object category
+            boxes_weight_format: None, 'iou' or 'giou'; metric that get_loss_meta_info passes on to the losses
+            share_matched_indices: if True, aux_outputs reuse the final-layer matching instead of re-matching
+        """
+        super().__init__()
+        self.num_classes = num_classes
+        self.matcher = matcher
+        self.weight_dict = weight_dict
+        self.losses = losses 
+        self.boxes_weight_format = boxes_weight_format
+        self.share_matched_indices = share_matched_indices
+        self.alpha = alpha
+        self.gamma = gamma
+
+    def loss_labels_focal(self, outputs, targets, indices, num_boxes):
+        """Sigmoid focal loss over all queries ('loss_focal'); unmatched queries get an all-zero target row."""
+        assert 'pred_logits' in outputs
+        src_logits = outputs['pred_logits']
+        idx = self._get_src_permutation_idx(indices)
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+        # one_hot yields int64, but the BCE inside sigmoid_focal_loss needs a floating target: cast to the logits dtype.
+        target = F.one_hot(target_classes, num_classes=self.num_classes+1)[..., :-1].to(src_logits.dtype)
+        loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none')
+        return {'loss_focal': loss.mean(1).sum() * src_logits.shape[1] / num_boxes}
+
+    def loss_labels_vfl(self, outputs, targets, indices, num_boxes, values=None):
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+        if values is None:  # no precomputed quality scores: use the IoU of each matched pair
+            src_boxes = outputs['pred_boxes'][idx]
+            target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+            ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes))
+            ious = torch.diag(ious).detach()  # IoU of each prediction with its own target, used as a constant
+        else:
+            ious = values
+        # Index num_classes fills unmatched queries; dropping the last one-hot column leaves their rows all zero.
+        src_logits = outputs['pred_logits']
+        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
+                                    dtype=torch.int64, device=src_logits.device)
+        target_classes[idx] = target_classes_o
+        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]
+        # Soft target: each matched query's one-hot row scaled by its IoU, zero everywhere else.
+        target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype)
+        target_score_o[idx] = ious.to(target_score_o.dtype)
+        target_score = target_score_o.unsqueeze(-1) * target
+        # Weight: alpha * p**gamma on the negative entries, the IoU-scaled target on the positive entries.
+        pred_score = F.sigmoid(src_logits).detach()
+        weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score
+        # Reduction: mean over dim 1, summed, rescaled by the query count and divided by num_boxes.
+        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none')
+        loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes
+        return {'loss_vfl': loss}
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes, boxes_weight=None):
+        """Compute the L1 ('loss_bbox') and GIoU ('loss_giou') losses of the matched boxes, each divided by num_boxes.
+           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
+           in format (center_x, center_y, w, h), normalized by the image size; boxes_weight scales only the GIoU term.
+        """
+        assert 'pred_boxes' in outputs
+        idx = self._get_src_permutation_idx(indices)
+        src_boxes = outputs['pred_boxes'][idx]
+        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        losses = {}
+        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
+        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
+        # The diagonal keeps the GIoU of each prediction with its own matched target.
+        loss_giou = 1 - torch.diag(generalized_box_iou(\
+            box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)))
+        loss_giou = loss_giou if boxes_weight is None else loss_giou * boxes_weight
+        losses['loss_giou'] = loss_giou.sum() / num_boxes
+        return losses
+
+    def _get_src_permutation_idx(self, indices):
+        """Return (batch indices, prediction indices) of all matches, concatenated over the batch."""
+        matched_src = [src for src, _ in indices]
+        image_ids = [torch.full_like(src, n) for n, src in enumerate(matched_src)]
+        return torch.cat(image_ids), torch.cat(matched_src)
+
+    def _get_tgt_permutation_idx(self, indices):
+        """Return (batch indices, target indices) of all matches, concatenated over the batch."""
+        matched_tgt = [tgt for _, tgt in indices]
+        image_ids = [torch.full_like(tgt, n) for n, tgt in enumerate(matched_tgt)]
+        return torch.cat(image_ids), torch.cat(matched_tgt)
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
+        """Dispatch to the loss method registered under the name `loss` ('boxes', 'focal' or 'vfl')."""
+        handlers = dict(
+            boxes=self.loss_boxes,
+            focal=self.loss_labels_focal,
+            vfl=self.loss_labels_vfl)
+        assert loss in handlers, f'do you really want to compute {loss} loss?'
+        return handlers[loss](outputs, targets, indices, num_boxes, **kwargs)
+
+    def forward(self, outputs, targets, **kwargs):
+        """ This performs the loss computation and returns a dict of weighted losses keyed by loss name
+        (auxiliary branches add the suffixes '_aux_{i}', '_dn_{i}' and '_enc_{i}').
+        Parameters:
+             outputs: dict of tensors, see the output specification of the model for the format
+             targets: list of dicts, such that len(targets) == batch_size; keys depend on the losses applied
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        if is_dist_available_and_initialized():
+            torch.distributed.all_reduce(num_boxes)
+        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        matched = self.matcher(outputs_without_aux, targets)
+        indices = matched['indices']
+
+        # Compute all the requested losses; only losses that have an entry in weight_dict are kept
+        losses = {}
+        for loss in self.losses:
+            meta = self.get_loss_meta_info(loss, outputs, targets, indices)
+            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes, **meta)
+            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+            losses.update(l_dict)
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if 'aux_outputs' in outputs:
+            for i, aux_outputs in enumerate(outputs['aux_outputs']):
+                if not self.share_matched_indices:  # otherwise the final-layer matching is reused
+                    matched = self.matcher(aux_outputs, targets)
+                    indices = matched['indices']
+                for loss in self.losses:
+                    meta = self.get_loss_meta_info(loss, aux_outputs, targets, indices)
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **meta)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        # In case of cdn auxiliary losses. For rtdetr
+        if 'dn_aux_outputs' in outputs:
+            assert 'dn_meta' in outputs, ''
+            indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets)  # fixed assignment, no matcher
+            dn_num_boxes = num_boxes * outputs['dn_meta']['dn_num_group']
+            for i, aux_outputs in enumerate(outputs['dn_aux_outputs']):
+                for loss in self.losses:
+                    meta = self.get_loss_meta_info(loss, aux_outputs, targets, indices)
+                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, dn_num_boxes, **meta)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        # In case of encoder auxiliary losses. For rtdetr v2
+        if 'enc_aux_outputs' in outputs:
+            assert 'enc_meta' in outputs, ''
+            class_agnostic = outputs['enc_meta']['class_agnostic']
+            if class_agnostic:
+                orig_num_classes = self.num_classes
+                self.num_classes = 1  # temporarily a single class; restored after the loop below
+                enc_targets = copy.deepcopy(targets)
+                for t in enc_targets:
+                    t['labels'] = torch.zeros_like(t["labels"])
+            else:
+                enc_targets = targets
+
+            for i, aux_outputs in enumerate(outputs['enc_aux_outputs']):
+                matched = self.matcher(aux_outputs, enc_targets)  # labels must agree with the (possibly single-class) logits
+                indices = matched['indices']
+                for loss in self.losses:
+                    meta = self.get_loss_meta_info(loss, aux_outputs, enc_targets, indices)
+                    l_dict = self.get_loss(loss, aux_outputs, enc_targets, indices, num_boxes, **meta)
+                    l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
+                    l_dict = {k + f'_enc_{i}': v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+            if class_agnostic:
+                self.num_classes = orig_num_classes
+
+        return losses
+
+    def get_loss_meta_info(self, loss, outputs, targets, indices):
+        """Return extra kwargs for loss `loss`: matched-pair IoU/GIoU as 'boxes_weight' ('boxes') or 'values' ('vfl')."""
+        if self.boxes_weight_format is None:
+            return {}
+
+        src_boxes = outputs['pred_boxes'][self._get_src_permutation_idx(indices)]
+        target_boxes = torch.cat([t['boxes'][j] for t, (_, j) in zip(targets, indices)], dim=0)
+
+        if self.boxes_weight_format == 'iou':
+            iou, _ = box_iou(box_cxcywh_to_xyxy(src_boxes.detach()), box_cxcywh_to_xyxy(target_boxes))
+            iou = torch.diag(iou)
+        elif self.boxes_weight_format == 'giou':
+            iou = torch.diag(generalized_box_iou(\
+                box_cxcywh_to_xyxy(src_boxes.detach()), box_cxcywh_to_xyxy(target_boxes)))
+        else:
+            raise AttributeError(f'unsupported boxes_weight_format: {self.boxes_weight_format!r}')  # same type, now with a message
+
+        if loss in ('boxes', ):
+            meta = {'boxes_weight': iou}
+        elif loss in ('vfl', ):
+            meta = {'values': iou}
+        else:
+            meta = {}  # other losses take no extra arguments
+        return meta
+
+    @staticmethod
+    def get_cdn_matched_indices(dn_meta, targets):
+        """Build per-image (denoising query indices, ground-truth indices) pairs from dn_meta;
+        each image's ground-truth indices 0..num_gt-1 are tiled once per denoising group."""
+        dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
+        num_gts = [len(t['labels']) for t in targets]
+        device = targets[0]['labels'].device
+        
+        dn_match_indices = []
+        for i, num_gt in enumerate(num_gts):
+            if num_gt > 0:
+                gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device)
+                gt_idx = gt_idx.tile(dn_num_group)
+                assert len(dn_positive_idx[i]) == len(gt_idx)  # one positive query per ground truth per group
+                dn_match_indices.append((dn_positive_idx[i], gt_idx))
+            else:  # image without ground truth: an empty pair keeps the list aligned with the batch
+                dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device), \
+                    torch.zeros(0, dtype=torch.int64,  device=device)))
+        
+        return dn_match_indices
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_decoder.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_decoder.py
new file mode 100644
index 0000000..e35a7c3
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_decoder.py
@@ -0,0 +1,609 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import math 
+import copy 
+import functools
+from collections import OrderedDict
+
+import torch 
+import torch.nn as nn 
+import torch.nn.functional as F 
+import torch.nn.init as init 
+from typing import List
+
+from .denoising import get_contrastive_denoising_training_group
+from .utils import deformable_attention_core_func_v2, get_activation, inverse_sigmoid
+from .utils import bias_init_with_prob
+
+from ...core import register
+
+__all__ = ['RTDETRTransformerv2']
+
+
+class MLP(nn.Module):  # stack of Linear layers with the activation after every layer but the last
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'):
+        super().__init__()
+        self.num_layers = num_layers
+        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        self.layers = nn.ModuleList(nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:]))
+        self.act = get_activation(act)
+
+    def forward(self, x):
+        for idx, layer in enumerate(self.layers):
+            x = layer(x) if idx >= self.num_layers - 1 else self.act(layer(x))
+        return x
+
+
+class MSDeformableAttention(nn.Module):
+    def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4, method='default', offset_scale=0.5):
+        """Multi-Scale Deformable Attention
+
+        Args:
+            embed_dim (int): channel size of query/value, must be divisible by num_heads
+            num_heads (int): number of attention heads
+            num_levels (int): number of value feature levels
+            num_points (int|List[int]): sampling points per level, shared by all levels or one entry per level
+            method (str): forwarded to deformable_attention_core_func_v2; 'discrete' also freezes sampling_offsets
+            offset_scale (float): multiplier of the box-relative offsets used with 4-d reference points
+        """
+        super(MSDeformableAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_levels = num_levels
+        self.offset_scale = offset_scale
+
+        if isinstance(num_points, list):
+            assert len(num_points) == num_levels, ''
+            num_points_list = num_points
+        else:
+            num_points_list = [num_points for _ in range(num_levels)]
+
+        self.num_points_list = num_points_list
+
+        # each sampling point of a level with n points gets the scale 1/n
+        num_points_scale = [1/n for n in num_points_list for _ in range(n)]
+        self.register_buffer('num_points_scale', torch.tensor(num_points_scale, dtype=torch.float32))
+
+        self.total_points = num_heads * sum(num_points_list)
+        self.method = method
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2)
+        self.attention_weights = nn.Linear(embed_dim, self.total_points)
+        self.value_proj = nn.Linear(embed_dim, embed_dim)
+        self.output_proj = nn.Linear(embed_dim, embed_dim)
+
+        self.ms_deformable_attn_core = functools.partial(deformable_attention_core_func_v2, method=self.method)
+
+        self._reset_parameters()
+
+        if method == 'discrete':
+            for p in self.sampling_offsets.parameters():
+                p.requires_grad = False
+
+    def _reset_parameters(self):
+        """Point the offset bias of head i along angle 2*pi*i/num_heads and zero the attention logits."""
+        # sampling_offsets
+        init.constant_(self.sampling_offsets.weight, 0)
+        thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values
+        grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1])
+        # the k-th point of every level is pushed k steps along the head direction
+        scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1)
+        grid_init *= scaling
+        self.sampling_offsets.bias.data[...] = grid_init.flatten()
+
+        # attention_weights
+        init.constant_(self.attention_weights.weight, 0)
+        init.constant_(self.attention_weights.bias, 0)
+
+        # proj
+        init.xavier_uniform_(self.value_proj.weight)
+        init.constant_(self.value_proj.bias, 0)
+        init.xavier_uniform_(self.output_proj.weight)
+        init.constant_(self.output_proj.bias, 0)
+
+    def forward(self,
+                query: torch.Tensor,
+                reference_points: torch.Tensor,
+                value: torch.Tensor,
+                value_spatial_shapes: List[List[int]],
+                value_mask: torch.Tensor=None):
+        """
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2] points, or [bs, query_length, 1, 4]
+                (cx, cy, w, h) boxes; range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+        num_points = sum(self.num_points_list)
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            value = value * value_mask.to(value.dtype).unsqueeze(-1)
+        value = value.reshape(bs, Len_v, self.num_heads, self.head_dim)
+
+        sampling_offsets: torch.Tensor = self.sampling_offsets(query)
+        sampling_offsets = sampling_offsets.reshape(bs, Len_q, self.num_heads, num_points, 2)
+
+        attention_weights = self.attention_weights(query).reshape(bs, Len_q, self.num_heads, num_points)
+        attention_weights = F.softmax(attention_weights, dim=-1)
+
+        if reference_points.shape[-1] == 2:
+            # one (W, H) normalizer row and one reference point per sampling point, on the query's device
+            repeats = torch.tensor(self.num_points_list, device=query.device)
+            offset_normalizer = torch.tensor(value_spatial_shapes, dtype=query.dtype, device=query.device).flip([1])
+            offset_normalizer = offset_normalizer.repeat_interleave(repeats, dim=0)
+            ref_points = reference_points.repeat_interleave(repeats, dim=2).unsqueeze(2)
+            sampling_locations = ref_points + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            # offsets are relative to the box: scaled by 1/n_points, the box (w, h) and offset_scale
+            num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
+            offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale
+            sampling_locations = reference_points[:, :, None, :, :2] + offset
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1]))
+
+        output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights, self.num_points_list)
+        return self.output_proj(output)
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Post-norm decoder layer: self-attention, multi-scale deformable cross-attention, FFN."""
+    def __init__(self, d_model=256, n_head=8, dim_feedforward=1024, dropout=0., activation='relu',
+                 n_levels=4, n_points=4, cross_attn_method='default'):
+        super().__init__()
+
+        # --- self attention over the queries ---
+        self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        # --- deformable cross attention into the memory ---
+        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points, method=cross_attn_method)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        # --- feed-forward network ---
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.activation = get_activation(activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Xavier-initialise the weights of the two FFN projections."""
+        for ffn_linear in (self.linear1, self.linear2):
+            init.xavier_uniform_(ffn_linear.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward_ffn(self, tgt):
+        """Apply linear1 -> activation -> dropout3 -> linear2."""
+        hidden = self.dropout3(self.activation(self.linear1(tgt)))
+        return self.linear2(hidden)
+
+    def forward(self,
+                target,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                attn_mask=None,
+                memory_mask=None,
+                query_pos_embed=None):
+        """Run one decoder layer and return the updated queries.
+
+        Args:
+            target: query features; also the value input of the self-attention
+            reference_points: reference points/boxes forwarded to the cross-attention
+            memory: features sampled by the cross-attention
+            memory_spatial_shapes: spatial shapes of ``memory``, forwarded to the cross-attention
+            attn_mask: optional mask of the query self-attention
+            memory_mask: optional mask forwarded to the cross-attention as value mask
+            query_pos_embed: optional positional embedding added to the attention queries
+        """
+        # self attention: the positional embedding goes into q/k only, the values stay position-free
+        query = self.with_pos_embed(target, query_pos_embed)
+        attn_out, _ = self.self_attn(query, query, value=target, attn_mask=attn_mask)
+        target = self.norm1(target + self.dropout1(attn_out))
+
+        # cross attention
+        cross_query = self.with_pos_embed(target, query_pos_embed)
+        cross_out = self.cross_attn(cross_query, reference_points, memory, memory_spatial_shapes, memory_mask)
+        target = self.norm2(target + self.dropout2(cross_out))
+
+        # ffn
+        return self.norm3(target + self.dropout4(self.forward_ffn(target)))
+
+
+class TransformerDecoder(nn.Module):
+    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+        """Stack ``num_layers`` deep copies of ``decoder_layer``; a negative ``eval_idx`` counts from the end."""
+        super(TransformerDecoder, self).__init__()
+        self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)])
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
+
+    def forward(self, target, ref_points_unact, memory, memory_spatial_shapes,
+                bbox_head, score_head, query_pos_head, attn_mask=None, memory_mask=None):
+        """Refine the reference boxes layer by layer with ``bbox_head[i]`` / ``score_head[i]``.
+
+        Training keeps the prediction of every layer; eval keeps only layer ``eval_idx`` and stops there.
+        Returns (boxes, logits), each stacked over the kept layers along a new first dim.
+        """
+        dec_out_bboxes = []
+        dec_out_logits = []
+        ref_points_detach = F.sigmoid(ref_points_unact)
+
+        output = target
+        for i, layer in enumerate(self.layers):
+            ref_points_input = ref_points_detach.unsqueeze(2)
+            query_pos_embed = query_pos_head(ref_points_detach)
+
+            output = layer(output, ref_points_input, memory, memory_spatial_shapes, attn_mask, memory_mask, query_pos_embed)
+
+            # run the box head once per layer and reuse the delta for both refinements below
+            bbox_delta = bbox_head[i](output)
+            inter_ref_bbox = F.sigmoid(bbox_delta + inverse_sigmoid(ref_points_detach))
+
+            if self.training:
+                dec_out_logits.append(score_head[i](output))
+                if i == 0:
+                    dec_out_bboxes.append(inter_ref_bbox)
+                else:
+                    # same delta on the non-detached previous box, so the loss also reaches earlier layers
+                    dec_out_bboxes.append(F.sigmoid(bbox_delta + inverse_sigmoid(ref_points)))
+            elif i == self.eval_idx:
+                dec_out_logits.append(score_head[i](output))
+                dec_out_bboxes.append(inter_ref_bbox)
+                break
+
+            ref_points = inter_ref_bbox
+            ref_points_detach = inter_ref_bbox.detach()
+
+        return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits)
+
+
+@register()
+class RTDETRTransformerv2(nn.Module):
+    """RT-DETRv2 decoder head: picks the top-scoring encoder tokens as queries and refines them layer by layer."""
+    __share__ = ['num_classes', 'eval_spatial_size']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 num_queries=300,
+                 feat_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 num_levels=3,
+                 num_points=4,
+                 nhead=8,
+                 num_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 num_denoising=100,
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learn_query_content=False,
+                 eval_spatial_size=None,
+                 eval_idx=-1,
+                 eps=1e-2, 
+                 aux_loss=True, 
+                 cross_attn_method='default', 
+                 query_select_method='default'):
+        super().__init__()
+        assert len(feat_channels) <= num_levels
+        assert len(feat_strides) == len(feat_channels)
+        # copy before extending: appending to the argument would mutate the shared default / the caller's list
+        feat_strides = list(feat_strides)
+        for _ in range(num_levels - len(feat_strides)):
+            feat_strides.append(feat_strides[-1] * 2)
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.feat_strides = feat_strides
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_layers = num_layers
+        self.eval_spatial_size = eval_spatial_size
+        self.aux_loss = aux_loss
+
+        assert query_select_method in ('default', 'one2many', 'agnostic'), ''
+        assert cross_attn_method in ('default', 'discrete'), ''
+        self.cross_attn_method = cross_attn_method
+        self.query_select_method = query_select_method
+
+        # backbone feature projection
+        self._build_input_proj_layer(feat_channels)
+
+        # Transformer module
+        decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, \
+            activation, num_levels, num_points, cross_attn_method=cross_attn_method)
+        self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_layers, eval_idx)
+
+        # denoising
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+        if num_denoising > 0: 
+            self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes)
+            init.normal_(self.denoising_class_embed.weight[:-1])
+
+        # decoder embedding
+        self.learn_query_content = learn_query_content
+        if learn_query_content:
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2)
+
+        self.enc_output = nn.Sequential(OrderedDict([
+            ('proj', nn.Linear(hidden_dim, hidden_dim)),
+            ('norm', nn.LayerNorm(hidden_dim,)),
+        ]))
+
+        if query_select_method == 'agnostic':
+            self.enc_score_head = nn.Linear(hidden_dim, 1)
+        else:
+            self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3)
+
+        # decoder head
+        self.dec_score_head = nn.ModuleList([
+            nn.Linear(hidden_dim, num_classes) for _ in range(num_layers)
+        ])
+        self.dec_bbox_head = nn.ModuleList([
+            MLP(hidden_dim, hidden_dim, 4, 3) for _ in range(num_layers)
+        ])
+
+        # init encoder output anchors and valid_mask
+        if self.eval_spatial_size:
+            anchors, valid_mask = self._generate_anchors()
+            self.register_buffer('anchors', anchors)
+            self.register_buffer('valid_mask', valid_mask)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Bias the score heads to a 0.01 prior, zero the last bbox layers and Xavier-init the projections."""
+        bias = bias_init_with_prob(0.01)
+        init.constant_(self.enc_score_head.bias, bias)
+        init.constant_(self.enc_bbox_head.layers[-1].weight, 0)
+        init.constant_(self.enc_bbox_head.layers[-1].bias, 0)
+
+        for _cls, _reg in zip(self.dec_score_head, self.dec_bbox_head):
+            init.constant_(_cls.bias, bias)
+            init.constant_(_reg.layers[-1].weight, 0)
+            init.constant_(_reg.layers[-1].bias, 0)
+        
+        init.xavier_uniform_(self.enc_output[0].weight)
+        if self.learn_query_content:
+            init.xavier_uniform_(self.tgt_embed.weight)
+        init.xavier_uniform_(self.query_pos_head.layers[0].weight)
+        init.xavier_uniform_(self.query_pos_head.layers[1].weight)
+        for m in self.input_proj:
+            init.xavier_uniform_(m[0].weight)
+
+    def _build_input_proj_layer(self, feat_channels):
+        """Create a 1x1 conv + BN per input feature, plus a stride-2 3x3 conv + BN for every extra level."""
+        self.input_proj = nn.ModuleList()
+        for in_channels in feat_channels:
+            self.input_proj.append(
+                nn.Sequential(OrderedDict([
+                    ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), 
+                    ('norm', nn.BatchNorm2d(self.hidden_dim,))])
+                )
+            )
+
+        in_channels = feat_channels[-1]
+
+        for _ in range(self.num_levels - len(feat_channels)):
+            self.input_proj.append(
+                nn.Sequential(OrderedDict([
+                    ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)),
+                    ('norm', nn.BatchNorm2d(self.hidden_dim))])
+                )
+            )
+            in_channels = self.hidden_dim
+
+    def _get_encoder_input(self, feats: List[torch.Tensor]):
+        """Project ``feats`` to hidden_dim (adding extra levels if needed) and flatten them to [b, sum(h*w), c]."""
+        # get projection features
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        if self.num_levels > len(proj_feats):
+            len_srcs = len(proj_feats)
+            for i in range(len_srcs, self.num_levels):
+                if i == len_srcs:
+                    proj_feats.append(self.input_proj[i](feats[-1]))
+                else:
+                    proj_feats.append(self.input_proj[i](proj_feats[-1]))
+
+        # get encoder inputs
+        feat_flatten = []
+        spatial_shapes = []
+        for i, feat in enumerate(proj_feats):
+            _, _, h, w = feat.shape
+            # [b, c, h, w] -> [b, h*w, c]
+            feat_flatten.append(feat.flatten(2).permute(0, 2, 1))
+            # [num_levels, 2]
+            spatial_shapes.append([h, w])
+        # [b, l, c]
+        feat_flatten = torch.concat(feat_flatten, 1)
+        return feat_flatten, spatial_shapes
+
+    def _generate_anchors(self,
+                          spatial_shapes=None,
+                          grid_size=0.05,
+                          dtype=torch.float32,
+                          device='cpu'):
+        """Return logit-space anchors [1, sum(h*w), 4] and their validity mask; invalid anchors become inf."""
+        if spatial_shapes is None:
+            spatial_shapes = []
+            eval_h, eval_w = self.eval_spatial_size
+            for s in self.feat_strides:
+                spatial_shapes.append([int(eval_h / s), int(eval_w / s)])
+
+        anchors = []
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
+            grid_xy = torch.stack([grid_x, grid_y], dim=-1)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
+            wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl)
+            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
+            anchors.append(lvl_anchors)
+
+        anchors = torch.concat(anchors, dim=1).to(device)
+        valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True)
+        anchors = torch.log(anchors / (1 - anchors))
+        anchors = torch.where(valid_mask, anchors, torch.inf)
+
+        return anchors, valid_mask
+
+    def _get_decoder_input(self,
+                           memory: torch.Tensor,
+                           spatial_shapes,
+                           denoising_logits=None,
+                           denoising_bbox_unact=None):
+        """Pick the top-``num_queries`` encoder tokens as initial queries/boxes and prepend the denoising queries."""
+
+        # prepare input for decoder
+        if self.training or self.eval_spatial_size is None:
+            anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device)
+        else:
+            anchors = self.anchors
+            valid_mask = self.valid_mask
+
+        # memory = torch.where(valid_mask, memory, 0)
+        # TODO fix type error for onnx export 
+        memory = valid_mask.to(memory.dtype) * memory  
+
+        output_memory :torch.Tensor = self.enc_output(memory)
+        enc_outputs_logits :torch.Tensor = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact :torch.Tensor = self.enc_bbox_head(output_memory) + anchors
+
+        enc_topk_bboxes_list, enc_topk_logits_list = [], []
+        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = \
+            self._select_topk(output_memory, enc_outputs_logits, enc_outputs_coord_unact, self.num_queries)
+            
+        if self.training:
+            enc_topk_bboxes = F.sigmoid(enc_topk_bbox_unact)
+            enc_topk_bboxes_list.append(enc_topk_bboxes)
+            enc_topk_logits_list.append(enc_topk_logits)
+
+        if self.learn_query_content:
+            content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1])
+        else:
+            content = enc_topk_memory.detach()
+            
+        enc_topk_bbox_unact = enc_topk_bbox_unact.detach()
+        
+        if denoising_bbox_unact is not None:
+            enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1)
+            content = torch.concat([denoising_logits, content], dim=1)
+        
+        return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list
+
+    def _select_topk(self, memory: torch.Tensor, outputs_logits: torch.Tensor, outputs_coords_unact: torch.Tensor, topk: int):
+        """Gather the ``topk`` best-scoring tokens (memory, logits, unactivated boxes) per ``query_select_method``."""
+        if self.query_select_method == 'default':
+            _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1)
+
+        elif self.query_select_method == 'one2many':
+            _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1)
+            topk_ind = topk_ind // self.num_classes
+
+        elif self.query_select_method == 'agnostic':
+            _, topk_ind = torch.topk(outputs_logits.squeeze(-1), topk, dim=-1)
+        
+        topk_ind: torch.Tensor
+
+        topk_coords = outputs_coords_unact.gather(dim=1, \
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1]))
+        
+        topk_logits = outputs_logits.gather(dim=1, \
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1]))
+        
+        topk_memory = memory.gather(dim=1, \
+            index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1]))
+
+        return topk_memory, topk_logits, topk_coords
+
+    def forward(self, feats, targets=None):
+        """Decode detections from multi-scale ``feats``; ``targets`` only feed the denoising queries in training.
+        Returns a dict with the last layer's 'pred_logits'/'pred_boxes', plus auxiliary outputs while training."""
+        # input projection and embedding
+        memory, spatial_shapes = self._get_encoder_input(feats)
+        
+        # prepare denoising training
+        if self.training and self.num_denoising > 0:
+            denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(targets, \
+                    self.num_classes, 
+                    self.num_queries, 
+                    self.denoising_class_embed, 
+                    num_denoising=self.num_denoising, 
+                    label_noise_ratio=self.label_noise_ratio, 
+                    box_noise_scale=self.box_noise_scale, )
+        else:
+            denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
+
+        init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, enc_topk_logits_list = \
+            self._get_decoder_input(memory, spatial_shapes, denoising_logits, denoising_bbox_unact)
+
+        # decoder
+        out_bboxes, out_logits = self.decoder(
+            init_ref_contents,
+            init_ref_points_unact,
+            memory,
+            spatial_shapes,
+            self.dec_bbox_head,
+            self.dec_score_head,
+            self.query_pos_head,
+            attn_mask=attn_mask)
+
+        if self.training and dn_meta is not None:
+            dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2)
+            dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2)
+
+        out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]}
+
+        if self.training and self.aux_loss:
+            out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1])
+            out['enc_aux_outputs'] = self._set_aux_loss(enc_topk_logits_list, enc_topk_bboxes_list)
+            out['enc_meta'] = {'class_agnostic': self.query_select_method == 'agnostic'}
+
+            if dn_meta is not None:
+                out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes)
+                out['dn_meta'] = dn_meta
+
+        return out
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{'pred_logits': a, 'pred_boxes': b}
+                for a, b in zip(outputs_class, outputs_coord)]
diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/utils.py b/rtdetrv2_pytorch/src/zoo/rtdetr/utils.py
new file mode 100644
index 0000000..1601dd3
--- /dev/null
+++ b/rtdetrv2_pytorch/src/zoo/rtdetr/utils.py
@@ -0,0 +1,172 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import math
+from typing import List
+
+import torch 
+import torch.nn as nn
+import torch.nn.functional as F 
+
+
+def inverse_sigmoid(x: torch.Tensor, eps: float=1e-5) -> torch.Tensor:
+    prob = x.clamp(min=0., max=1.)  # logit of x; numerator and denominator are floored at eps
+    return torch.log(prob.clamp(min=eps) / (1 - prob).clamp(min=eps))
+
+
+def bias_init_with_prob(prior_prob=0.01):
+    """Return the conv/fc bias value -log((1 - prior_prob) / prior_prob) for the given probability."""
+    odds_against = (1 - prior_prob) / prior_prob
+    return float(-math.log(odds_against))
+
+
+def deformable_attention_core_func(value, value_spatial_shapes, sampling_locations, attention_weights):
+    """Multi-scale deformable attention built on bilinear ``F.grid_sample``.
+
+    Args:
+        value (Tensor): [bs, value_length, n_head, c]
+        value_spatial_shapes (Tensor|List): [n_levels, 2], (h, w) of every level
+        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
+        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]
+
+    Returns:
+        output (Tensor): [bs, query_length, n_head * c]
+    """
+    bs, _, n_head, c = value.shape
+    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
+
+    # rescale the locations as 2 * x - 1 before they are handed to grid_sample
+    grids = 2 * sampling_locations - 1
+    per_level_values = value.split([h * w for h, w in value_spatial_shapes], dim=1)
+
+    sampled = []
+    for level, (h, w) in enumerate(value_spatial_shapes):
+        # [bs, h*w, n_head, c] -> [bs, h*w, n_head*c] -> [bs, n_head*c, h*w] -> [bs*n_head, c, h, w]
+        level_value = per_level_values[level].flatten(2).permute(0, 2, 1)
+        level_value = level_value.reshape(bs * n_head, c, h, w)
+        # [bs, Len_q, n_head, n_points, 2] -> [bs, n_head, Len_q, n_points, 2] -> [bs*n_head, Len_q, n_points, 2]
+        level_grid = grids[:, :, :, level].permute(0, 2, 1, 3, 4).flatten(0, 1)
+        # sampled values: [bs*n_head, c, Len_q, n_points]
+        sampled.append(
+            F.grid_sample(
+                level_value,
+                level_grid,
+                mode='bilinear',
+                padding_mode='zeros',
+                align_corners=False))
+
+    # [bs, Len_q, n_head, n_levels, n_points] -> [bs, n_head, Len_q, ...] -> [bs*n_head, 1, Len_q, n_levels*n_points]
+    weights = attention_weights.permute(0, 2, 1, 3, 4)
+    weights = weights.reshape(bs * n_head, 1, Len_q, n_levels * n_points)
+    stacked = torch.stack(sampled, dim=-2).flatten(-2)
+    output = (stacked * weights).sum(-1).reshape(bs, n_head * c, Len_q)
+
+    return output.permute(0, 2, 1)
+
+
+
+def deformable_attention_core_func_v2(\
+    value: torch.Tensor, 
+    value_spatial_shapes,
+    sampling_locations: torch.Tensor, 
+    attention_weights: torch.Tensor, 
+    num_points_list: List[int], 
+    method='default'):
+    """Multi-scale deformable attention with a per-level number of sampling points.
+
+    Args:
+        value (Tensor): [bs, value_length, n_head, c]
+        value_spatial_shapes (Tensor|List): [n_levels, 2], (h, w) of every level
+        sampling_locations (Tensor): [bs, query_length, n_head, sum(num_points_list), 2]
+        attention_weights (Tensor): [bs, query_length, n_head, sum(num_points_list)]
+        num_points_list (List[int]): number of sampling points of every level
+        method (str): 'default' samples bilinearly with grid_sample, 'discrete' reads the rounded pixel
+
+    Returns:
+        output (Tensor): [bs, query_length, n_head * c]
+
+    Raises:
+        ValueError: if ``method`` is neither 'default' nor 'discrete'
+    """
+    if method not in ('default', 'discrete'):
+        raise ValueError(f"method must be 'default' or 'discrete', but got {method!r}")
+
+    bs, _, n_head, c = value.shape
+    _, Len_q, _, _, _ = sampling_locations.shape
+
+    split_shape = [h * w for h, w in value_spatial_shapes]
+    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
+
+    # grid_sample gets the locations rescaled as 2 * x - 1; the discrete path keeps them as they are
+    sampling_grids = 2 * sampling_locations - 1 if method == 'default' else sampling_locations
+    sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1)
+    sampling_locations_list = sampling_grids.split(num_points_list, dim=-2)
+
+    sampling_value_list = []
+    for level, (h, w) in enumerate(value_spatial_shapes):
+        value_l = value_list[level].reshape(bs * n_head, c, h, w)
+        sampling_grid_l: torch.Tensor = sampling_locations_list[level]
+
+        if method == 'default':
+            sampling_value_l = F.grid_sample(
+                value_l, 
+                sampling_grid_l, 
+                mode='bilinear', 
+                padding_mode='zeros', 
+                align_corners=False)
+        else:
+            # integer pixel coordinates (x, y) of every sampling point
+            sampling_coord = (sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5).to(torch.int64)
+            sampling_coord = sampling_coord.reshape(bs * n_head, Len_q * num_points_list[level], 2)
+            # clamp x by the width and y by the height so that rectangular maps are indexed correctly
+            coord_x = sampling_coord[..., 0].clamp(0, w - 1)
+            coord_y = sampling_coord[..., 1].clamp(0, h - 1)
+
+            s_idx = torch.arange(sampling_coord.shape[0], device=value.device).unsqueeze(-1).repeat(1, sampling_coord.shape[1])
+            sampling_value_l: torch.Tensor = value_l[s_idx, :, coord_y, coord_x] # n l c
+            sampling_value_l = sampling_value_l.permute(0, 2, 1).reshape(bs * n_head, c, Len_q, num_points_list[level])
+
+        sampling_value_list.append(sampling_value_l)
+
+    attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(bs * n_head, 1, Len_q, sum(num_points_list))
+    weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
+    output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)
+    return output.permute(0, 2, 1)
+
+
+def get_activation(act: str, inpace: bool=True):
+    """Build an activation module from its name.
+
+    Args:
+        act (str|nn.Module|None): case-insensitive name ('silu'/'swish', 'relu', 'leaky_relu', 'gelu',
+            'hardsigmoid'); an ``nn.Module`` is returned unchanged and ``None`` gives ``nn.Identity``.
+        inpace (bool): value assigned to the ``inplace`` attribute of modules that have one
+            (the misspelled name is kept for backward compatibility).
+
+    Returns:
+        nn.Module: the activation module.
+
+    Raises:
+        RuntimeError: if ``act`` is not a supported name.
+    """
+    if act is None:
+        return nn.Identity()
+    if isinstance(act, nn.Module):
+        return act
+
+    factories = {
+        'silu': nn.SiLU,
+        'swish': nn.SiLU,
+        'relu': nn.ReLU,
+        'leaky_relu': nn.LeakyReLU,
+        'gelu': nn.GELU,
+        'hardsigmoid': nn.Hardsigmoid,
+    }
+    name = act.lower()
+    if name not in factories:
+        raise RuntimeError(f'Unsupported activation {act!r}, expected one of {sorted(factories)}')
+
+    m = factories[name]()
+    if hasattr(m, 'inplace'):
+        m.inplace = inpace
+    return m
diff --git a/rtdetrv2_pytorch/tools/README.md b/rtdetrv2_pytorch/tools/README.md
new file mode 100644
index 0000000..57d594a
--- /dev/null
+++ b/rtdetrv2_pytorch/tools/README.md
@@ -0,0 +1,124 @@
+### Getting Started: A Complete Workflow
+
+This guide provides a complete, step-by-step workflow from setting up the environment to training, exporting, and running inference with TensorRT.
+
+#### **1. Environment Setup with Docker (Recommended)**
+
+Using Docker is the recommended way to ensure all dependencies, drivers, and CUDA versions are perfectly aligned. This eliminates "it works on my machine" issues.
+
+*   **Step 1.1: Build and Run the Container**
+
+    From the project's root directory, run `docker compose`. This will build the image based on the `Dockerfile` and start the service in the background.
+
+    ```bash
+    docker compose up --build -d
+    ```
+
+*   **Step 1.2: Verify the Container is Running**
+
+    Check that the container is up and running. Note its name for the next step.
+    ```bash
+    docker ps
+    ```
+
+---
+
+#### **2. Training & Evaluation (Using `docker attach`)**
+
+This method directly attaches your terminal to the container's main process. It's simple but requires careful handling to avoid terminating your session.
+
+*   **Step 2.1: Attach to the Container**
+
+    Attach your terminal to the running container. You will be dropped into a bash shell.
+
+    ```bash
+    docker attach <your_container_name>
+    ```
+
+*   **Step 2.2: Run the Training Command**
+
+    Now, *inside the attached shell*, run your training command. `torchrun` will automatically use the GPUs assigned to the container. **Do not run it in the background (`&`)**.
+
+    ```bash
+    # Example for 4 GPUs assigned to the container
+    torchrun --nproc_per_node=4 --master-port=8989 \
+        tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+        --use-amp
+    ```
+
+*   **Step 2.3: Detach from the Session (IMPORTANT!)**
+
+    With your training running, you can safely detach and leave it running.
+
+    **WARNING:** **DO NOT PRESS `Ctrl+C`**. This will kill the training process and potentially the entire container.
+
+    To safely detach, press the sequence: **`Ctrl+P`**, followed immediately by **`Ctrl+Q`**.
+
+    You will return to your local terminal, and the container will continue running the training in the background.
+
+*   **Step 2.4: Re-attach to Your Session**
+
+    To check on your training progress, simply run the `docker attach` command again. You will see the live output from your training command.
+
+    ```bash
+    docker attach <your_container_name>
+    ```
+    (Remember to detach with `Ctrl+P`, `Ctrl+Q` when you're done.)
+
+---
+
+#### **3. Exporting & Inference**
+
+For tasks like exporting or running inference, which don't need to run for days, it's safer to use `docker exec` to open a new, separate shell.
+
+*   **Step 3.1: Open a New Shell in the Container**
+    ```bash
+    docker exec -it <your_container_name> bash
+    ```
+
+*   **Step 3.2: Run Export or Inference Commands**
+    Now, inside this new shell, run your commands.
+    ```bash
+    # Export to ONNX
+    python tools/export_onnx.py \
+        -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \
+        -r path/to/trained_checkpoint.pth \
+        --check
+    ```
+    
+    ```bash
+    # Convert to TensorRT
+    bash tools/onnx2trt.sh /path/to/your/model.onnx
+    ```
+
+    ```bash
+    # RUN TRT Inference
+    python references/deploy/rtdetrv2_tensorrt.py \
+    --engine /path/to/your/model.trt \
+    --image /path/to/your/image.jpg \
+    --output /path/to/save/output.jpg \
+    --threshold 0.5
+    ```
+
+### Utilities & Tips
+
+*   **Visualize training with TensorBoard:**
+    *   Use the standard port `6006` to avoid conflicts with training.
+    *   Ensure the port `6006` is exposed in your `docker-compose.yml`.
+
+    ```bash
+    # Inside the container
+    tensorboard --logdir=path/to/summary/ --host=0.0.0.0 --port=6006
+    ```
+
+*   **Managing the Container Lifecycle:**
+    *   **To temporarily stop** the container without deleting it (e.g., to pause training and resume later):
+        ```bash
+        docker compose stop
+        ```
+        You can restart it later with `docker compose start`.
+
+    *   **To stop and remove** the container and its network (add `-v` to also delete volumes):
+        ```bash
+        docker compose down
+        ```
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/tools/export_onnx.py b/rtdetrv2_pytorch/tools/export_onnx.py
new file mode 100644
index 0000000..1586319
--- /dev/null
+++ b/rtdetrv2_pytorch/tools/export_onnx.py
@@ -0,0 +1,100 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import os
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
+import torch
+import torch.nn as nn
+
+from src.core import YAMLConfig, yaml_utils
+
+
+def main(args, ):
+    """Export the configured model (plus postprocessor) to an ONNX file.
+
+    Builds the config from ``args.config`` and CLI overrides, optionally loads
+    weights from ``args.resume``, exports to ``args.output_file`` and then
+    optionally runs the ONNX checker and onnxsim on the result.
+
+    Args:
+        args: Parsed CLI namespace (config, resume, output_file, input_size,
+            check, simplify, update).
+    """
+    overrides = yaml_utils.parse_cli(args.update) if args.update else {}
+    for key, value in args.__dict__.items():
+        if key not in ['update', ] and value is not None:
+            overrides[key] = value
+    cfg = YAMLConfig(args.config, **overrides)
+
+    if not args.resume:
+        # No checkpoint given: export whatever weights the config initialised.
+        print('not load model.state_dict, use default init state dict...')
+    else:
+        ckpt = torch.load(args.resume, map_location='cpu')
+        # Prefer the EMA weights when the checkpoint carries them.
+        weights = ckpt['ema']['module'] if 'ema' in ckpt else ckpt['model']
+        # State is loaded into the train-mode model; deploy() below converts it.
+        cfg.model.load_state_dict(weights)
+
+    class Model(nn.Module):
+        """Deploy-mode model chained with its deploy-mode postprocessor."""
+        def __init__(self, ) -> None:
+            super().__init__()
+            self.model = cfg.model.deploy()
+            self.postprocessor = cfg.postprocessor.deploy()
+
+        def forward(self, images, orig_target_sizes):
+            return self.postprocessor(self.model(images), orig_target_sizes)
+
+    model = Model()
+
+    side = args.input_size
+    data = torch.rand(1, 3, side, side)
+    size = torch.tensor([[side, side]])
+    _ = model(data, size)
+
+    torch.onnx.export(
+        model,
+        (data, size),
+        args.output_file,
+        input_names=['images', 'orig_target_sizes'],
+        output_names=['labels', 'boxes', 'scores'],
+        # Only the batch dimension of each input is dynamic.
+        dynamic_axes={'images': {0: 'N'}, 'orig_target_sizes': {0: 'N'}},
+        opset_version=16,
+        verbose=False,
+        do_constant_folding=True,
+    )
+
+    if args.check:
+        import onnx
+        onnx.checker.check_model(onnx.load(args.output_file))
+        print('Check export onnx model done...')
+
+    if args.simplify:
+        import onnx
+        import onnxsim
+        # Simplify using the example input shapes while keeping inputs dynamic.
+        example_shapes = {'images': data.shape, 'orig_target_sizes': size.shape}
+        simplified, ok = onnxsim.simplify(
+            args.output_file, input_shapes=example_shapes, dynamic_input_shape=True)
+        onnx.save(simplified, args.output_file)
+        print(f'Simplify onnx model {ok}...')
+
+
+if __name__ == '__main__':
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    # Model/config selection and output location.
+    cli.add_argument('--config', '-c', type=str)
+    cli.add_argument('--resume', '-r', type=str)
+    cli.add_argument('--output_file', '-o', type=str, default='model.onnx')
+    cli.add_argument('--input_size', '-s', type=int, default=640)
+    # Optional post-export steps (both flags default to False).
+    cli.add_argument('--check', action='store_true')
+    cli.add_argument('--simplify', action='store_true')
+    cli.add_argument('--update', '-u', nargs='+', help='update yaml config')
+    main(cli.parse_args())
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/tools/export_trt.py b/rtdetrv2_pytorch/tools/export_trt.py
new file mode 100644
index 0000000..facad52
--- /dev/null
+++ b/rtdetrv2_pytorch/tools/export_trt.py
@@ -0,0 +1,81 @@
+import os
+import argparse
+import tensorrt as trt
+
+def main(onnx_path, engine_path, max_batchsize, opt_batchsize, min_batchsize, use_fp16=True, verbose=False)->None:
+    """ Convert ONNX model to TensorRT engine.
+    Args:
+        onnx_path (str): Path to the input ONNX model.
+        engine_path (str): Path to save the output TensorRT engine.
+        max_batchsize (int): Largest batch size in the optimization profile.
+        opt_batchsize (int): Batch size the optimization profile is tuned for.
+        min_batchsize (int): Smallest batch size in the optimization profile.
+        use_fp16 (bool): Whether to use FP16 precision.
+        verbose (bool): Whether to enable verbose logging.
+    Raises:
+        FileNotFoundError: If ``onnx_path`` does not exist.
+        RuntimeError: If ONNX parsing or the engine build fails.
+    """
+    if not os.path.isfile(onnx_path):
+        raise FileNotFoundError(f"ONNX file not found: {onnx_path}")
+    logger = trt.Logger(trt.Logger.VERBOSE if verbose else trt.Logger.INFO)
+    builder = trt.Builder(logger)
+    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    network = builder.create_network(network_flags)
+    parser = trt.OnnxParser(network, logger)
+    print(f"[INFO] Loading ONNX file from {onnx_path}")
+    with open(onnx_path, "rb") as f:
+        if not parser.parse(f.read()):
+            for error in range(parser.num_errors):
+                print(parser.get_error(error))
+            raise RuntimeError("Failed to parse ONNX file")
+
+    config = builder.create_builder_config()
+    config.set_preview_feature(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805, True)
+    config.max_workspace_size = 1 << 30  # 1GB
+
+    if use_fp16:
+        if builder.platform_has_fast_fp16:
+            config.set_flag(trt.BuilderFlag.FP16)
+            print("[INFO] FP16 optimization enabled.")
+        else:
+            print("[WARNING] FP16 not supported on this platform. Proceeding with FP32.")
+
+    profile = builder.create_optimization_profile()
+    profile.set_shape("images", min=(min_batchsize, 3, 640, 640), opt=(opt_batchsize, 3, 640, 640), max=(max_batchsize, 3, 640, 640))
+    # The sizes input is batched like images; min stays (1, 2) so existing callers still fit.
+    profile.set_shape("orig_target_sizes", min=(1, 2), opt=(opt_batchsize, 2), max=(max_batchsize, 2))
+    config.add_optimization_profile(profile)
+
+    print("[INFO] Building TensorRT engine...")
+    engine = builder.build_engine(network, config)
+    if engine is None:
+        raise RuntimeError("Failed to build the engine.")
+
+    print(f"[INFO] Saving engine to {engine_path}")
+    with open(engine_path, "wb") as f:
+        f.write(engine.serialize())
+    print("[INFO] Engine export complete.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert ONNX to TensorRT Engine")
+    parser.add_argument("--onnx", "-i", type=str, required=True, help="Path to input ONNX model file")
+    parser.add_argument("--saveEngine", "-o", type=str, default="model.engine", help="Path to output TensorRT engine file")
+    parser.add_argument("--maxBatchSize", "-Mb", type=int, default=32, help="Maximum batch size for inference")
+    parser.add_argument("--optBatchSize", "-ob", type=int, default=16, help="Optimal batch size for inference")
+    parser.add_argument("--minBatchSize", "-mb", type=int, default=1, help="Minimum batch size for inference")
+    parser.add_argument("--fp16", dest="fp16", default=True, action="store_true", help="Enable FP16 precision mode (default)")
+    parser.add_argument("--no-fp16", dest="fp16", action="store_false", help="Disable FP16 and build in FP32")
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+    args = parser.parse_args()
+
+    main(
+        onnx_path=args.onnx,
+        engine_path=args.saveEngine,
+        max_batchsize=args.maxBatchSize,
+        opt_batchsize=args.optBatchSize,
+        min_batchsize=args.minBatchSize,
+        use_fp16=args.fp16,
+        verbose=args.verbose
+    )
diff --git a/rtdetrv2_pytorch/tools/onnx2trt.sh b/rtdetrv2_pytorch/tools/onnx2trt.sh
new file mode 100644
index 0000000..6f43595
--- /dev/null
+++ b/rtdetrv2_pytorch/tools/onnx2trt.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Convert an ONNX model to a TensorRT engine using trtexec.
+# The engine is written next to the input, with .onnx replaced by .trt.
+
+set -e  # Exit immediately if a command exits with a non-zero status.
+
+# Check if an input file is provided; diagnostics go to stderr.
+if [ -z "$1" ]; then
+    echo "Error: No ONNX file provided." >&2
+    echo "Usage: $0 /path/to/your/model.onnx" >&2
+    exit 1
+fi
+
+ONNX_FILE=$1
+# Fail early with a clear message instead of a verbose trtexec failure.
+if [ ! -f "$ONNX_FILE" ]; then
+    echo "Error: ONNX file not found: $ONNX_FILE" >&2
+    exit 1
+fi
+ENGINE_FILE="${ONNX_FILE%.onnx}.trt"
+
+echo "==> Converting ONNX to TensorRT Engine <=="
+echo "  - Input ONNX:  $ONNX_FILE"
+echo "  - Output TRT:  $ENGINE_FILE"
+echo "  - Precision:   FP16"
+echo "=========================================="
+
+# --fp16 builds with 16-bit floats; --verbose prints detailed build logs.
+trtexec --onnx="$ONNX_FILE" \
+        --saveEngine="$ENGINE_FILE" \
+        --fp16 \
+        --verbose
+echo "=========================================="
+echo "✅ Successfully created TensorRT engine: $ENGINE_FILE"
\ No newline at end of file
diff --git a/rtdetrv2_pytorch/tools/run_profile.py b/rtdetrv2_pytorch/tools/run_profile.py
new file mode 100644
index 0000000..bdcf989
--- /dev/null
+++ b/rtdetrv2_pytorch/tools/run_profile.py
@@ -0,0 +1,110 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import math
+import os
+import sys
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+from typing import Any, Dict, List, Optional
+
+from src.core import YAMLConfig, yaml_utils
+
+__all__ = ["profile_stats"]
+
+def _auto_scale_flops(flops: float):
+    """Pick an SI prefix for a FLOP count (copied from torch.profiler.profile).
+
+    Returns a ``(scale, prefix)`` pair: multiply ``flops`` by ``scale`` and
+    append ``prefix`` ('', 'K', 'M', 'G', 'T' or 'P') to display it.
+    ``flops`` must be positive.
+    """
+    prefixes = ("", "K", "M", "G", "T", "P")
+    assert flops > 0
+    # Thousands-exponent of flops, clamped to the available prefixes.
+    exponent = min(math.log10(flops) / 3, float(len(prefixes) - 1))
+    exponent = max(0, exponent)
+    assert 0 <= exponent < len(prefixes)
+    return (pow(10, (math.floor(exponent) * -3.0)), prefixes[int(exponent)])
+
+def profile_stats(
+    model: nn.Module, 
+    data: Optional[Tensor]=None, 
+    shape: List[int]=[1, 3, 640, 640], 
+    verbose: bool=False
+) -> Dict[str, Any]:
+    """Count trainable parameters and profile forward passes for FLOPs.
+
+    Args:
+        model: Module to profile. It is run in eval mode and put back in
+            train mode afterwards if it was training on entry.
+        data: Input passed as ``model(data)``. When ``None``, a random tensor
+            of ``shape`` is created with the model's dtype and device.
+        shape: Shape of the random input used when ``data`` is ``None``.
+        verbose: Print the profiler table and the parameter/FLOP totals.
+
+    Returns:
+        Dict with ``n_parameters`` (trainable parameter count), ``n_flops``
+        (summed event FLOPs per active step) and ``info`` (profiler table).
+    """
+    is_training = model.training
+
+    model.train()
+    num_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
+    model.eval()
+
+    if data is None:
+        dtype = next(model.parameters()).dtype
+        device = next(model.parameters()).device
+        data = torch.rand(*shape, dtype=dtype, device=device)
+        print(device)
+
+    wait, warmup, active, repeat, skip_first = 0, 1, 1, 1, 0
+    try:
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+            schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat, skip_first=skip_first),
+            with_flops=True,
+        ) as p:
+            n_step = skip_first + (wait + warmup + active) * repeat
+            for _ in range(n_step):
+                _ = model(data)
+                p.step()
+    finally:
+        # Restore the caller's mode even if the forward pass raises.
+        if is_training:
+            model.train()
+
+    statistics = p.key_averages()
+    info = statistics.table(sort_by='self_cuda_time_total', row_limit=-1)
+    num_flops = sum(event.flops for event in statistics if event.flops > 0) / active
+
+    if verbose:
+        # _auto_scale_flops asserts flops > 0; skip scaling when none were recorded.
+        flops_scale, flops_header = _auto_scale_flops(num_flops) if num_flops > 0 else (1.0, "")
+        print(info)
+        print(f'Total number of trainable parameters: {num_params}')
+        print(f'Total number of flops: {num_flops * flops_scale:.3f}{flops_header} with {shape}')
+
+    return {'n_parameters': num_params, 'n_flops': num_flops, 'info': info}
+
+
+if __name__ == "__main__":
+    import argparse
+
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-c', '--config', type=str, required=True)
+    cli.add_argument('-d', '--device', type=str, default='cuda:0', help='device',)
+    cli.add_argument('-u', '--update', nargs='+', help='Update yaml config from command line.')
+    args = cli.parse_args()
+
+    # CLI flags (except --update itself) override the -u key/value updates.
+    overrides = yaml_utils.parse_cli(args.update) if args.update else {}
+    overrides.update({k: v for k, v in vars(args).items() if k != 'update' and v is not None})
+    cfg = YAMLConfig(args.config, **overrides)
+    # Profile the configured model on the requested device and print the report.
+    profile_stats(cfg.model.to(args.device), verbose=True)
diff --git a/rtdetrv2_pytorch/tools/train.py b/rtdetrv2_pytorch/tools/train.py
new file mode 100644
index 0000000..280caa8
--- /dev/null
+++ b/rtdetrv2_pytorch/tools/train.py
@@ -0,0 +1,65 @@
+"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+"""
+
+import os 
+import sys 
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+
+import argparse
+
+from src.misc import dist_utils
+from src.core import YAMLConfig, yaml_utils
+from src.solver import TASKS
+
+
+def main(args, ) -> None:
+    """Build the solver from the YAML config and run evaluation or training.
+    """
+    dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed)
+
+    assert not all([args.tuning, args.resume]), \
+        'Only support from_scratch or resume or tuning at one time'
+
+    # Tolerate a missing --update, matching the guard used by the other tools.
+    update_dict = yaml_utils.parse_cli(args.update) if args.update else {}
+    update_dict.update({k: v for k, v in args.__dict__.items() \
+        if k not in ['update', ] and v is not None})
+
+    cfg = YAMLConfig(args.config, **update_dict)
+    print('cfg: ', cfg.__dict__)
+
+    solver = TASKS[cfg.yaml_cfg['task']](cfg)
+    if args.test_only:
+        solver.val()
+    else:
+        solver.fit()
+
+    dist_utils.cleanup()
+    
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+
+    # priority 0
+    parser.add_argument('-c', '--config', type=str, required=True)
+    parser.add_argument('-r', '--resume', type=str, help='resume from checkpoint')
+    parser.add_argument('-t', '--tuning', type=str, help='tuning from checkpoint')
+    parser.add_argument('-d', '--device', type=str, help='device',)
+    parser.add_argument('--seed', type=int, help='exp reproducibility')
+    parser.add_argument('--use-amp', action='store_true', help='auto mixed precision training')
+    parser.add_argument('--output-dir', type=str, help='output directory')
+    parser.add_argument('--summary-dir', type=str, help='tensorboard summary')
+    parser.add_argument('--test-only', action='store_true', default=False,)
+
+    # priority 1
+    parser.add_argument('-u', '--update', nargs='+', help='update yaml config')
+
+    # env
+    parser.add_argument('--print-method', type=str, default='builtin', help='print method')
+    parser.add_argument('--print-rank', type=int, default=0, help='print rank id')
+
+    parser.add_argument('--local-rank', type=int, help='local rank id')
+    args = parser.parse_args()
+
+    main(args)