commit ec2379914810d4ca503155e838a427a7d83b3329 Author: 陈赣 Date: Wed Jun 3 12:42:47 2026 +0800 first commit diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..8dee225 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,13 @@ +# These are supported funding model platforms + +github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry +custom: ['https://github.com/lyuwenyu/cvperception/assets/17582080/2b4bfcd5-5c0f-45fd-badf-3f6e5b0249ac']# Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..53c082a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,21 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: lyuwenyu + +--- + +**Star RTDETR** +请先在RTDETR主页点击**star**以支持本项目 +Star RTDETR to help more people discover this project. + +--- + +**Describe the bug** +A clear and concise description of what the bug is. +If applicable, add screenshots to help explain your problem. + +**To Reproduce** +Steps to reproduce the behavior. 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3a63153 --- /dev/null +++ b/.gitignore @@ -0,0 +1,172 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + + +.DS_Store +*.ipynb +*.pth +*.pdparams +*.onnx +test.py +rtdetr_pytorch/output/ +rtdetr_pytorch/dataset/ +rtdetrv2_pytorch/output/ +rtdetrv2_pytorch/dataset/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e04aa87 --- /dev/null +++ b/README.md @@ -0,0 +1,140 @@ +English | [简体中文](README_cn.md) + + +

RT-DETR: DETRs Beat YOLOs on Real-time Object Detection

+

+ + + license + + + prs + + + issues + + + issues + + + arXiv + + + email + +

+ +--- + + +This is the official implementation of papers +- [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) +- [RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140) + + +
+Fig + + + + +
+
+ + +## 🚀 Updates +- \[2025.11.18\] Release the **newest** member of the RT-DETR family: [RT-DETRv4: Painlessly Furthering Real-Time Object Detection with Vision Foundation Models](https://github.com/RT-DETRs/RT-DETRv4). +By harnessing the rapidly evolving capabilities of Vision Foundation Models (VFMs), we boost lightweight detectors and, without incurring any extra inference latency, significantly improve the performance of the full-size model. +- \[2024.11.28\] Add torch tool for parameters and flops statistics. See [run_profile.py](./rtdetrv2_pytorch/tools/run_profile.py) +- \[2024.10.10\] Add sliced inference support for small object detection. [#468](https://github.com/lyuwenyu/RT-DETR/pull/468) +- \[2024.09.23\] Add ✅[Regnet and DLA34](https://github.com/lyuwenyu/RT-DETR/tree/main/rtdetr_pytorch) for RTDETR. +- \[2024.08.27\] Add hubconf.py file to support torch hub. +- \[2024.08.22\] Improve the performance of ✅ [RT-DETRv2-S](./rtdetrv2_pytorch/) to 48.1 mAP (+1.6 compared to RT-DETR-R18). +- \[2024.07.24\] Release ✅ [RT-DETRv2](./rtdetrv2_pytorch/)! +- \[2024.02.27\] Our work has been accepted to CVPR 2024! +- \[2024.01.23\] Fix difference on data augmentation with paper in rtdetr_pytorch [#84](https://github.com/lyuwenyu/RT-DETR/commit/5dc64138e439247b4e707dd6cebfe19d8d77f5b1). +- \[2023.11.07\] Add pytorch ✅ *rtdetr_r34vd* for requests [#107](https://github.com/lyuwenyu/RT-DETR/issues/107), [#114](https://github.com/lyuwenyu/RT-DETR/issues/114). +- \[2023.11.05\] Upgrade the logic of `remap_mscoco_category` to facilitate training of custom datasets, see details in [*Train custom data*](./rtdetr_pytorch/) part. [#81](https://github.com/lyuwenyu/RT-DETR/commit/95fc522fd7cf26c64ffd2ad0c622c392d29a9ebf). +- \[2023.10.23\] Add [*discussion for deployments*](https://github.com/lyuwenyu/RT-DETR/issues/95), supported onnxruntime, TensorRT, openVINO. +- \[2023.10.12\] Add tuning code for pytorch version, now you can tune rtdetr based on pretrained weights. 
+- \[2023.09.19\] Upload ✅ [*pytorch weights*](https://github.com/lyuwenyu/RT-DETR/issues/42) converted from the paddle version. +- \[2023.08.24\] Release RT-DETR-R18 pretrained models on objects365. *49.2 mAP* and *217 FPS*. +- \[2023.08.22\] Upload ✅ [*rtdetr_pytorch*](./rtdetr_pytorch/) source code. Please enjoy it! +- \[2023.08.15\] Release RT-DETR-R101 pretrained models on objects365. *56.2 mAP* and *74 FPS*. +- \[2023.07.30\] Release RT-DETR-R50 pretrained models on objects365. *55.3 mAP* and *108 FPS*. +- \[2023.07.28\] Fix some bugs, and add some comments. [1](https://github.com/lyuwenyu/RT-DETR/pull/14), [2](https://github.com/lyuwenyu/RT-DETR/commit/3b5cbcf8ae3b907e6b8bb65498a6be7c6736eabc). +- \[2023.07.13\] Upload ✅ [*training logs on coco*](https://github.com/lyuwenyu/RT-DETR/issues/8). +- \[2023.05.17\] Release RT-DETR-R18, RT-DETR-R34, RT-DETR-R50-m(example for scaled). +- \[2023.04.17\] Release RT-DETR-R50, RT-DETR-R101, RT-DETR-L, RT-DETR-X. + +## 📣 News +- RTDETR and RTDETRv2 are now available in Hugging Face Transformers. [#413](https://github.com/lyuwenyu/RT-DETR/issues/413), [#549](https://github.com/lyuwenyu/RT-DETR/issues/549) +- RTDETR is now available in [ultralytics/ultralytics](https://docs.ultralytics.com/zh/models/rtdetr/). 
+ +## 📍 Implementations +- 🔥 RT-DETRv2 + - paddle: [code&weight](./rtdetrv2_paddle/) + - pytorch: [code&weight](./rtdetrv2_pytorch/) +- 🔥 RT-DETR + - paddle: [code&weight](./rtdetr_paddle) + - pytorch: [code&weight](./rtdetr_pytorch) + + +| Model | Input shape | Dataset | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) +|:---:|:---:| :---:|:---:|:---:|:---:|:---:|:---:| +| RT-DETR-R18 | 640 | COCO | 46.5 | 63.8 | 20 | 60 | 217 | +| RT-DETR-R34 | 640 | COCO | 48.9 | 66.8 | 31 | 92 | 161 | +| RT-DETR-R50-m | 640 | COCO | 51.3 | 69.6 | 36 | 100 | 145 | +| RT-DETR-R50 | 640 | COCO | 53.1 | 71.3 | 42 | 136 | 108 | +| RT-DETR-R101 | 640 | COCO | 54.3 | 72.7 | 76 | 259 | 74 | +| RT-DETR-HGNetv2-L | 640 | COCO | 53.0 | 71.6 | 32 | 110 | 114 | +| RT-DETR-HGNetv2-X | 640 | COCO | 54.8 | 73.1 | 67 | 234 | 74 | +| RT-DETR-R18 | 640 | COCO + Objects365 | **49.2** | **66.6** | 20 | 60 | **217** | +| RT-DETR-R50 | 640 | COCO + Objects365 | **55.3** | **73.4** | 42 | 136 | **108** | +| RT-DETR-R101 | 640 | COCO + Objects365 | **56.2** | **74.6** | 76 | 259 | **74** | +**RT-DETRv2-S** | 640 | COCO | **48.1** (+1.6) | **65.1** | 20 | 60 | 217 | +**RT-DETRv2-M*** | 640 | COCO | **49.9** (+1.0) | **67.5** | 31 | 92 | 161 | +**RT-DETRv2-M** | 640 | COCO | **51.9** (+0.6) | **69.9** | 36 | 100 | 145 | +**RT-DETRv2-L** | 640 | COCO | **53.4** (+0.3) | **71.6** | 42 | 136 | 108 | +**RT-DETRv2-X** | 640 | COCO | 54.3 | **72.8** (+0.1) | 76 | 259| 74 | + +**Notes:** +- `COCO + Objects365` in the table means finetuned model on COCO using pretrained weights trained on Objects365. + + +## 🦄 Performance + +### 🏕️ Complex Scenarios +
+ +
+ +### 🌋 Difficult Conditions +
+ +
+ +## Citation +If you use `RT-DETR` or `RTDETRv2` in your work, please use the following BibTeX entries: +``` +@misc{lv2023detrs, + title={DETRs Beat YOLOs on Real-time Object Detection}, + author={Yian Zhao and Wenyu Lv and Shangliang Xu and Jinman Wei and Guanzhong Wang and Qingqing Dang and Yi Liu and Jie Chen}, + year={2023}, + eprint={2304.08069}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@misc{lv2024rtdetrv2improvedbaselinebagoffreebies, + title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, + author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu}, + year={2024}, + eprint={2407.17140}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2407.17140}, +} +``` diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 0000000..8571b46 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,64 @@ +简体中文 | [English](README.md) + +# RT-DETR + +文章"[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)"和"[RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140)"的官方实现. + +
+Fig + +
+ +
+ +
+ + +## 最新动态 +- 发布RT-DETRv2系列模型 +- 发布RT-DETR-R50, RT-DETR-R101模型 +- 发布RT-DETR-R50-m模型(scale模型的范例) +- 发布RT-DETR-R34, RT-DETR-R18模型 +- 发布RT-DETR-L, RT-DETR-X模型 + + +## 代码仓库 +- 🔥 RT-DETRv2 + - paddle: [code&weight](./rtdetrv2_paddle/) + - pytorch: [code&weight](./rtdetrv2_pytorch/) +- 🔥 RT-DETR + - paddle: [code&weight](./rtdetr_paddle) + - pytorch: [code&weight](./rtdetr_pytorch) + + +## 简介 + +RT-DETR是第一个实时端到端目标检测器。具体而言,我们设计了一个高效的混合编码器,通过解耦尺度内交互和跨尺度融合来高效处理多尺度特征,并提出了IoU感知的查询选择机制,以优化解码器查询的初始化。此外,RT-DETR支持通过使用不同的解码器层来灵活调整推理速度,而不需要重新训练,这有助于实时目标检测器的实际应用。RT-DETR-R50在COCO val2017上实现了53.1%的AP,在T4 GPU上实现了108FPS,RT-DETR-R101实现了54.3%的AP和74FPS,在速度和精度方面都优于相同规模的所有YOLO检测器。使用Objects365预训练之后, RT-DETR-R50 和 RT-DETR-R101 分别实现了 55.3% 和 56.2% AP的精度. +若要了解更多细节,请参考我们的论文[paper](https://arxiv.org/abs/2304.08069). + +
+ +
+ +## 引用RT-DETR +如果需要在你的研究中使用RT-DETR,请通过以下方式引用我们的论文: +``` +@misc{lv2023detrs, + title={DETRs Beat YOLOs on Real-time Object Detection}, + author={Yian Zhao and Wenyu Lv and Shangliang Xu and Jinman Wei and Guanzhong Wang and Qingqing Dang and Yi Liu and Jie Chen}, + year={2023}, + eprint={2304.08069}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@misc{lv2024rtdetrv2improvedbaselinebagoffreebies, + title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, + author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu}, + year={2024}, + eprint={2407.17140}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2407.17140}, +} +``` diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..779ce65 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,59 @@ +# 论文测速使用的部分代码和工具 + + +## 测试YOLO系列的速度 [in progress] +以[yolov8](https://github.com/ultralytics/ultralytics)为例 + +
+1. 转onnx + +执行`yolov8_onnx.py`中的`export_onnx`函数,新增代码主要涉及输出格式的转换 +
+ + +
+2. 插入nms + +使用`utils.py`中的`yolo_insert_nms`函数,导出onnx模型后使用[Netron](https://netron.app/)查看结构. image +
+ + +
+3. 转tensorrt + +可以使用`trtexec.md`中的脚本转换,或者使用`utils.py`中的Python代码转换 +```bash +# trtexec -h +trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16 +``` +
+ + +
+4. trtexec测速 + +可以使用`trtexec.md`中的脚本转换,去掉`--buildOnly`参数 + +
+ + + +
+5. profile分析(可选) + +在4的基础之上加以下命令 +```bash +nsys profile --force-overwrite=true -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms +``` +可以使用nsys可视化分析 +image + +
+ + +
+6. Python测速或者部署 + +在COCO val数据集上测试模型的平均速度时,可使用`trtinfer.py`中的代码进行推理 + +
diff --git a/benchmark/dataset.py b/benchmark/dataset.py new file mode 100644 index 0000000..ce60173 --- /dev/null +++ b/benchmark/dataset.py @@ -0,0 +1,102 @@ +'''by lyuwenyu +''' + +import os +import glob +from PIL import Image + +import torch +import torch.utils.data as data +import torchvision +import torchvision.transforms as T +import torchvision.transforms.functional as F + + +class ToTensor(T.ToTensor): + def __init__(self) -> None: + super().__init__() + + def __call__(self, pic): + if isinstance(pic, torch.Tensor): + return pic + return super().__call__(pic) + +class PadToSize(T.Pad): + def __init__(self, size, fill=0, padding_mode='constant'): + super().__init__(0, fill, padding_mode) + self.size = size + self.fill = fill + + def __call__(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be padded. + + Returns: + PIL Image or Tensor: Padded image. + """ + w, h = F.get_image_size(img) + padding = (0, 0, self.size[0] - w, self.size[1] - h) + return F.pad(img, padding, self.fill, self.padding_mode) + + +class Dataset(data.Dataset): + def __init__(self, img_dir: str='', preprocess: T.Compose=None, device='cuda:0') -> None: + super().__init__() + + self.device = device + self.size = 640 + + self.im_path_list = list(glob.glob(os.path.join(img_dir, '*.jpg'))) + + if preprocess is None: + self.preprocess = T.Compose([ + T.Resize(size=639, max_size=640), + PadToSize(size=(640, 640), fill=114), + ToTensor(), + T.ConvertImageDtype(torch.float), + ]) + else: + self.preprocess = preprocess + + def __len__(self, ): + return len(self.im_path_list) + + def __getitem__(self, index): + # im = Image.open(self.img_path_list[index]).convert('RGB') + im = torchvision.io.read_file(self.im_path_list[index]) + im = torchvision.io.decode_jpeg(im, mode=torchvision.io.ImageReadMode.RGB, device=self.device) + _, h, w = im.shape # c,h,w + + im = self.preprocess(im) + + blob = { + 'image': im, + 'im_shape': torch.tensor([self.size, self.size]).to(im.device), + 
'scale_factor': torch.tensor([self.size / h, self.size / w]).to(im.device), + 'orig_size': torch.tensor([w, h]).to(im.device), + } + + return blob + + @staticmethod + def post_process(): + pass + + @staticmethod + def collate_fn(): + pass + + +def draw_nms_result(blob, outputs, draw_score_threshold=0.25, name=''): + '''show result + Keys: + 'num_dets', 'det_boxes', 'det_scores', 'det_classes' + ''' + for i in range(blob['image'].shape[0]): + det_scores = outputs['det_scores'][i] + det_boxes = outputs['det_boxes'][i][det_scores > draw_score_threshold] + + im = (blob['image'][i] * 255).to(torch.uint8) + im = torchvision.utils.draw_bounding_boxes(im, boxes=det_boxes, width=2) + Image.fromarray(im.permute(1, 2, 0).cpu().numpy()).save(f'test_{name}_{i}.jpg') diff --git a/benchmark/trtexec.md b/benchmark/trtexec.md new file mode 100644 index 0000000..d41855b --- /dev/null +++ b/benchmark/trtexec.md @@ -0,0 +1,13 @@ + +```bash +# build tensorrt engine +trtexec --onnx=./yolov8l_w_nms.onnx --saveEngine=yolov8l_w_nms.engine --buildOnly --fp16 + +# using dynamic shapes +# --explicitBatch --minShapes=image:1x3x640x640 --optShapes=image:8x3x640x640 --maxShapes=image:16x3x640x640 --shapes=image:8x3x640x640 + +# timeline +nsys profile --force-overwrite=true -t 'nvtx,cuda,osrt,cudnn' -c cudaProfilerApi -o yolov8l_w_nms trtexec --loadEngine=./yolov8l_w_nms.engine --fp16 --avgRuns=10 --loadInputs='image:input_tensor.bin' + +# https://forums.developer.nvidia.com/t/about-loadinputs-in-trtexec/218880 +``` diff --git a/benchmark/trtinfer.py b/benchmark/trtinfer.py new file mode 100644 index 0000000..80eefb0 --- /dev/null +++ b/benchmark/trtinfer.py @@ -0,0 +1,153 @@ +'''by lyuwenyu +''' + +import time +import contextlib +from collections import namedtuple, OrderedDict + +import torch +import numpy as np +import tensorrt as trt + +from utils import TimeProfiler + +class TRTInference(object): + def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, 
verbose=False): + self.engine_path = engine_path + self.device = device + self.backend = backend + self.max_batch_size = max_batch_size + + self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO) + + self.engine = self.load_engine(engine_path) + + self.context = self.engine.create_execution_context() + + self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device) + self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items()) + + self.input_names = self.get_input_names() + self.output_names = self.get_output_names() + + if self.backend == 'cuda': + self.stream = cuda.Stream() + + self.time_profile = TimeProfiler() + + def init(self, ): + self.dynamic = False + + def load_engine(self, path): + '''load engine + ''' + trt.init_libnvinfer_plugins(self.logger, '') + with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def get_input_names(self, ): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + names.append(name) + return names + + def get_output_names(self, ): + names = [] + for _, name in enumerate(self.engine): + if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT: + names.append(name) + return names + + def get_bindings(self, engine, context, max_batch_size=32, device=None): + '''build binddings + ''' + Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr')) + bindings = OrderedDict() + # max_batch_size = 1 + + for i, name in enumerate(engine): + shape = engine.get_tensor_shape(name) + dtype = trt.nptype(engine.get_tensor_dtype(name)) + + if shape[0] == -1: + dynamic = True + shape[0] = max_batch_size + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: # dynamic + context.set_input_shape(name, shape) + + if self.backend == 'cuda': + if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: + data = 
np.random.randn(*shape).astype(dtype) + ptr = cuda.mem_alloc(data.nbytes) + bindings[name] = Binding(name, dtype, shape, data, ptr) + else: + data = cuda.pagelocked_empty(trt.volume(shape), dtype) + ptr = cuda.mem_alloc(data.nbytes) + bindings[name] = Binding(name, dtype, shape, data, ptr) + + else: + data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device) + bindings[name] = Binding(name, dtype, shape, data, data.data_ptr()) + + return bindings + + def run_torch(self, blob): + '''torch input + ''' + for n in self.input_names: + if self.bindings[n].shape != blob[n].shape: + self.context.set_input_shape(n, blob[n].shape) + self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape) + + self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names}) + self.context.execute_v2(list(self.bindings_addr.values())) + outputs = {n: self.bindings[n].data for n in self.output_names} + + return outputs + + + def async_run_cuda(self, blob): + '''numpy input + ''' + for n in self.input_names: + cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream) + + bindings_addr = [int(v) for _, v in self.bindings_addr.items()] + self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle) + + outputs = {} + for n in self.output_names: + cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream) + outputs[n] = self.bindings[n].data + + self.stream.synchronize() + + return outputs + + def __call__(self, blob): + if self.backend == 'torch': + return self.run_torch(blob) + + elif self.backend == 'cuda': + return self.async_run_cuda(blob) + + def synchronize(self, ): + if self.backend == 'torch' and torch.cuda.is_available(): + torch.cuda.synchronize() + + elif self.backend == 'cuda': + self.stream.synchronize() + + def warmup(self, blob, n): + for _ in range(n): + _ = self(blob) + + def speed(self, blob, n): + self.time_profile.reset() + for _ in range(n): + with self.time_profile: + _ = self(blob) + + 
return self.time_profile.total / n + diff --git a/benchmark/utils.py b/benchmark/utils.py new file mode 100644 index 0000000..f47ea2e --- /dev/null +++ b/benchmark/utils.py @@ -0,0 +1,83 @@ +'''by lyuwenyu +''' + +import time +import contextlib +import numpy as np +from PIL import Image +from collections import OrderedDict + +import onnx +import torch +import onnx_graphsurgeon + + +def to_binary_data(path, size=(640, 640), output_name='input_tensor.bin'): + '''--loadInputs='image:input_tensor.bin' + ''' + im = Image.open(path).resize(size) + data = np.asarray(im, dtype=np.float32).transpose(2, 0, 1)[None] / 255. + data.tofile(output_name) + + +def yolo_insert_nms(path, score_threshold=0.01, iou_threshold=0.7, max_output_boxes=300, simplify=False): + ''' + http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/api/onnxops/onnx__EfficientNMS_TRT.html + https://huggingface.co/spaces/muttalib1326/Punjabi_Character_Detection/blob/3dd1e17054c64e5f6b2254278f96cfa2bf418cd4/utils/add_nms.py + ''' + onnx_model = onnx.load(path) + + if simplify: + from onnxsim import simplify + onnx_model, _ = simplify(onnx_model, overwrite_input_shapes={'image': [1, 3, 640, 640]}) + + graph = onnx_graphsurgeon.import_onnx(onnx_model) + graph.toposort() + graph.fold_constants() + graph.cleanup() + + topk = max_output_boxes + attrs = OrderedDict(plugin_version='1', + background_class=-1, + max_output_boxes=topk, + score_threshold=score_threshold, + iou_threshold=iou_threshold, + score_activation=False, + box_coding=0, ) + + outputs = [onnx_graphsurgeon.Variable('num_dets', np.int32, [-1, 1]), + onnx_graphsurgeon.Variable('det_boxes', np.float32, [-1, topk, 4]), + onnx_graphsurgeon.Variable('det_scores', np.float32, [-1, topk]), + onnx_graphsurgeon.Variable('det_classes', np.int32, [-1, topk])] + + graph.layer(op='EfficientNMS_TRT', + name="batched_nms", + inputs=[graph.outputs[0], + graph.outputs[1]], + outputs=outputs, + attrs=attrs, ) + + graph.outputs = outputs + graph.cleanup().toposort() + + 
onnx.save(onnx_graphsurgeon.export_onnx(graph), f'yolo_w_nms.onnx') + + +class TimeProfiler(contextlib.ContextDecorator): + def __init__(self, ): + self.total = 0 + + def __enter__(self, ): + self.start = self.time() + return self + + def __exit__(self, type, value, traceback): + self.total += self.time() - self.start + + def reset(self, ): + self.total = 0 + + def time(self, ): + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() diff --git a/benchmark/yolov8_onnx.py b/benchmark/yolov8_onnx.py new file mode 100644 index 0000000..efa1a0c --- /dev/null +++ b/benchmark/yolov8_onnx.py @@ -0,0 +1,73 @@ +'''by lyuwenyu +''' + +import torch +import torchvision + +import numpy as np +import onnxruntime as ort + +from utils import yolo_insert_nms + +class YOLOv8(torch.nn.Module): + def __init__(self, name) -> None: + super().__init__() + from ultralytics import YOLO + # Load a model + # build a new model from scratch + # model = YOLO(f'{name}.yaml') + + # load a pretrained model (recommended for training) + model = YOLO(f'{name}.pt') + self.model = model.model + + def forward(self, x): + '''https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/tasks.py#L216 + ''' + pred: torch.Tensor = self.model(x)[0] # n 84 8400, + pred = pred.permute(0, 2, 1) + nc = pred.shape[-1] - 4 + boxes, scores = pred.split([4, nc], dim=-1) + boxes = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') + + return boxes, scores + + + +def export_onnx(name='yolov8n'): + '''export onnx + ''' + m = YOLOv8(name) + + x = torch.rand(1, 3, 640, 640) + dynamic_axes = { + 'image': {0: '-1'} + } + torch.onnx.export(m, x, f'{name}.onnx', + input_names=['image'], + output_names=['boxes', 'scores'], + opset_version=13, + dynamic_axes=dynamic_axes) + + data = np.random.rand(1, 3, 640, 640).astype(np.float32) + sess = ort.InferenceSession(f'{name}.onnx') + _ = sess.run(output_names=None, input_feed={'image': data}) + + +if __name__ == '__main__': + + import 
argparse + parser = argparse.ArgumentParser() + parser.add_argument('--name', type=str, default='yolov8l') + parser.add_argument('--score_threshold', type=float, default=0.001) + parser.add_argument('--iou_threshold', type=float, default=0.7) + parser.add_argument('--max_output_boxes', type=int, default=300) + args = parser.parse_args() + + export_onnx(name=args.name) + + yolo_insert_nms(path=f'{args.name}.onnx', + score_threshold=args.score_threshold, + iou_threshold=args.iou_threshold, + max_output_boxes=args.max_output_boxes, ) + diff --git a/hubconf.py b/hubconf.py new file mode 100644 index 0000000..27ff792 --- /dev/null +++ b/hubconf.py @@ -0,0 +1,174 @@ +"""Copyright(c) 2024 lyuwenyu. All Rights Reserved. +""" + + +import os +import sys +from pathlib import Path +from urllib.parse import urlparse + +ROOT = Path(__file__).absolute().parent / 'rtdetrv2_pytorch' +sys.path.append(str(ROOT)) + +from src.core import YAMLConfig + +import torch +import torch.nn as nn + +dependencies = ['torch', 'torchvision',] + + +def _load_checkpoint(path: str, map_location='cpu'): + scheme = urlparse(str(path)).scheme + if not scheme: + state = torch.load(path, map_location=map_location) + else: + state = torch.hub.load_state_dict_from_url(path, map_location=map_location) + return state + + +def _build_model(args, ): + """main + """ + cfg = YAMLConfig(args.config) + + if args.resume: + checkpoint = _load_checkpoint(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + + # NOTE load train mode state + cfg.model.load_state_dict(state) + + + class Model(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + return Model() + + +CONFIG = { + # rtdetr + 
'rtdetr_r18vd': { + 'config': ROOT / 'configs/rtdetr/rtdetr_r18vd_6x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth', + }, + 'rtdetr_r34vd': { + 'config': ROOT / 'configs/rtdetr/rtdetr_r34vd_6x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth', + }, + 'rtdetr_r50vd_m': { + 'config': ROOT / 'configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth', + }, + 'rtdetr_r50vd': { + 'config': ROOT / 'configs/rtdetr/rtdetr_r50vd_6x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth', + }, + 'rtdetr_r101vd': { + 'config': ROOT / 'configs/rtdetr/rtdetr_r101vd_6x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth', + }, + + # rtdetrv2 + 'rtdetrv2_r18vd': { + 'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth', + }, + 'rtdetrv2_r34vd': { + 'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth', + }, + 'rtdetrv2_r50vd_m': { + 'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth', + }, + 'rtdetrv2_r50vd': { + 'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml', + 'resume': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth', + }, + 'rtdetrv2_r101vd': { + 'config': ROOT / 'configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml', + 'resume': 
'https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth', + }, +} + + +# rtdetr +def rtdetr_r18vd(pretrained=True,): + args = type('Args', (), CONFIG['rtdetr_r18vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetr_r34vd(pretrained=True,): + args = type('Args', (), CONFIG['rtdetr_r34vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetr_r50vd_m(pretrained=True): + args = type('Args', (), CONFIG['rtdetr_r50vd_m'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetr_r50vd(pretrained=True): + args = type('Args', (), CONFIG['rtdetr_r50vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetr_r101vd(pretrained=True): + args = type('Args', (), CONFIG['rtdetr_r101vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +# rtdetrv2 +def rtdetrv2_r18vd(pretrained=True,): + args = type('Args', (), CONFIG['rtdetrv2_r18vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetrv2_r34vd(pretrained=True,): + args = type('Args', (), CONFIG['rtdetrv2_r34vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetrv2_r50vd_m(pretrained=True): + args = type('Args', (), CONFIG['rtdetrv2_r50vd_m'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetrv2_r50vd(pretrained=True): + args = type('Args', (), CONFIG['rtdetrv2_r50vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +def rtdetrv2_r101vd(pretrained=True): + args = type('Args', (), CONFIG['rtdetrv2_r101vd'])() + args.resume = args.resume if pretrained else '' + return _build_model(args, ) + + +rtdetrv2_s = rtdetrv2_r18vd +rtdetrv2_m_r34 = rtdetrv2_r34vd +rtdetrv2_m_r50 = rtdetrv2_r50vd_m 
+rtdetrv2_l = rtdetrv2_r50vd +rtdetrv2_x = rtdetrv2_r101vd + diff --git a/rtdetr_paddle/README.md b/rtdetr_paddle/README.md new file mode 100644 index 0000000..98312fc --- /dev/null +++ b/rtdetr_paddle/README.md @@ -0,0 +1,244 @@ +English | [简体中文](README_cn.md) + +## Model Zoo on COCO + +| Model | Epoch | Backbone | Input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) | Weight | Config | Log +|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:|:---| +| RT-DETR-R18 | 6x | ResNet-18 | 640 | 46.5 | 63.8 | 20 | 60 | 217 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_dec3_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r18vd_6x_coco.yml) | [rtdetr_r18vd_dec3_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038864/rtdetr_r18vd_dec3_6x_coco_log.txt) +| RT-DETR-R34 | 6x | ResNet-34 | 640 | 48.9 | 66.8 | 31 | 92 | 161 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r34vd_dec4_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r34vd_6x_coco.yml) | [rtdetr_r34vd_dec4_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038861/rtdetr_r34vd_dec4_6x_coco_log.txt) +| RT-DETR-R50-m | 6x | ResNet-50 | 640 | 51.3 | 69.6 | 36 | 100 | 145 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_m_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml) | - +| RT-DETR-R50 | 6x | ResNet-50 | 640 | 53.1 | 71.3 | 42 | 136 | 108 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_6x_coco.yml) | [rtdetr_r50vd_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038669/rtdetr_r50vd_6x_coco_log.txt) +| RT-DETR-R101 | 6x | ResNet-101 | 640 | 54.3 | 72.7 
| 76 | 259 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r101vd_6x_coco.yml) | [rtdetr_r101vd_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038707/rtdetr_r101vd_6x_coco_log.txt) +| RT-DETR-L | 6x | HGNetv2 | 640 | 53.0 | 71.6 | 32 | 110 | 114 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_l_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml) | [rtdetr_hgnetv2_l_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038753/rtdetr_hgnetv2_l_6x_coco_log.txt) +| RT-DETR-X | 6x | HGNetv2 | 640 | 54.8 | 73.1 | 67 | 234 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_x_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml) | [rtdetr_hgnetv2_x_6x_coco_log.txt](https://github.com/lyuwenyu/RT-DETR/files/12038795/rtdetr_hgnetv2_x_6x_coco_log.txt) + +**Notes:** +- RT-DETR uses 4 GPUs for training. +- RT-DETR was trained on COCO train2017 and evaluated on val2017. 
+ + +## Model Zoo on Objects365 +| Model | Epoch | Dataset | Input shape | $AP^{val}$ | $AP^{val}_{50}$ | T4 TensorRT FP16(FPS) | Weight | Log +|:---:|:---:|:---:| :---:|:---:|:---:|:---:|:---:|:---:| +RT-DETR-R18 | 1x | Objects365 | 640 | 22.9 | 31.2 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_1x_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12394706/rtdetr_r18vd_1x_objects365_log.txt) +RT-DETR-R18 | 5x | COCO + Objects365 | 640 | **49.2** | **66.6** | **217** | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_5x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12416808/rtdetr_r18vd_5x_coco_objects365_log.txt) +RT-DETR-R50 | 1x | Objects365 | 640 | 35.1 | 46.2 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_1x_objects365.pdparams) |[log.txt](https://github.com/lyuwenyu/RT-DETR/files/12193246/rtdetr_r50vd_1x_objects365_log.txt) +RT-DETR-R50 | 2x | COCO + Objects365 | 640 | **55.3** | **73.4** | **108** | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_2x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12208338/rtdetr_r50vd_2x_coco_objects365_log.txt) +RT-DETR-R101 | 1x | Objects365 | 640 | 36.8 | 48.3 | - | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_1x_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12340691/rtdetr_r101vd_1x_objects365_log.txt) +RT-DETR-R101 | 2x | COCO + Objects365 | 640 | **56.2** | **74.6** | **74** |[download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_2x_coco_objects365.pdparams) | [log.txt](https://github.com/lyuwenyu/RT-DETR/files/12340672/rtdetr_r101vd_2x_coco_objects365_log.txt) + + +**Notes:** +- `COCO + Objects365` in the table means finetuned model on COCO using pretrained weights trained on Objects365. + + + +## Quick start + +
+Install requirements + + +```bash +pip install -r requirements.txt +``` + +
+ +
+Compile (optional) + +```bash +cd ./ppdet/modeling/transformers/ext_op/ + +python setup_ms_deformable_attn_op.py install +``` +See [details](./ppdet/modeling/transformers/ext_op/) +
+ + +
+Data preparation + +- Download and extract COCO 2017 train and val images. +``` +path/to/coco/ + annotations/ # annotation json files + train2017/ # train images + val2017/ # val images +``` +- Modify config [`dataset_dir`](configs/datasets/coco_detection.yml) +
+ + +
+Training & Evaluation & Testing + +- Training on a Single GPU: + +```shell +# training on single-GPU +export CUDA_VISIBLE_DEVICES=0 +python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --eval +``` + +- Training on Multiple GPUs: + +```shell +# training on multi-GPU +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --fleet --eval +``` + +- Evaluation: + +```shell +python tools/eval.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams +``` + +- Inference: + +```shell +python tools/infer.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams \ + --infer_img=./demo/000000570688.jpg +``` + +
+ + +## Finetune +
+Details + +1. Prepare data in COCO format. +``` +path/to/custom/data/ + annotations/ # annotation json files + train/ # train images + val/ # val images +``` +2. Modify dataset config [`dataset_dir`, `image_dir`, `anno_path`](configs/datasets/coco_detection.yml) + +3. Modify model config [`pretrain_weights`](configs/rtdetr/_base_/rtdetr_r50vd.yml) to point to the URL of the COCO-pretrained weights listed in the model zoo. + +```bash +# or override it on the command line + +fleetrun --gpus=0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml -o pretrain_weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams --eval +``` +
+ + + +## Deploy + +
+1. Export model + +```shell +python tools/export_model.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams trt=True \ + --output_dir=output_inference +``` + +
+ +
+2. Convert to ONNX + +- Install [Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) and ONNX + +```shell +pip install onnx==1.13.0 +pip install paddle2onnx==1.0.5 +``` + +- Convert: + +```shell +paddle2onnx --model_dir=./output_inference/rtdetr_r50vd_6x_coco/ \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 16 \ + --save_file rtdetr_r50vd_6x_coco.onnx +``` +
+ +
+3. Convert to TensorRT + +- TensorRT version >= 8.5.1 +- For inference, refer to the [Benchmark](../benchmark) + +```shell +trtexec --onnx=./rtdetr_r50vd_6x_coco.onnx \ + --workspace=4096 \ + --shapes=image:1x3x640x640 \ + --saveEngine=rtdetr_r50vd_6x_coco.trt \ + --avgRuns=100 \ + --fp16 +``` + +- +
+ + +## Others + +
+1. Parameters and FLOPs + +1. Find and modify the Paddle [`dynamic_flops.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/hapi/dynamic_flops.py#L28) source code on your local machine + +```python +# e.g. /path/to/anaconda3/lib/python3.8/site-packages/paddle/hapi/dynamic_flops.py + +def flops(net, input_size, inputs=None, custom_ops=None, print_detail=False): + if isinstance(net, nn.Layer): + # If net is a dy2stat model, net.forward is StaticFunction instance, + # we set net.forward to original forward function. + _, net.forward = unwrap_decorators(net.forward) + + # by lyuwenyu + if inputs is None: + inputs = paddle.randn(input_size) + + return dynamic_flops( + net, inputs=inputs, custom_ops=custom_ops, print_detail=print_detail + ) + elif isinstance(net, paddle.static.Program): + return static_flops(net, print_detail=print_detail) + else: + warnings.warn( + "Your model must be an instance of paddle.nn.Layer or paddle.static.Program." + ) + return -1 +``` + +2. Run the code below + +```python +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.core.workspace import create + +cfg_path = './configs/rtdetr/rtdetr_r50vd_6x_coco.yml' +cfg = load_config(cfg_path) +model = create(cfg.architecture) + +blob = { + 'image': paddle.randn([1, 3, 640, 640]), + 'im_shape': paddle.to_tensor([[640, 640]]), + 'scale_factor': paddle.to_tensor([[1., 1.]]) +} +paddle.flops(model, None, blob, custom_ops=None, print_detail=False) + +# Outputs +# Total Flops: 68348108800 Total Params: 41514204 + +``` + + +
diff --git a/rtdetr_paddle/README_cn.md b/rtdetr_paddle/README_cn.md new file mode 100644 index 0000000..ec3bf00 --- /dev/null +++ b/rtdetr_paddle/README_cn.md @@ -0,0 +1,202 @@ +简体中文 | [English](README_en.md) + +## 模型 + +| Model | Epoch | backbone | input shape | $AP^{val}$ | $AP^{val}_{50}$| Params(M) | FLOPs(G) | T4 TensorRT FP16(FPS) | Pretrained Model | config | +|:--------------:|:-----:|:----------:| :-------:|:--------------------------:|:---------------------------:|:---------:|:--------:| :---------------------: |:------------------------------------------------------------------------------------:|:-------------------------------------------:| +| RT-DETR-R18 | 6x | ResNet-18 | 640 | 46.5 | 63.8 | 20 | 60 | 217 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r18vd_dec3_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r18vd_6x_coco.yml) +| RT-DETR-R34 | 6x | ResNet-34 | 640 | 48.9 | 66.8 | 31 | 92 | 161 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r34vd_dec4_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r34vd_6x_coco.yml) +| RT-DETR-R50-m | 6x | ResNet-50 | 640 | 51.3 | 69.6 | 36 | 100 | 145 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_m_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml) +| RT-DETR-R50 | 6x | ResNet-50 | 640 | 53.1 | 71.3 | 42 | 136 | 108 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r50vd_6x_coco.yml) +| RT-DETR-R101 | 6x | ResNet-101 | 640 | 54.3 | 72.7 | 76 | 259 | 74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_r101vd_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_r101vd_6x_coco.yml) +| RT-DETR-L | 6x | HGNetv2 | 640 | 53.0 | 71.6 | 32 | 110 | 114 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_l_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml) +| RT-DETR-X | 6x | HGNetv2 | 640 | 54.8 | 73.1 | 67 | 234 | 
74 | [download](https://bj.bcebos.com/v1/paddledet/models/rtdetr_hgnetv2_x_6x_coco.pdparams) | [config](./configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml) + + +**注意事项:** +- RT-DETR 使用4个GPU训练。 +- RT-DETR 在COCO train2017上训练,并在val2017上评估。 + +## 快速开始 + +
+依赖包 + + +```bash +pip install -r requirements.txt +``` + +
+ +
+准备数据 + +- 修改[配置文件`dataset_dir`](configs/datasets/coco_detection.yml) +
+ + +
+训练&评估 + +- 单卡GPU上训练: + +```shell +# training on single-GPU +export CUDA_VISIBLE_DEVICES=0 +python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --eval +``` + +- 多卡GPU上训练: + +```shell +# training on multi-GPU +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python -m paddle.distributed.launch --gpus 0,1,2,3 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml --fleet --eval +``` + +- 评估: + +```shell +python tools/eval.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams +``` + +- 测试: + +```shell +python tools/infer.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams \ + --infer_img=./demo/000000570688.jpg +``` + +详情请参考[快速开始文档](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/docs/tutorials/GETTING_STARTED.md). + +
+ +## 部署 + +
+1. 导出模型 + +```shell +python tools/export_model.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -o weights=https://bj.bcebos.com/v1/paddledet/models/rtdetr_r50vd_6x_coco.pdparams trt=True \ + --output_dir=output_inference +``` + +
+ +
+2. 转换模型至ONNX + +- 安装[Paddle2ONNX](https://github.com/PaddlePaddle/Paddle2ONNX) 和 ONNX + +```shell +pip install onnx==1.13.0 +pip install paddle2onnx==1.0.5 +``` + +- 转换模型: + +```shell +paddle2onnx --model_dir=./output_inference/rtdetr_r50vd_6x_coco/ \ + --model_filename model.pdmodel \ + --params_filename model.pdiparams \ + --opset_version 16 \ + --save_file rtdetr_r50vd_6x_coco.onnx +``` +
+ +
+3. 转换成TensorRT + +- 确保TensorRT的版本>=8.5.1 +- TRT推理可以参考[RT-DETR](https://github.com/lyuwenyu/RT-DETR)的部分代码或者其他网络资源 + +```shell +trtexec --onnx=./rtdetr_r50vd_6x_coco.onnx \ + --workspace=4096 \ + --shapes=image:1x3x640x640 \ + --saveEngine=rtdetr_r50vd_6x_coco.trt \ + --avgRuns=100 \ + --fp16 +``` + +- +
+ + +## 其他 + +
+1. 参数量和计算量统计 + +1. 找到[本地安装paddle的flops源代码](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/hapi/dynamic_flops.py#L28), 并修改为 + +```python +# anaconda3/lib/python3.8/site-packages/paddle/hapi/dynamic_flops.py +def flops(net, input_size, inputs=None, custom_ops=None, print_detail=False): + if isinstance(net, nn.Layer): + # If net is a dy2stat model, net.forward is StaticFunction instance, + # we set net.forward to original forward function. + _, net.forward = unwrap_decorators(net.forward) + + # by lyuwenyu + if inputs is None: + inputs = paddle.randn(input_size) + + return dynamic_flops( + net, inputs=inputs, custom_ops=custom_ops, print_detail=print_detail + ) + elif isinstance(net, paddle.static.Program): + return static_flops(net, print_detail=print_detail) + else: + warnings.warn( + "Your model must be an instance of paddle.nn.Layer or paddle.static.Program." + ) + return -1 +``` + +2. 使用以下代码片段实现参数量和计算量的统计 + +```python +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.core.workspace import create + +cfg_path = './configs/rtdetr/rtdetr_r50vd_6x_coco.yml' +cfg = load_config(cfg_path) +model = create(cfg.architecture) + +blob = { + 'image': paddle.randn([1, 3, 640, 640]), + 'im_shape': paddle.to_tensor([[640, 640]]), + 'scale_factor': paddle.to_tensor([[1., 1.]]) +} +paddle.flops(model, None, blob, custom_ops=None, print_detail=False) +``` +
+ + +
+2. YOLOs端到端速度测速 + +- 可以参考[RT-DETR](https://github.com/lyuwenyu/RT-DETR) benchmark部分或者其他网络资源 + +
+ + + +## 引用RT-DETR +如果需要在你的研究中使用RT-DETR,请通过以下方式引用我们的论文: +``` +@misc{lv2023detrs, + title={DETRs Beat YOLOs on Real-time Object Detection}, + author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu}, + year={2023}, + eprint={2304.08069}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/rtdetr_paddle/configs/datasets/coco_detection.yml b/rtdetr_paddle/configs/datasets/coco_detection.yml new file mode 100644 index 0000000..176ba27 --- /dev/null +++ b/rtdetr_paddle/configs/datasets/coco_detection.yml @@ -0,0 +1,21 @@ +metric: COCO +num_classes: 80 + +TrainDataset: + name: COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + name: COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + allow_empty: true + +TestDataset: + name: ImageFolder + anno_path: annotations/instances_val2017.json # also support txt (like VOC's label_list.txt) + dataset_dir: dataset/coco # if set, anno_path will be 'dataset_dir/anno_path' diff --git a/rtdetr_paddle/configs/datasets/voc.yml b/rtdetr_paddle/configs/datasets/voc.yml new file mode 100644 index 0000000..72182be --- /dev/null +++ b/rtdetr_paddle/configs/datasets/voc.yml @@ -0,0 +1,21 @@ +metric: VOC +map_type: 11point +num_classes: 20 + +TrainDataset: + name: VOCDataSet + dataset_dir: dataset/voc + anno_path: trainval.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +EvalDataset: + name: VOCDataSet + dataset_dir: dataset/voc + anno_path: test.txt + label_list: label_list.txt + data_fields: ['image', 'gt_bbox', 'gt_class', 'difficult'] + +TestDataset: + name: ImageFolder + anno_path: dataset/voc/label_list.txt diff --git a/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml 
b/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml new file mode 100644 index 0000000..5abe2f7 --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/_base_/optimizer_6x.yml @@ -0,0 +1,19 @@ +epoch: 72 + +LearningRate: + base_lr: 0.0001 + schedulers: + - !PiecewiseDecay + gamma: 1.0 + milestones: [100] + use_warmup: true + - !LinearWarmup + start_factor: 0.001 + steps: 2000 + +OptimizerBuilder: + clip_grad_by_norm: 0.1 + regularizer: false + optimizer: + type: AdamW + weight_decay: 0.0001 diff --git a/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml new file mode 100644 index 0000000..7859dfb --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_r50vd.yml @@ -0,0 +1,71 @@ +architecture: DETR +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_vd_ssld_v2_pretrained.pdparams +norm_type: sync_bn +use_ema: True +ema_decay: 0.9999 +ema_decay_type: "exponential" +ema_filter_no_grad: True +hidden_dim: 256 +use_focal_loss: True +eval_size: [640, 640] # h, w + + +DETR: + backbone: ResNet + neck: HybridEncoder + transformer: RTDETRTransformer + detr_head: DINOHead + post_process: DETRPostProcess + +ResNet: + # index 0 stands for res2 + depth: 50 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [1, 2, 3] + lr_mult_list: [0.1, 0.1, 0.1, 0.1] + num_stages: 4 + freeze_stem_only: True + +HybridEncoder: + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + encoder_layer: + name: TransformerLayer + d_model: 256 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ activation: 'gelu' + expansion: 1.0 + + +RTDETRTransformer: + num_queries: 300 + position_embed_type: sine + feat_strides: [8, 16, 32] + num_levels: 3 + nhead: 8 + num_decoder_layers: 6 + dim_feedforward: 1024 + dropout: 0.0 + activation: relu + num_denoising: 100 + label_noise_ratio: 0.5 + box_noise_scale: 1.0 + learnt_init_query: False + +DINOHead: + loss: + name: DINOLoss + loss_coeff: {class: 1, bbox: 5, giou: 2} + aux_loss: True + use_vfl: True + matcher: + name: HungarianMatcher + matcher_coeff: {class: 2, bbox: 5, giou: 2} + +DETRPostProcess: + num_top_queries: 300 diff --git a/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml new file mode 100644 index 0000000..b1a2a00 --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/_base_/rtdetr_reader.yml @@ -0,0 +1,43 @@ +worker_num: 4 +TrainReader: + sample_transforms: + - Decode: {} + - RandomDistort: {prob: 0.8} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {prob: 0.8} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: {target_size: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - NormalizeBox: {} + - BboxXYXY2XYWH: {} + - Permute: {} + batch_size: 4 + shuffle: true + drop_last: true + collate_batch: false + use_shared_memory: false + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} # target_size: (h, w) + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 4 + shuffle: false + drop_last: false + + +TestReader: + inputs_def: + image_shape: [3, 640, 640] + sample_transforms: + - Decode: {} + - Resize: {target_size: [640, 640], keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - 
Permute: {} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml new file mode 100644 index 0000000..4f3e77d --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_l_6x_coco.yml @@ -0,0 +1,24 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_hgnetv2_l_6x_coco/model_final +pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PPHGNetV2_L_ssld_pretrained.pdparams +find_unused_parameters: True +log_iter: 200 + + +DETR: + backbone: PPHGNetV2 + +PPHGNetV2: + arch: 'L' + return_idx: [1, 2, 3] + freeze_stem_only: True + freeze_at: 0 + freeze_norm: True + lr_mult_list: [0., 0.05, 0.05, 0.05, 0.05] diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml new file mode 100644 index 0000000..37f5d17 --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_hgnetv2_x_6x_coco.yml @@ -0,0 +1,40 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_hgnetv2_l_6x_coco/model_final +pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PPHGNetV2_X_ssld_pretrained.pdparams +find_unused_parameters: True +log_iter: 200 + + + +DETR: + backbone: PPHGNetV2 + + +PPHGNetV2: + arch: 'X' + return_idx: [1, 2, 3] + freeze_stem_only: True + freeze_at: 0 + freeze_norm: True + lr_mult_list: [0., 0.01, 0.01, 0.01, 0.01] + + +HybridEncoder: + hidden_dim: 384 + use_encoder_idx: [2] + num_encoder_layers: 1 + encoder_layer: + name: TransformerLayer + d_model: 384 + nhead: 8 + dim_feedforward: 2048 + dropout: 0. 
+ activation: 'gelu' + expansion: 1.0 diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml new file mode 100644 index 0000000..fd2f55a --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r101vd_6x_coco.yml @@ -0,0 +1,37 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_r101vd_6x_coco/model_final +find_unused_parameters: True +log_iter: 200 + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet101_vd_ssld_pretrained.pdparams + +ResNet: + # index 0 stands for res2 + depth: 101 + variant: d + norm_type: bn + freeze_at: 0 + return_idx: [1, 2, 3] + lr_mult_list: [0.01, 0.01, 0.01, 0.01] + num_stages: 4 + freeze_stem_only: True + +HybridEncoder: + hidden_dim: 384 + use_encoder_idx: [2] + num_encoder_layers: 1 + encoder_layer: + name: TransformerLayer + d_model: 384 + nhead: 8 + dim_feedforward: 2048 + dropout: 0. + activation: 'gelu' + expansion: 1.0 diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml new file mode 100644 index 0000000..8cf9818 --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r18vd_6x_coco.yml @@ -0,0 +1,38 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_r18_6x_coco/model_final +find_unused_parameters: True +log_iter: 200 + +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet18_vd_pretrained.pdparams +ResNet: + depth: 18 + variant: d + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. 
+ +HybridEncoder: + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + encoder_layer: + name: TransformerLayer + d_model: 256 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. + activation: 'gelu' + expansion: 0.5 + depth_mult: 1.0 + +RTDETRTransformer: + eval_idx: -1 + num_decoder_layers: 3 diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml new file mode 100644 index 0000000..2ab07ba --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r34vd_6x_coco.yml @@ -0,0 +1,38 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_r34vd_6x_coco/model_final +find_unused_parameters: True +log_iter: 200 + +pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/ResNet34_vd_pretrained.pdparams +ResNet: + depth: 34 + variant: d + return_idx: [1, 2, 3] + freeze_at: -1 + freeze_norm: false + norm_decay: 0. + +HybridEncoder: + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + encoder_layer: + name: TransformerLayer + d_model: 256 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ activation: 'gelu' + expansion: 0.5 + depth_mult: 1.0 + +RTDETRTransformer: + eval_idx: -1 + num_decoder_layers: 4 diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml new file mode 100644 index 0000000..51bf443 --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_6x_coco.yml @@ -0,0 +1,11 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_r50vd_6x_coco/model_final +find_unused_parameters: True +log_iter: 200 diff --git a/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml new file mode 100644 index 0000000..d4ab6f9 --- /dev/null +++ b/rtdetr_paddle/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml @@ -0,0 +1,28 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_6x.yml', + '_base_/rtdetr_r50vd.yml', + '_base_/rtdetr_reader.yml', +] + +weights: output/rtdetr_r50vd_m_6x_coco/model_final +find_unused_parameters: True +log_iter: 200 + +HybridEncoder: + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + encoder_layer: + name: TransformerLayer + d_model: 256 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. + activation: 'gelu' + expansion: 0.5 + depth_mult: 1.0 + +RTDETRTransformer: + eval_idx: 2 # use 3th decoder layer to eval diff --git a/rtdetr_paddle/configs/runtime.yml b/rtdetr_paddle/configs/runtime.yml new file mode 100644 index 0000000..a58b171 --- /dev/null +++ b/rtdetr_paddle/configs/runtime.yml @@ -0,0 +1,16 @@ +use_gpu: true +use_xpu: false +use_mlu: false +use_npu: false +log_iter: 20 +save_dir: output +snapshot_epoch: 1 +print_flops: false +print_params: false + +# Exporting the model +export: + post_process: True # Whether post-processing is included in the network when export model. 
+ nms: True # Whether NMS is included in the network when export model. + benchmark: False # It is used to testing model performance, if set `True`, post-process and NMS will not be exported. + fuse_conv_bn: False diff --git a/rtdetr_paddle/dataset/coco/download_coco.py b/rtdetr_paddle/dataset/coco/download_coco.py new file mode 100644 index 0000000..993218f --- /dev/null +++ b/rtdetr_paddle/dataset/coco/download_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os.path as osp +import logging +# add python path of PaddleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'coco') diff --git a/rtdetr_paddle/dataset/voc/create_list.py b/rtdetr_paddle/dataset/voc/create_list.py new file mode 100644 index 0000000..7696073 --- /dev/null +++ b/rtdetr_paddle/dataset/voc/create_list.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os.path as osp +import logging +# add python path of PaddleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import create_voc_list + +logging.basicConfig(level=logging.INFO) + +voc_path = osp.split(osp.realpath(sys.argv[0]))[0] +create_voc_list(voc_path) diff --git a/rtdetr_paddle/dataset/voc/download_voc.py b/rtdetr_paddle/dataset/voc/download_voc.py new file mode 100644 index 0000000..2375fbf --- /dev/null +++ b/rtdetr_paddle/dataset/voc/download_voc.py @@ -0,0 +1,28 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os.path as osp +import logging +# add python path of PaddleDetection to sys.path +parent_path = osp.abspath(osp.join(__file__, *(['..'] * 3))) +if parent_path not in sys.path: + sys.path.append(parent_path) + +from ppdet.utils.download import download_dataset + +logging.basicConfig(level=logging.INFO) + +download_path = osp.split(osp.realpath(sys.argv[0]))[0] +download_dataset(download_path, 'voc') diff --git a/rtdetr_paddle/dataset/voc/label_list.txt b/rtdetr_paddle/dataset/voc/label_list.txt new file mode 100644 index 0000000..8420ab3 --- /dev/null +++ b/rtdetr_paddle/dataset/voc/label_list.txt @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/rtdetr_paddle/ppdet/__init__.py b/rtdetr_paddle/ppdet/__init__.py new file mode 100644 index 0000000..fa1d8af --- /dev/null +++ b/rtdetr_paddle/ppdet/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import (core, data, engine, modeling, optimizer, metrics, utils) + + +try: + from .version import full_version as __version__ + from .version import commit as __git_commit__ +except ImportError: + import sys + sys.stderr.write("Warning: import ppdet from source directory " \ + "without installing, run 'python setup.py install' to " \ + "install ppdet firstly\n") diff --git a/rtdetr_paddle/ppdet/core/__init__.py b/rtdetr_paddle/ppdet/core/__init__.py new file mode 100644 index 0000000..d042771 --- /dev/null +++ b/rtdetr_paddle/ppdet/core/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import config diff --git a/rtdetr_paddle/ppdet/core/config/__init__.py b/rtdetr_paddle/ppdet/core/config/__init__.py new file mode 100644 index 0000000..d0c32e2 --- /dev/null +++ b/rtdetr_paddle/ppdet/core/config/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/rtdetr_paddle/ppdet/core/config/schema.py b/rtdetr_paddle/ppdet/core/config/schema.py new file mode 100644 index 0000000..2e41b5c --- /dev/null +++ b/rtdetr_paddle/ppdet/core/config/schema.py @@ -0,0 +1,248 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import inspect +import importlib +import re + +try: + from docstring_parser import parse as doc_parse +except Exception: + + def doc_parse(*args): + pass + + +try: + from typeguard import check_type +except Exception: + + def check_type(*args): + pass + + +__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] + + +class SchemaValue(object): + def __init__(self, name, doc='', type=None): + super(SchemaValue, self).__init__() + self.name = name + self.doc = doc + self.type = type + + def set_default(self, value): + self.default = value + + def has_default(self): + return hasattr(self, 'default') + + +class SchemaDict(dict): + def __init__(self, **kwargs): + super(SchemaDict, self).__init__() + self.schema = {} + self.strict = False + self.doc = "" + self.update(kwargs) + + def __setitem__(self, key, value): + # XXX also update regular dict to SchemaDict?? 
+ if isinstance(value, dict) and key in self and isinstance(self[key], + SchemaDict): + self[key].update(value) + else: + super(SchemaDict, self).__setitem__(key, value) + + def __missing__(self, key): + if self.has_default(key): + return self.schema[key].default + elif key in self.schema: + return self.schema[key] + else: + raise KeyError(key) + + def copy(self): + newone = SchemaDict() + newone.__dict__.update(self.__dict__) + newone.update(self) + return newone + + def set_schema(self, key, value): + assert isinstance(value, SchemaValue) + self.schema[key] = value + + def set_strict(self, strict): + self.strict = strict + + def has_default(self, key): + return key in self.schema and self.schema[key].has_default() + + def is_default(self, key): + if not self.has_default(key): + return False + if hasattr(self[key], '__dict__'): + return True + else: + return key not in self or self[key] == self.schema[key].default + + def find_default_keys(self): + return [ + k for k in list(self.keys()) + list(self.schema.keys()) + if self.is_default(k) + ] + + def mandatory(self): + return any([k for k in self.schema.keys() if not self.has_default(k)]) + + def find_missing_keys(self): + missing = [ + k for k in self.schema.keys() + if k not in self and not self.has_default(k) + ] + placeholders = [k for k in self if self[k] in ('', '')] + return missing + placeholders + + def find_extra_keys(self): + return list(set(self.keys()) - set(self.schema.keys())) + + def find_mismatch_keys(self): + mismatch_keys = [] + for arg in self.schema.values(): + if arg.type is not None: + try: + check_type("{}.{}".format(self.name, arg.name), + self[arg.name], arg.type) + except Exception: + mismatch_keys.append(arg.name) + return mismatch_keys + + def validate(self): + missing_keys = self.find_missing_keys() + if missing_keys: + raise ValueError("Missing param for class<{}>: {}".format( + self.name, ", ".join(missing_keys))) + extra_keys = self.find_extra_keys() + if extra_keys and self.strict: 
+ raise ValueError("Extraneous param for class<{}>: {}".format( + self.name, ", ".join(extra_keys))) + mismatch_keys = self.find_mismatch_keys() + if mismatch_keys: + raise TypeError("Wrong param type for class<{}>: {}".format( + self.name, ", ".join(mismatch_keys))) + + +class SharedConfig(object): + """ + Representation class for `__shared__` annotations, which work as follows: + + - if `key` is set for the module in config file, its value will take + precedence + - if `key` is not set for the module but present in the config file, its + value will be used + - otherwise, use the provided `default_value` as fallback + + Args: + key: config[key] will be injected + default_value: fallback value + """ + + def __init__(self, key, default_value=None): + super(SharedConfig, self).__init__() + self.key = key + self.default_value = default_value + + +def extract_schema(cls): + """ + Extract schema from a given class + + Args: + cls (type): Class from which to extract. + + Returns: + schema (SchemaDict): Extracted schema. 
+ """ + ctor = cls.__init__ + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(ctor) + annotations = argspec.annotations + has_kwargs = argspec.varkw is not None + else: + argspec = inspect.getfullargspec(ctor) + # python 2 type hinting workaround, see pep-3107 + # however, since `typeguard` does not support python 2, type checking + # is still python 3 only for now + annotations = getattr(ctor, '__annotations__', {}) + has_kwargs = argspec.varkw is not None + + names = [arg for arg in argspec.args if arg != 'self'] + defaults = argspec.defaults + num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 + num_required = len(names) - num_defaults + + docs = cls.__doc__ + if docs is None and getattr(cls, '__category__', None) == 'op': + docs = cls.__call__.__doc__ + try: + docstring = doc_parse(docs) + except Exception: + docstring = None + + if docstring is None: + comments = {} + else: + comments = {} + for p in docstring.params: + match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) + if match_obj is not None: + comments[match_obj.group(1)] = p.description + + schema = SchemaDict() + schema.name = cls.__name__ + schema.doc = "" + if docs is not None: + start_pos = docs[0] == '\n' and 1 or 0 + schema.doc = docs[start_pos:].split("\n")[0].strip() + # XXX handle paddle's weird doc convention + if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: + schema.doc = schema.doc[2:-2].strip() + schema.category = hasattr(cls, '__category__') and getattr( + cls, '__category__') or 'module' + schema.strict = not has_kwargs + schema.pymodule = importlib.import_module(cls.__module__) + schema.inject = getattr(cls, '__inject__', []) + schema.shared = getattr(cls, '__shared__', []) + for idx, name in enumerate(names): + comment = name in comments and comments[name] or name + if name in schema.inject: + type_ = None + else: + type_ = name in annotations and annotations[name] or None + value_schema = 
SchemaValue(name, comment, type_) + if name in schema.shared: + assert idx >= num_required, "shared config must have default value" + default = defaults[idx - num_required] + value_schema.set_default(SharedConfig(name, default)) + elif idx >= num_required: + default = defaults[idx - num_required] + value_schema.set_default(default) + schema.set_schema(name, value_schema) + + return schema diff --git a/rtdetr_paddle/ppdet/core/config/yaml_helpers.py b/rtdetr_paddle/ppdet/core/config/yaml_helpers.py new file mode 100644 index 0000000..181cfe6 --- /dev/null +++ b/rtdetr_paddle/ppdet/core/config/yaml_helpers.py @@ -0,0 +1,118 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import inspect + +import yaml +from .schema import SharedConfig + +__all__ = ['serializable', 'Callable'] + + +def represent_dictionary_order(self, dict_data): + return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) + + +def setup_orderdict(): + from collections import OrderedDict + yaml.add_representer(OrderedDict, represent_dictionary_order) + + +def _make_python_constructor(cls): + def python_constructor(loader, node): + if isinstance(node, yaml.SequenceNode): + args = loader.construct_sequence(node, deep=True) + return cls(*args) + else: + kwargs = loader.construct_mapping(node, deep=True) + try: + return cls(**kwargs) + except Exception as ex: + print("Error when construct {} instance from yaml config". + format(cls.__name__)) + raise ex + + return python_constructor + + +def _make_python_representer(cls): + # python 2 compatibility + if hasattr(inspect, 'getfullargspec'): + argspec = inspect.getfullargspec(cls) + else: + argspec = inspect.getfullargspec(cls.__init__) + argnames = [arg for arg in argspec.args if arg != 'self'] + + def python_representer(dumper, obj): + if argnames: + data = {name: getattr(obj, name) for name in argnames} + else: + data = obj.__dict__ + if '_id' in data: + del data['_id'] + return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) + + return python_representer + + +def serializable(cls): + """ + Add loader and dumper for given class, which must be + "trivially serializable" + + Args: + cls: class to be serialized + + Returns: cls + """ + yaml.add_constructor(u'!{}'.format(cls.__name__), + _make_python_constructor(cls)) + yaml.add_representer(cls, _make_python_representer(cls)) + return cls + + +yaml.add_representer(SharedConfig, + lambda d, o: d.represent_data(o.default_value)) + + +@serializable +class Callable(object): + """ + Helper to be used in Yaml for creating arbitrary class objects + + Args: + full_type (str): the full module path to target function + """ + + def 
__init__(self, full_type, args=[], kwargs={}): + super(Callable, self).__init__() + self.full_type = full_type + self.args = args + self.kwargs = kwargs + + def __call__(self): + if '.' in self.full_type: + idx = self.full_type.rfind('.') + module = importlib.import_module(self.full_type[:idx]) + func_name = self.full_type[idx + 1:] + else: + try: + module = importlib.import_module('builtins') + except Exception: + module = importlib.import_module('__builtin__') + func_name = self.full_type + + func = getattr(module, func_name) + return func(*self.args, **self.kwargs) diff --git a/rtdetr_paddle/ppdet/core/workspace.py b/rtdetr_paddle/ppdet/core/workspace.py new file mode 100644 index 0000000..6735bcf --- /dev/null +++ b/rtdetr_paddle/ppdet/core/workspace.py @@ -0,0 +1,292 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import importlib +import os +import sys + +import yaml +import collections + +try: + collectionsAbc = collections.abc +except AttributeError: + collectionsAbc = collections + +from .config.schema import SchemaDict, SharedConfig, extract_schema +from .config.yaml_helpers import serializable + +__all__ = [ + 'global_config', + 'load_config', + 'merge_config', + 'get_registered_modules', + 'create', + 'register', + 'serializable', + 'dump_value', +] + + +def dump_value(value): + # XXX this is hackish, but collections.abc is not available in python 2 + if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): + value = yaml.dump(value, default_flow_style=True) + value = value.replace('\n', '') + value = value.replace('...', '') + return "'{}'".format(value) + else: + # primitive types + return str(value) + + +class AttrDict(dict): + """Single level attribute dict, NOT recursive""" + + def __init__(self, **kwargs): + super(AttrDict, self).__init__() + super(AttrDict, self).update(kwargs) + + def __getattr__(self, key): + if key in self: + return self[key] + raise AttributeError("object has no attribute '{}'".format(key)) + + def __setattr__(self, key, value): + self[key] = value + + def copy(self): + new_dict = AttrDict() + for k, v in self.items(): + new_dict.update({k: v}) + return new_dict + + +global_config = AttrDict() + +BASE_KEY = '_BASE_' + + +# parse and load _BASE_ recursively +def _load_config_with_base(file_path): + with open(file_path) as f: + file_cfg = yaml.load(f, Loader=yaml.Loader) + + # NOTE: cfgs outside have higher priority than cfgs in _BASE_ + if BASE_KEY in file_cfg: + all_base_cfg = AttrDict() + base_ymls = list(file_cfg[BASE_KEY]) + for base_yml in base_ymls: + if base_yml.startswith("~"): + base_yml = os.path.expanduser(base_yml) + if not base_yml.startswith('/'): + base_yml = 
os.path.join(os.path.dirname(file_path), base_yml) + + with open(base_yml) as f: + base_cfg = _load_config_with_base(base_yml) + all_base_cfg = merge_config(base_cfg, all_base_cfg) + + del file_cfg[BASE_KEY] + return merge_config(file_cfg, all_base_cfg) + + return file_cfg + + +def load_config(file_path): + """ + Load config from file. + + Args: + file_path (str): Path of the config file to be loaded. + + Returns: global config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files for now" + + # load config from file and merge into global config + cfg = _load_config_with_base(file_path) + cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] + merge_config(cfg) + + return global_config + + +def dict_merge(dct, merge_dct): + """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of + updating only top-level keys, dict_merge recurses down into dicts nested + to an arbitrary depth, updating keys. The ``merge_dct`` is merged into + ``dct``. + + Args: + dct: dict onto which the merge is executed + merge_dct: dct merged into dct + + Returns: dct + """ + for k, v in merge_dct.items(): + if (k in dct and isinstance(dct[k], dict) and + isinstance(merge_dct[k], collectionsAbc.Mapping)): + dict_merge(dct[k], merge_dct[k]) + else: + dct[k] = merge_dct[k] + return dct + + +def merge_config(config, another_cfg=None): + """ + Merge config into global config or another_cfg. + + Args: + config (dict): Config to be merged. 
+ + Returns: global config + """ + global global_config + dct = another_cfg or global_config + return dict_merge(dct, config) + + +def get_registered_modules(): + return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)} + + +def make_partial(cls): + op_module = importlib.import_module(cls.__op__.__module__) + op = getattr(op_module, cls.__op__.__name__) + cls.__category__ = getattr(cls, '__category__', None) or 'op' + + def partial_apply(self, *args, **kwargs): + kwargs_ = self.__dict__.copy() + kwargs_.update(kwargs) + return op(*args, **kwargs_) + + if getattr(cls, '__append_doc__', True): # XXX should default to True? + if sys.version_info[0] > 2: + cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__) + cls.__init__.__doc__ = op.__doc__ + cls.__call__ = partial_apply + cls.__call__.__doc__ = op.__doc__ + else: + # XXX work around for python 2 + partial_apply.__doc__ = op.__doc__ + cls.__call__ = partial_apply + return cls + + +def register(cls): + """ + Register a given module class. + + Args: + cls (type): Module class to be registered. + + Returns: cls + """ + if cls.__name__ in global_config: + raise ValueError("Module class already registered: {}".format( + cls.__name__)) + if hasattr(cls, '__op__'): + cls = make_partial(cls) + global_config[cls.__name__] = extract_schema(cls) + return cls + + +def create(cls_or_name, **kwargs): + """ + Create an instance of given module class. + + Args: + cls_or_name (type or str): Class of which to create instance. 
+ + Returns: instance of type `cls_or_name` + """ + assert type(cls_or_name) in [type, str + ], "should be a class or name of a class" + name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ + if name in global_config: + if isinstance(global_config[name], SchemaDict): + pass + elif hasattr(global_config[name], "__dict__"): + # support instance return directly + return global_config[name] + else: + raise ValueError("The module {} is not registered".format(name)) + else: + raise ValueError("The module {} is not registered".format(name)) + + config = global_config[name] + cls = getattr(config.pymodule, name) + cls_kwargs = {} + cls_kwargs.update(global_config[name]) + + # parse `shared` annoation of registered modules + if getattr(config, 'shared', None): + for k in config.shared: + target_key = config[k] + shared_conf = config.schema[k].default + assert isinstance(shared_conf, SharedConfig) + if target_key is not None and not isinstance(target_key, + SharedConfig): + continue # value is given for the module + elif shared_conf.key in global_config: + # `key` is present in config + cls_kwargs[k] = global_config[shared_conf.key] + else: + cls_kwargs[k] = shared_conf.default_value + + # parse `inject` annoation of registered modules + if getattr(cls, 'from_config', None): + cls_kwargs.update(cls.from_config(config, **kwargs)) + + if getattr(config, 'inject', None): + for k in config.inject: + target_key = config[k] + # optional dependency + if target_key is None: + continue + + if isinstance(target_key, dict) or hasattr(target_key, '__dict__'): + if 'name' not in target_key.keys(): + continue + inject_name = str(target_key['name']) + if inject_name not in global_config: + raise ValueError( + "Missing injection name {} and check it's name in cfg file". 
+ format(k)) + target = global_config[inject_name] + for i, v in target_key.items(): + if i == 'name': + continue + target[i] = v + if isinstance(target, SchemaDict): + cls_kwargs[k] = create(inject_name) + elif isinstance(target_key, str): + if target_key not in global_config: + raise ValueError("Missing injection config:", target_key) + target = global_config[target_key] + if isinstance(target, SchemaDict): + cls_kwargs[k] = create(target_key) + elif hasattr(target, '__dict__'): # serialized object + cls_kwargs[k] = target + else: + raise ValueError("Unsupported injection type:", target_key) + # prevent modification of global config values of reference types + # (e.g., list, dict) from within the created module instances + #kwargs = copy.deepcopy(kwargs) + return cls(**cls_kwargs) diff --git a/rtdetr_paddle/ppdet/data/__init__.py b/rtdetr_paddle/ppdet/data/__init__.py new file mode 100644 index 0000000..a12aa32 --- /dev/null +++ b/rtdetr_paddle/ppdet/data/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import source +from . import transform +from . 
import reader + +from .source import * +from .transform import * +from .reader import * diff --git a/rtdetr_paddle/ppdet/data/reader.py b/rtdetr_paddle/ppdet/data/reader.py new file mode 100644 index 0000000..587f3ae --- /dev/null +++ b/rtdetr_paddle/ppdet/data/reader.py @@ -0,0 +1,274 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import traceback +import six +import sys +if sys.version_info >= (3, 0): + pass +else: + pass +import numpy as np +import paddle +import paddle.nn.functional as F + +from copy import deepcopy + +from paddle.io import DataLoader, DistributedBatchSampler +from .utils import default_collate_fn + +from ppdet.core.workspace import register +from . 
MAIN_PID = os.getpid()


class Compose(object):
    """Chain of per-sample transform operators built from config entries.

    Args:
        transforms (list): list of dicts ``{OpName: kwargs}``. Every
            ``OpName`` is looked up as an attribute of the ``transform``
            module and instantiated with ``kwargs``.
        num_classes (int): value written to each operator that exposes a
            ``num_classes`` attribute, default 80.
    """

    # Word inserted into the failure log message; overridden by subclasses.
    _transform_kind = 'sample'

    def __init__(self, transforms, num_classes=80):
        self.transforms = transforms
        self.transforms_cls = []
        for t in self.transforms:
            for k, v in t.items():
                op_cls = getattr(transform, k)
                # An entry written without arguments may carry None instead
                # of an empty dict; treat it as "no keyword arguments".
                f = op_cls(**(v or {}))
                if hasattr(f, 'num_classes'):
                    f.num_classes = num_classes

                self.transforms_cls.append(f)

    def _apply_transforms(self, data):
        """Run every operator in order; log and re-raise the first failure."""
        for f in self.transforms_cls:
            try:
                data = f(data)
            except Exception as e:
                stack_info = traceback.format_exc()
                logger.warning("fail to map {} transform [{}] "
                               "with error: {} and stack:\n{}".format(
                                   self._transform_kind, f, e,
                                   str(stack_info)))
                # bare raise keeps the original traceback untouched
                raise

        return data

    def __call__(self, data):
        """Apply all operators to ``data`` and return the result."""
        return self._apply_transforms(data)


class BatchCompose(Compose):
    """Batch-level transform chain that also assembles the final batch.

    Args:
        transforms (list): same format as for ``Compose``.
        num_classes (int): forwarded to ``Compose``, default 80.
        collate_batch (bool): if True the samples are merged with
            ``default_collate_fn``; otherwise fields are stacked manually
            and ground-truth-like fields stay as plain lists.
    """

    _transform_kind = 'batch'

    def __init__(self, transforms, num_classes=80, collate_batch=True):
        super(BatchCompose, self).__init__(transforms, num_classes)
        self.collate_batch = collate_batch

    def __call__(self, data):
        """Transform a list of sample dicts and merge it into one batch."""
        data = self._apply_transforms(data)

        # remove keys which are not needed by the model
        extra_key = ['h', 'w', 'flipped']
        for sample in data:
            for k in extra_key:
                sample.pop(k, None)

        # batch data; if a user-defined batch function is needed,
        # plug it in here
        if self.collate_batch:
            return default_collate_fn(data)

        batch_data = {}
        for k in data[0].keys():
            tmp_data = [sample[k] for sample in data]
            # Fields whose key contains 'gt_', 'is_crowd' or 'difficult'
            # are kept as per-sample lists instead of being stacked.
            if ('gt_' not in k and 'is_crowd' not in k and
                    'difficult' not in k):
                tmp_data = np.stack(tmp_data, axis=0)
            batch_data[k] = tmp_data
        return batch_data


class BaseDataLoader(object):
    """
    Base DataLoader implementation for detection models

    Args:
        sample_transforms (list): a list of transforms to perform
                                  on each sample
        batch_transforms (list): a list of transforms to perform
                                 on batch
        batch_size (int): batch size for batch collating, default 1.
        shuffle (bool): whether to shuffle samples
        drop_last (bool): whether to drop the last incomplete batch,
                          default False
        num_classes (int): class number of dataset, default 80
        collate_batch (bool): whether to collate batch in dataloader.
            If set to True, the samples will collate into batch according
            to the batch size. Otherwise, the ground-truth will not collate,
            which is used when the number of ground-truth is different in
            samples.
        use_shared_memory (bool): whether to use shared memory to
                accelerate data loading, enable this only if you
                are sure that the shared memory size of your OS
                is larger than the memory cost of the model's input data.
                Note that shared memory will be automatically
                disabled if the shared memory of OS is less than
                1G, which is not enough for detection models.
                Default False.
        **kwargs: stored and later handed to ``dataset.set_kwargs``.
    """

    # NOTE: the list defaults below are never mutated, so sharing them across
    # calls is harmless; they are kept as-is to leave the signature unchanged.
    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 drop_last=False,
                 num_classes=80,
                 collate_batch=True,
                 use_shared_memory=False,
                 **kwargs):
        # sample transform
        self._sample_transforms = Compose(
            sample_transforms, num_classes=num_classes)

        # batch transform
        self._batch_transforms = BatchCompose(batch_transforms, num_classes,
                                              collate_batch)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.use_shared_memory = use_shared_memory
        self.kwargs = kwargs

    def __call__(self,
                 dataset,
                 worker_num,
                 batch_sampler=None,
                 return_list=False):
        """Bind the loader to ``dataset`` and build the paddle DataLoader.

        Args:
            dataset: dataset object; it is prepared in place
                (download check, parsing, transform and kwargs injection).
            worker_num (int): passed to ``DataLoader`` as ``num_workers``.
            batch_sampler: optional sampler; a ``DistributedBatchSampler``
                is created from the constructor settings when None.
            return_list (bool): passed through to ``DataLoader``.

        Returns:
            BaseDataLoader: ``self``, usable as an iterator over batches.
        """
        self.dataset = dataset
        self.dataset.check_or_download_dataset()
        self.dataset.parse_dataset()
        # get data
        self.dataset.set_transform(self._sample_transforms)
        # set kwargs
        self.dataset.set_kwargs(**self.kwargs)
        # batch sampler
        if batch_sampler is None:
            self._batch_sampler = DistributedBatchSampler(
                self.dataset,
                batch_size=self.batch_size,
                shuffle=self.shuffle,
                drop_last=self.drop_last)
        else:
            self._batch_sampler = batch_sampler

        # DataLoader does not start sub-processes on Windows and Mac,
        # so shared memory is not needed there
        use_shared_memory = self.use_shared_memory and \
                            sys.platform not in ['win32', 'darwin']
        # check whether shared memory size is bigger than 1G(1024M)
        if use_shared_memory:
            shm_size = _get_shared_memory_size_in_M()
            if shm_size is not None and shm_size < 1024.:
                logger.warning("Shared memory size is less than 1G, "
                               "disable shared_memory in DataLoader")
                use_shared_memory = False

        self.dataloader = DataLoader(
            dataset=self.dataset,
            batch_sampler=self._batch_sampler,
            collate_fn=self._batch_transforms,
            num_workers=worker_num,
            return_list=return_list,
            use_shared_memory=use_shared_memory)
        self.loader = iter(self.dataloader)

        return self

    def __len__(self):
        return len(self._batch_sampler)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.loader)
        except StopIteration:
            # re-arm the underlying iterator so the next epoch can start,
            # then let the StopIteration end the current one
            self.loader = iter(self.dataloader)
            raise

    def next(self):
        # python2 compatibility
        return self.__next__()


@register
class TrainReader(BaseDataLoader):
    """Reader for training: shuffles and drops the last batch by default."""
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=True,
                 drop_last=True,
                 num_classes=80,
                 collate_batch=True,
                 **kwargs):
        super(TrainReader, self).__init__(sample_transforms, batch_transforms,
                                          batch_size, shuffle, drop_last,
                                          num_classes, collate_batch, **kwargs)


@register
class EvalReader(BaseDataLoader):
    """Reader for evaluation: no shuffling, keeps the last batch by default."""
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 drop_last=False,
                 num_classes=80,
                 **kwargs):
        super(EvalReader, self).__init__(sample_transforms, batch_transforms,
                                         batch_size, shuffle, drop_last,
                                         num_classes, **kwargs)


@register
class TestReader(BaseDataLoader):
    """Reader for inference: no shuffling, keeps the last batch by default."""
    __shared__ = ['num_classes']

    def __init__(self,
                 sample_transforms=[],
                 batch_transforms=[],
                 batch_size=1,
                 shuffle=False,
                 drop_last=False,
                 num_classes=80,
                 **kwargs):
        super(TestReader, self).__init__(sample_transforms, batch_transforms,
                                         batch_size, shuffle, drop_last,
                                         num_classes, **kwargs)
SIZE_UNIT = ['K', 'M', 'G', 'T']
SHM_QUERY_CMD = 'df -h'
SHM_KEY = 'shm'
SHM_DEFAULT_MOUNT = '/dev/shm'

# Index of the column read from each `df -h` row (the fourth
# whitespace-separated field, `Avail` in the usual `df -h` layout).
_SHM_SIZE_COLUMN = 3

# [ shared memory size check ]
# In detection models, image/target data occupies a lot of memory and
# will occupy lots of shared memory in a multi-process DataLoader. The
# following code reads the shared memory size and performs a size check so
# that shared memory use can be disabled if there is not enough of it.
# The shared memory size is obtained as follows:
# 1. use `df -h` to get all mount info
# 2. pick the rows whose mount info contains 'shm'
# 3. if there is exactly one 'shm' row, return its size
# 4. if there are multiple 'shm' rows, prefer the default mount
#    directory '/dev/shm' of Linux-like systems, otherwise return the
#    biggest size.


def _parse_size_in_M(size_str):
    """Convert a human-readable size such as '64M' or '1.5GB' to MiB.

    Args:
        size_str (str): number followed by a unit from ``SIZE_UNIT``,
            optionally decorated with 'B', 'i' or 'iB' ('64MB', '64Mi',
            '64MiB'). A bare number (e.g. '0') is interpreted as bytes.

    Returns:
        float: the size expressed in units of 'M'.

    Raises:
        ValueError: if the string is empty, the unit is unknown or the
            numeric part cannot be parsed.
    """
    text = size_str.strip()
    # drop the optional "B" / "i" / "iB" decoration after the unit letter
    if text.endswith('B'):
        text = text[:-1]
    if text.endswith('i'):
        text = text[:-1]
    if not text:
        raise ValueError("empty shm size string {!r}".format(size_str))

    unit = text[-1]
    if unit.isdigit() or unit == '.':
        # no unit letter at all: plain byte count
        return float(text) / (1024 ** 2)
    if unit not in SIZE_UNIT:
        raise ValueError("unknown shm size unit {}".format(unit))
    # 'K' is index 0, so the exponent relative to 'M' is index - 1
    return float(text[:-1]) * \
        (1024 ** (SIZE_UNIT.index(unit) - 1))


def _get_shared_memory_size_in_M():
    """Query `df -h` for the shared-memory mount and return its size in MiB.

    Returns:
        float or None: size of the single 'shm' row, else of the
        '/dev/shm' row, else the largest 'shm' row. None when the command
        cannot be run, no usable 'shm' row exists or the size column cannot
        be parsed; callers should treat None as "unknown".
    """
    try:
        # the context manager closes the pipe (and reaps the child process)
        with os.popen(SHM_QUERY_CMD) as pipe:
            df_infos = pipe.readlines()
    except Exception:
        return None

    shm_infos = []
    for df_info in df_infos:
        info = df_info.strip()
        if SHM_KEY in info:
            fields = info.split()
            # skip wrapped/short rows that lack the size column
            if len(fields) > _SHM_SIZE_COLUMN:
                shm_infos.append(fields)

    if not shm_infos:
        return None

    try:
        if len(shm_infos) == 1:
            return _parse_size_in_M(shm_infos[0][_SHM_SIZE_COLUMN])

        default_mount_infos = [
            si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT
        ]
        if default_mount_infos:
            return _parse_size_in_M(default_mount_infos[0][_SHM_SIZE_COLUMN])
        return max(_parse_size_in_M(si[_SHM_SIZE_COLUMN]) for si in shm_infos)
    except ValueError:
        # An unparsable size means "unknown", not a fatal error: this check
        # is only a best-effort safeguard for the DataLoader.
        return None
__all__ = ['get_categories']

# NOTE: the functions below rely on the module-level `os`, `logger` and
# `pascalvoc_label` names provided by this module's imports.


def _load_label_list(anno_file):
    """Build identity category maps from a text file with one name per line.

    A leading 'background' entry is dropped. An empty file yields two empty
    dicts.

    Returns:
        tuple: (clsid2catid, catid2name), both keyed by 0-based index.
    """
    with open(anno_file) as f:
        cats = [line.strip() for line in f]

    # check for emptiness before peeking at the first entry
    if cats and cats[0] == 'background':
        cats = cats[1:]

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}
    return clsid2catid, catid2name


def get_categories(metric_type, anno_file=None, arch=None):
    """
    Get class id to category id map and category id
    to category name map from annotation file.

    Args:
        metric_type (str): metric type, matched case-insensitively.
            Handled values: 'coco', 'rbox', 'snipercoco', 'voc', 'oid',
            'keypointtopdowncocoeval', 'keypointtopdownmpiieval',
            'pose3deval', 'mot', 'motdet', 'reid', 'kitti', 'bdd100kmot'
            and 'mcmot'.
        anno_file (str): annotation file path. When it is missing, the
            built-in default categories of the metric type are used.
        arch (str): if 'keypoint_arch', the keypoint placeholder maps are
            returned regardless of the other arguments.

    Returns:
        tuple: (clsid2catid, catid2name). For keypoint / pose3d metrics the
        first element is None and the second a single-entry dict.

    Raises:
        ValueError: for an unknown metric type, or for a COCO-style metric
            whose annotation file is neither json nor txt.
    """
    if arch == 'keypoint_arch':
        return (None, {'id': 'keypoint'})

    has_anno = bool(anno_file) and os.path.isfile(anno_file)
    if not has_anno:
        logger.warning(
            "anno_file '{}' is None or not set or not exist, "
            "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, "
            "otherwise the default categories will be used by metric_type.".
            format(anno_file))

    metric = metric_type.lower()

    if metric in ('coco', 'rbox', 'snipercoco'):
        if has_anno:
            if anno_file.endswith('json'):
                # lazy import pycocotools here
                from pycocotools.coco import COCO
                coco = COCO(anno_file)
                cats = coco.loadCats(coco.getCatIds())

                clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
                catid2name = {cat['id']: cat['name'] for cat in cats}
                return clsid2catid, catid2name

            if anno_file.endswith('txt'):
                return _load_label_list(anno_file)

            raise ValueError("anno_file {} should be json or txt.".format(
                anno_file))

        # anno file not exist, load default categories
        if metric == 'rbox':
            logger.warning(
                "metric_type: {}, load default categories of DOTA.".format(
                    metric_type))
            return _dota_category()
        logger.warning("metric_type: {}, load default categories of COCO.".
                       format(metric_type))
        return _coco17_category()

    if metric == 'voc':
        if has_anno:
            return _load_label_list(anno_file)
        # anno file not exist, load default categories of
        # VOC all 20 categories
        logger.warning("metric_type: {}, load default categories of VOC.".
                       format(metric_type))
        return _vocall_category()

    if metric == 'oid':
        if has_anno:
            logger.warning("only default categories support for OID19")
        return _oid19_category()

    if metric in ('keypointtopdowncocoeval', 'keypointtopdownmpiieval'):
        return (None, {'id': 'keypoint'})

    if metric == 'pose3deval':
        return (None, {'id': 'pose3d'})

    if metric in ('mot', 'motdet', 'reid'):
        if has_anno:
            return _load_label_list(anno_file)
        # anno file not exist, load default category 'pedestrian'.
        logger.warning(
            "metric_type: {}, load default categories of pedestrian MOT.".
            format(metric_type))
        return _mot_category(category='pedestrian')

    if metric in ('kitti', 'bdd100kmot'):
        return _mot_category(category='vehicle')

    if metric == 'mcmot':
        if has_anno:
            return _load_label_list(anno_file)
        # anno file not exist, load default categories of visdrone
        # all 10 categories
        logger.warning(
            "metric_type: {}, load default categories of VisDrone.".format(
                metric_type))
        return _visdrone_category()

    raise ValueError("unknown metric type {}".format(metric_type))


def _mot_category(category='pedestrian'):
    """
    Get class id to category id map and category id
    to category name map of mot dataset (a single category with id 0)
    """
    cats = [category]

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}

    return clsid2catid, catid2name


# (category id, category name) pairs of COCO2017, in class-id order.
_COCO17_CATEGORIES = (
    (1, 'person'), (2, 'bicycle'), (3, 'car'), (4, 'motorcycle'),
    (5, 'airplane'), (6, 'bus'), (7, 'train'), (8, 'truck'),
    (9, 'boat'), (10, 'traffic light'), (11, 'fire hydrant'),
    (13, 'stop sign'), (14, 'parking meter'), (15, 'bench'),
    (16, 'bird'), (17, 'cat'), (18, 'dog'), (19, 'horse'),
    (20, 'sheep'), (21, 'cow'), (22, 'elephant'), (23, 'bear'),
    (24, 'zebra'), (25, 'giraffe'), (27, 'backpack'), (28, 'umbrella'),
    (31, 'handbag'), (32, 'tie'), (33, 'suitcase'), (34, 'frisbee'),
    (35, 'skis'), (36, 'snowboard'), (37, 'sports ball'), (38, 'kite'),
    (39, 'baseball bat'), (40, 'baseball glove'), (41, 'skateboard'),
    (42, 'surfboard'), (43, 'tennis racket'), (44, 'bottle'),
    (46, 'wine glass'), (47, 'cup'), (48, 'fork'), (49, 'knife'),
    (50, 'spoon'), (51, 'bowl'), (52, 'banana'), (53, 'apple'),
    (54, 'sandwich'), (55, 'orange'), (56, 'broccoli'), (57, 'carrot'),
    (58, 'hot dog'), (59, 'pizza'), (60, 'donut'), (61, 'cake'),
    (62, 'chair'), (63, 'couch'), (64, 'potted plant'), (65, 'bed'),
    (67, 'dining table'), (70, 'toilet'), (72, 'tv'), (73, 'laptop'),
    (74, 'mouse'), (75, 'remote'), (76, 'keyboard'), (77, 'cell phone'),
    (78, 'microwave'), (79, 'oven'), (80, 'toaster'), (81, 'sink'),
    (82, 'refrigerator'), (84, 'book'), (85, 'clock'), (86, 'vase'),
    (87, 'scissors'), (88, 'teddy bear'), (89, 'hair drier'),
    (90, 'toothbrush'),
)


def _coco17_category():
    """
    Get class id to category id map and category id
    to category name map of COCO2017 dataset

    Class ids are 0-based and contiguous; category ids keep the sparse
    COCO numbering and do not include a background entry.
    """
    clsid2catid = {
        clsid: catid for clsid, (catid, _) in enumerate(_COCO17_CATEGORIES)
    }
    catid2name = dict(_COCO17_CATEGORIES)

    return clsid2catid, catid2name


# DOTA category names; category ids are 1-based positions in this tuple.
_DOTA_NAMES = (
    'plane', 'baseball-diamond', 'bridge', 'ground-track-field',
    'small-vehicle', 'large-vehicle', 'ship', 'tennis-court',
    'basketball-court', 'storage-tank', 'soccer-ball-field', 'roundabout',
    'harbor', 'swimming-pool', 'helicopter',
)


def _dota_category():
    """
    Get class id to category id map and category id
    to category name map of dota dataset
    """
    catid2name = {i + 1: name for i, name in enumerate(_DOTA_NAMES)}
    clsid2catid = {i: i + 1 for i in range(len(catid2name))}
    return clsid2catid, catid2name


def _vocall_category():
    """
    Get class id to category id map and category id
    to category name map of mixup voc dataset

    Names are taken from ``pascalvoc_label()`` ordered by their label value.
    """
    label_map = pascalvoc_label()
    label_map = sorted(label_map.items(), key=lambda x: x[1])
    cats = [l[0] for l in label_map]

    clsid2catid = {i: i for i in range(len(cats))}
    catid2name = {i: name for i, name in enumerate(cats)}

    return clsid2catid, catid2name


# OID19 category names indexed by category id (id 0 is the background).
_OID19_NAMES = (
    # 0-9
    "background", "Infant bed", "Rose", "Flag", "Flashlight",
    "Sea turtle", "Camera", "Animal", "Glove", "Crocodile",
    # 10-19
    "Cattle", "House", "Guacamole", "Penguin", "Vehicle registration plate",
    "Bench", "Ladybug", "Human nose", "Watermelon", "Flute",
    # 20-29
    "Butterfly", "Washing machine", "Raccoon", "Segway", "Taco",
    "Jellyfish", "Cake", "Pen", "Cannon", "Bread",
    # 30-39
    "Tree", "Shellfish", "Bed", "Hamster", "Hat",
    "Toaster", "Sombrero", "Tiara", "Bowl", "Dragonfly",
    # 40-49
    "Moths and butterflies", "Antelope", "Vegetable", "Torch", "Building",
    "Power plugs and sockets", "Blender", "Billiard table",
    "Cutting board", "Bronze sculpture",
    # 50-59
    "Turtle", "Broccoli", "Tiger", "Mirror", "Bear",
    "Zucchini", "Dress", "Volleyball", "Guitar", "Reptile",
    # 60-69
    "Golf cart", "Tart", "Fedora", "Carnivore", "Car",
    "Lighthouse", "Coffeemaker", "Food processor", "Truck", "Bookcase",
    # 70-79
    "Surfboard", "Footwear", "Bench", "Necklace", "Flower",
    "Radish", "Marine mammal", "Frying pan", "Tap", "Peach",
    # 80-89
    "Knife", "Handbag", "Laptop", "Tent", "Ambulance",
    "Christmas tree", "Eagle", "Limousine",
    "Kitchen & dining room table", "Polar bear",
    # 90-99
    "Tower", "Football", "Willow", "Human head", "Stop sign",
    "Banana", "Mixer", "Binoculars", "Dessert", "Bee",
    # 100-109
    "Chair", "Wood-burning stove", "Flowerpot", "Beaker", "Oyster",
    "Woodpecker", "Harp", "Bathtub", "Wall clock", "Sports uniform",
    # 110-119
    "Rhinoceros", "Beehive", "Cupboard", "Chicken", "Man",
    "Blue jay", "Cucumber", "Balloon", "Kite", "Fireplace",
    # 120-129
    "Lantern", "Missile", "Book", "Spoon", "Grapefruit",
    "Squirrel", "Orange", "Coat", "Punching bag", "Zebra",
    # 130-139
    "Billboard", "Bicycle", "Door handle", "Mechanical fan", "Ring binder",
    "Table", "Parrot", "Sock", "Vase", "Weapon",
    # 140-149
    "Shotgun", "Glasses", "Seahorse", "Belt", "Watercraft",
    "Window", "Giraffe", "Lion", "Tire", "Vehicle",
    # 150-159
    "Canoe", "Tie", "Shelf", "Picture frame", "Printer",
    "Human leg", "Boat", "Slow cooker", "Croissant", "Candle",
    # 160-169
    "Pancake", "Pillow", "Coin", "Stretcher", "Sandal",
    "Woman", "Stairs", "Harpsichord", "Stool", "Bus",
    # 170-179
    "Suitcase", "Human mouth", "Juice", "Skull", "Door",
    "Violin", "Chopsticks", "Digital clock", "Sunflower", "Leopard",
    # 180-189
    "Bell pepper", "Harbor seal", "Snake", "Sewing machine", "Goose",
    "Helicopter", "Seat belt", "Coffee cup", "Microwave oven", "Hot dog",
    # 190-199
    "Countertop", "Serving tray", "Dog bed", "Beer", "Sunglasses",
    "Golf ball", "Waffle", "Palm tree", "Trumpet", "Ruler",
    # 200-209
    "Helmet", "Ladder", "Office building", "Tablet computer", "Toilet paper",
    "Pomegranate", "Skirt", "Gas stove", "Cookie", "Cart",
    # 210-219
    "Raven", "Egg", "Burrito", "Goat", "Kitchen knife",
    "Skateboard", "Salt and pepper shakers", "Lynx", "Boot", "Platter",
    # 220-229
    "Ski", "Swimwear", "Swimming pool", "Drinking straw", "Wrench",
    "Drum", "Ant", "Human ear", "Headphones", "Fountain",
    # 230-239
    "Bird", "Jeans", "Television", "Crab", "Microphone",
    "Home appliance", "Snowplow", "Beetle", "Artichoke", "Jet ski",
    # 240-249
    "Stationary bicycle", "Human hair", "Brown bear", "Starfish", "Fork",
    "Lobster", "Corded phone", "Drink", "Saucer", "Carrot",
    # 250-259
    "Insect", "Clock", "Castle", "Tennis racket", "Ceiling fan",
    "Asparagus", "Jaguar", "Musical instrument", "Train", "Cat",
    # 260-269
    "Rifle", "Dumbbell", "Mobile phone", "Taxi", "Shower",
    "Pitcher", "Lemon", "Invertebrate", "Turkey", "High heels",
    # 270-279
    "Bust", "Elephant", "Scarf", "Barrel", "Trombone",
    "Pumpkin", "Box", "Tomato", "Frog", "Bidet",
    # 280-289
    "Human face", "Houseplant", "Van", "Shark", "Ice cream",
    "Swim cap", "Falcon", "Ostrich", "Handgun", "Whiteboard",
    # 290-299
    "Lizard", "Pasta", "Snowmobile", "Light bulb", "Window blind",
    "Muffin", "Pretzel", "Computer monitor", "Horn", "Furniture",
    # 300-309
    "Sandwich", "Fox", "Convenience store", "Fish", "Fruit",
    "Earrings", "Curtain", "Grape", "Sofa bed", "Horse",
    # 310-319
    "Luggage and bags", "Desk", "Crutch", "Bicycle helmet", "Tick",
    "Airplane", "Canary", "Spatula", "Watch", "Lily",
    # 320-329
    "Kitchen appliance", "Filing cabinet", "Aircraft", "Cake stand", "Candy",
    "Sink", "Mouse", "Wine", "Wheelchair", "Goldfish",
    # 330-339
    "Refrigerator", "French fries", "Drawer", "Treadmill", "Picnic basket",
    "Dice", "Cabbage", "Football helmet", "Pig", "Person",
    # 340-349
    "Shorts", "Gondola", "Honeycomb", "Doughnut", "Chest of drawers",
    "Land vehicle", "Bat", "Monkey", "Dagger", "Tableware",
    # 350-359
    "Human foot", "Mug", "Alarm clock", "Pressure cooker", "Human hand",
    "Tortoise", "Baseball glove", "Sword", "Pear", "Miniskirt",
    # 360-369
    "Traffic sign", "Girl", "Roller skates", "Dinosaur", "Porch",
    "Human beard", "Submarine sandwich", "Screwdriver", "Strawberry",
    "Wine glass",
    # 370-379
    "Seafood", "Racket", "Wheel", "Sea lion", "Toy",
    "Tea", "Tennis ball", "Waste container", "Mule", "Cricket ball",
    # 380-389
    "Pineapple", "Coconut", "Doll", "Coffee table", "Snowman",
    "Lavender", "Shrimp", "Maple", "Cowboy hat", "Goggles",
    # 390-399
    "Rugby ball", "Caterpillar", "Poster", "Rocket", "Organ",
    "Saxophone", "Traffic light", "Cocktail", "Plastic bag", "Squash",
    # 400-409
    "Mushroom", "Hamburger", "Light switch", "Parachute", "Teddy bear",
    "Winter melon", "Deer", "Musical keyboard", "Plumbing fixture",
    "Scoreboard",
    # 410-419
    "Baseball bat", "Envelope", "Adhesive tape", "Briefcase", "Paddle",
    "Bow and arrow", "Telephone", "Sheep", "Jacket", "Boy",
    # 420-429
    "Pizza", "Otter", "Office supplies", "Couch", "Cello",
    "Bull", "Camel", "Ball", "Duck", "Whale",
    # 430-439
    "Shirt", "Tank", "Motorcycle", "Accordion", "Owl",
    "Porcupine", "Sun hat", "Nail", "Scissors", "Swan",
    # 440-449
    "Lamp", "Crown", "Piano", "Sculpture", "Cheetah",
    "Oboe", "Tin can", "Mango", "Tripod", "Oven",
    # 450-459
    "Mouse", "Barge", "Coffee", "Snowboard", "Common fig",
    "Salad", "Marine invertebrates", "Umbrella", "Kangaroo", "Human arm",
    # 460-469
    "Measuring cup", "Snail", "Loveseat", "Suit", "Teapot",
    "Bottle", "Alpaca", "Kettle", "Trousers", "Popcorn",
    # 470-479
    "Centipede", "Spider", "Sparrow", "Plate", "Bagel",
    "Personal care", "Apple", "Brassiere", "Bathroom cabinet",
    "studio couch",
    # 480-489
    "Computer keyboard", "Table tennis racket", "Sushi", "Cabinetry",
    "Street light",
    "Towel", "Nightstand", "Rabbit", "Dolphin", "Dog",
    # 490-499
    "Jug", "Wok", "Fire hydrant", "Human eye", "Skyscraper",
    "Backpack", "Potato", "Paper towel", "Lifejacket", "Bicycle wheel",
    # 500
    "Toilet",
)


def _oid19_category():
    """
    Get class id to category id map and category id
    to category name map of the OID19 label set

    Class id k maps to category id k + 1 for the 500 object classes; the
    name map additionally keeps id 0 as 'background'.
    """
    clsid2catid = {k: k + 1 for k in range(500)}
    catid2name = dict(enumerate(_OID19_NAMES))

    return clsid2catid, catid2name


def _visdrone_category():
    """
    Get class id to category id map and category id
    to category name map of the 10 VisDrone categories
    """
    names = ('pedestrian', 'people', 'bicycle', 'car', 'van', 'truck',
             'tricycle', 'awning-tricycle', 'bus', 'motor')

    clsid2catid = {i: i for i in range(10)}
    catid2name = dict(enumerate(names))
    return clsid2catid, catid2name
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence +import numpy as np +from ppdet.core.workspace import register, serializable +from .dataset import DetDataset + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet'] + + +@register +@serializable +class COCODataSet(DetDataset): + """ + Load dataset with COCO format. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): coco annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + load_crowd (bool): whether to load crowded ground-truth. + False as default + allow_empty (bool): whether to load empty entry. False as default + empty_ratio (float): the ratio of empty record number to total + record's, if empty_ratio is out of [0. ,1.), do not sample the + records and use all the empty entries. 1. as default + repeat (int): repeat times for dataset, use in benchmark. 
def _sample_empty(self, records, num):
    """
    Randomly keep a subset of the empty records.

    Args:
        records (list): empty records to draw from.
        num (int): number of non-empty records the ratio is applied to.

    Returns:
        list: `records` itself when `self.empty_ratio` is outside
            [0., 1.), otherwise a random sample of it.
    """
    ratio = self.empty_ratio
    # if empty_ratio is out of [0. ,1.), do not sample the records
    if ratio < 0. or ratio >= 1.:
        return records

    import random
    # empty / non-empty == ratio / (1 - ratio), capped by what is available
    wanted = int(num * ratio / (1 - ratio))
    keep = min(wanted, len(records))
    return random.sample(records, keep)
be ' + 'ignored'.format(im_path)) + continue + + if im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + coco_rec = { + 'im_file': im_path, + 'im_id': np.array([img_id]), + 'h': im_h, + 'w': im_w, + } if 'image' in self.data_fields else {} + + if not self.load_image_only: + ins_anno_ids = coco.getAnnIds( + imgIds=[img_id], iscrowd=None if self.load_crowd else False) + instances = coco.loadAnns(ins_anno_ids) + + bboxes = [] + is_rbox_anno = False + for inst in instances: + # check gt bbox + if inst.get('ignore', False): + continue + if 'bbox' not in inst.keys(): + continue + else: + if not any(np.array(inst['bbox'])): + continue + + x1, y1, box_w, box_h = inst['bbox'] + x2 = x1 + box_w + y2 = y1 + box_h + eps = 1e-5 + if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: + inst['clean_bbox'] = [ + round(float(x), 3) for x in [x1, y1, x2, y2] + ] + bboxes.append(inst) + else: + logger.warning( + 'Found an invalid bbox in annotations: im_id: {}, ' + 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( + img_id, float(inst['area']), x1, y1, x2, y2)) + + num_bbox = len(bboxes) + if num_bbox <= 0 and not self.allow_empty: + continue + elif num_bbox <= 0: + is_empty = True + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + gt_poly = [None] * num_bbox + gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) + + has_segmentation = False + has_track_id = False + for i, box in enumerate(bboxes): + catid = box['category_id'] + gt_class[i][0] = self.catid2clsid[catid] + gt_bbox[i, :] = box['clean_bbox'] + is_crowd[i][0] = box['iscrowd'] + # check RLE format + if 'segmentation' in box and box['iscrowd'] == 1: + gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] + elif 'segmentation' in box and box['segmentation']: + if not np.array( + 
box['segmentation'], + dtype=object).size > 0 and not self.allow_empty: + bboxes.pop(i) + gt_poly.pop(i) + np.delete(is_crowd, i) + np.delete(gt_class, i) + np.delete(gt_bbox, i) + else: + gt_poly[i] = box['segmentation'] + has_segmentation = True + + if 'track_id' in box: + gt_track_id[i][0] = box['track_id'] + has_track_id = True + + if has_segmentation and not any( + gt_poly) and not self.allow_empty: + continue + + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_poly': gt_poly, + } + if has_track_id: + gt_rec.update({'gt_track_id': gt_track_id}) + + for k, v in gt_rec.items(): + if k in self.data_fields: + coco_rec[k] = v + + # TODO: remove load_semantic + if self.load_semantic and 'semantic' in self.data_fields: + seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', + 'train2017', im_fname[:-3] + 'png') + coco_rec.update({'semantic': seg_path}) + + logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( + im_path, img_id, im_h, im_w)) + if is_empty: + empty_records.append(coco_rec) + else: + records.append(coco_rec) + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert ct > 0, 'not found any coco record in %s' % (anno_path) + logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
+ format(ct, len(img_ids) - ct, anno_path)) + if self.allow_empty and len(empty_records) > 0: + empty_records = self._sample_empty(empty_records, len(records)) + records += empty_records + self.roidbs = records + + +@register +@serializable +class SlicedCOCODataSet(COCODataSet): + """Sliced COCODataSet""" + + def __init__( + self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + load_crowd=False, + allow_empty=False, + empty_ratio=1., + repeat=1, + sliced_size=[640, 640], + overlap_ratio=[0.25, 0.25], ): + super(SlicedCOCODataSet, self).__init__( + dataset_dir=dataset_dir, + image_dir=image_dir, + anno_path=anno_path, + data_fields=data_fields, + sample_num=sample_num, + load_crowd=load_crowd, + allow_empty=allow_empty, + empty_ratio=empty_ratio, + repeat=repeat, ) + self.sliced_size = sliced_size + self.overlap_ratio = overlap_ratio + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + assert anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + from pycocotools.coco import COCO + coco = COCO(anno_path) + img_ids = coco.getImgIds() + img_ids.sort() + cat_ids = coco.getCatIds() + records = [] + empty_records = [] + ct = 0 + ct_sub = 0 + + self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) + self.cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in self.catid2clsid.items() + }) + + if 'annotations' not in coco.dataset: + self.load_image_only = True + logger.warning('Annotation file: {} does not contains ground truth ' + 'and load image information only.'.format(anno_path)) + try: + import sahi + from sahi.slicing import slice_image + except Exception as e: + logger.error( + 'sahi not found, plaese install sahi. ' + 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
+ ) + raise e + + sub_img_ids = 0 + for img_id in img_ids: + img_anno = coco.loadImgs([img_id])[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + im_path = os.path.join(image_dir, + im_fname) if image_dir else im_fname + is_empty = False + if not os.path.exists(im_path): + logger.warning('Illegal image file: {}, and it will be ' + 'ignored'.format(im_path)) + continue + + if im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + slice_image_result = sahi.slicing.slice_image( + image=im_path, + slice_height=self.sliced_size[0], + slice_width=self.sliced_size[1], + overlap_height_ratio=self.overlap_ratio[0], + overlap_width_ratio=self.overlap_ratio[1]) + + sub_img_num = len(slice_image_result) + for _ind in range(sub_img_num): + im = slice_image_result.images[_ind] + coco_rec = { + 'image': im, + 'im_id': np.array([sub_img_ids + _ind]), + 'h': im.shape[0], + 'w': im.shape[1], + 'ori_im_id': np.array([img_id]), + 'st_pix': np.array( + slice_image_result.starting_pixels[_ind], + dtype=np.float32), + 'is_last': 1 if _ind == sub_img_num - 1 else 0, + } if 'image' in self.data_fields else {} + records.append(coco_rec) + ct_sub += sub_img_num + ct += 1 + if self.sample_num > 0 and ct >= self.sample_num: + break + assert ct > 0, 'not found any coco record in %s' % (anno_path) + logger.info('{} samples and slice to {} sub_samples in file {}'.format( + ct, ct_sub, anno_path)) + if self.allow_empty and len(empty_records) > 0: + empty_records = self._sample_empty(empty_records, len(records)) + records += empty_records + self.roidbs = records + + +@register +@serializable +class SemiCOCODataSet(COCODataSet): + """Semi-COCODataSet used for supervised and unsupervised dataSet""" + + def __init__(self, + dataset_dir=None, + image_dir=None, + anno_path=None, + data_fields=['image'], + sample_num=-1, + 
load_crowd=False, + allow_empty=False, + empty_ratio=1., + repeat=1, + supervised=True): + super(SemiCOCODataSet, self).__init__( + dataset_dir, image_dir, anno_path, data_fields, sample_num, + load_crowd, allow_empty, empty_ratio, repeat) + self.supervised = supervised + self.length = -1 # defalut -1 means all + + def parse_dataset(self): + anno_path = os.path.join(self.dataset_dir, self.anno_path) + image_dir = os.path.join(self.dataset_dir, self.image_dir) + + assert anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + from pycocotools.coco import COCO + coco = COCO(anno_path) + img_ids = coco.getImgIds() + img_ids.sort() + cat_ids = coco.getCatIds() + records = [] + empty_records = [] + ct = 0 + + self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) + self.cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in self.catid2clsid.items() + }) + + if 'annotations' not in coco.dataset or self.supervised == False: + self.load_image_only = True + logger.warning('Annotation file: {} does not contains ground truth ' + 'and load image information only.'.format(anno_path)) + + for img_id in img_ids: + img_anno = coco.loadImgs([img_id])[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + im_path = os.path.join(image_dir, + im_fname) if image_dir else im_fname + is_empty = False + if not os.path.exists(im_path): + logger.warning('Illegal image file: {}, and it will be ' + 'ignored'.format(im_path)) + continue + + if im_w < 0 or im_h < 0: + logger.warning('Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format( + im_w, im_h, img_id)) + continue + + coco_rec = { + 'im_file': im_path, + 'im_id': np.array([img_id]), + 'h': im_h, + 'w': im_w, + } if 'image' in self.data_fields else {} + + if not self.load_image_only: + ins_anno_ids = coco.getAnnIds( + imgIds=[img_id], iscrowd=None if self.load_crowd else False) + 
instances = coco.loadAnns(ins_anno_ids) + + bboxes = [] + is_rbox_anno = False + for inst in instances: + # check gt bbox + if inst.get('ignore', False): + continue + if 'bbox' not in inst.keys(): + continue + else: + if not any(np.array(inst['bbox'])): + continue + + x1, y1, box_w, box_h = inst['bbox'] + x2 = x1 + box_w + y2 = y1 + box_h + eps = 1e-5 + if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: + inst['clean_bbox'] = [ + round(float(x), 3) for x in [x1, y1, x2, y2] + ] + bboxes.append(inst) + else: + logger.warning( + 'Found an invalid bbox in annotations: im_id: {}, ' + 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( + img_id, float(inst['area']), x1, y1, x2, y2)) + + num_bbox = len(bboxes) + if num_bbox <= 0 and not self.allow_empty: + continue + elif num_bbox <= 0: + is_empty = True + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + gt_poly = [None] * num_bbox + + has_segmentation = False + for i, box in enumerate(bboxes): + catid = box['category_id'] + gt_class[i][0] = self.catid2clsid[catid] + gt_bbox[i, :] = box['clean_bbox'] + is_crowd[i][0] = box['iscrowd'] + # check RLE format + if 'segmentation' in box and box['iscrowd'] == 1: + gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] + elif 'segmentation' in box and box['segmentation']: + if not np.array(box['segmentation'] + ).size > 0 and not self.allow_empty: + bboxes.pop(i) + gt_poly.pop(i) + np.delete(is_crowd, i) + np.delete(gt_class, i) + np.delete(gt_bbox, i) + else: + gt_poly[i] = box['segmentation'] + has_segmentation = True + + if has_segmentation and not any( + gt_poly) and not self.allow_empty: + continue + + gt_rec = { + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_poly': gt_poly, + } + + for k, v in gt_rec.items(): + if k in self.data_fields: + coco_rec[k] = v + + # TODO: remove load_semantic + if self.load_semantic and 'semantic' in 
def __getitem__(self, idx):
    """
    Build the sample for position `idx` and run it through the transform.

    While a mixup or cutmix epoch window is active the record is paired
    with one randomly drawn record; while a mosaic window is active it is
    grouped with four randomly drawn records. Every record in the result
    is tagged with the current iteration counter.
    """
    total = len(self.roidbs)
    if self.repeat > 1:
        idx %= total

    def _active(start_epoch):
        # 0 means "always on", otherwise on until that epoch is reached
        return start_epoch == 0 or self._epoch < start_epoch

    def _random_record():
        return copy.deepcopy(self.roidbs[np.random.randint(total)])

    # data batch
    sample = copy.deepcopy(self.roidbs[idx])
    if _active(self.mixup_epoch) or _active(self.cutmix_epoch):
        sample = [sample, _random_record()]
    elif _active(self.mosaic_epoch):
        extras = []
        for _ in range(4):
            extras.append(_random_record())
        sample = [sample] + extras

    tagged = sample if isinstance(sample, Sequence) else [sample]
    for rec in tagged:
        rec['curr_iter'] = self._curr_iter
    self._curr_iter += 1

    return self.transform(sample)
a/rtdetr_paddle/ppdet/data/source/dataset.py b/rtdetr_paddle/ppdet/data/source/dataset.py new file mode 100644 index 0000000..4f22b22 --- /dev/null +++ b/rtdetr_paddle/ppdet/data/source/dataset.py @@ -0,0 +1,307 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import numpy as np +try: + from collections.abc import Sequence +except Exception: + from collections import Sequence +from paddle.io import Dataset +from ppdet.core.workspace import register, serializable +from ppdet.utils.download import get_dataset_path +from ppdet.data import source + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +@serializable +class DetDataset(Dataset): + """ + Load detection dataset. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): annotation file path. + data_fields (list): key name of data dictionary, at least have 'image'. + sample_num (int): number of samples to load, -1 means all. + use_default_label (bool): whether to load default label list. + repeat (int): repeat times for dataset, use in benchmark. 
def __getitem__(self, idx):
    """
    Build the sample for position `idx` and run it through the transform.

    Depending on which epoch window is active (checked in this order:
    mixup, cutmix, mosaic, pre_img) the record is returned together with
    extra records:
        * mixup / cutmix: one randomly drawn record,
        * mosaic: four randomly drawn records,
        * pre_img: the neighbouring record (previous one, or the next one
          for the first index).
    Every record in the result gets a 'curr_iter' entry holding the
    current iteration counter, which is then incremented.

    Args:
        idx (int): sample index; wrapped by the number of records when
            `repeat > 1`.

    Returns:
        Whatever `self.transform` returns for the record (or list of
        records).
    """
    n = len(self.roidbs)
    if self.repeat > 1:
        idx %= n
    # data batch
    roidb = copy.deepcopy(self.roidbs[idx])
    if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
        idx = np.random.randint(n)
        roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
    elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
        idx = np.random.randint(n)
        roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
    elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
        roidb = [roidb, ] + [
            copy.deepcopy(self.roidbs[np.random.randint(n)])
            for _ in range(4)
        ]
    elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:
        # Add previous image as input, only used in CenterTrack
        idx_pre_img = idx - 1
        if idx_pre_img < 0:
            # The first record has no predecessor: use the next record,
            # or the record itself when it is the only one (idx + 1
            # would be out of range).
            idx_pre_img = idx + 1 if idx + 1 < n else idx
        roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]
    if isinstance(roidb, Sequence):
        for r in roidb:
            r['curr_iter'] = self._curr_iter
    else:
        roidb['curr_iter'] = self._curr_iter
    self._curr_iter += 1

    return self.transform(roidb)
def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):
    """Return True when path `f` ends with one of `extensions` (case-insensitive)."""
    return f.lower().endswith(extensions)


def _make_dataset(dir):
    """
    Recursively collect the image files below a directory.

    Args:
        dir (str): directory to scan; `~` is expanded. Symbolic links to
            directories are followed.

    Returns:
        list[str]: paths of all files accepted by `_is_valid_file`,
            ordered by directory and then by file name.

    Raises:
        ValueError: if `dir` is not an existing directory.
    """
    dir = os.path.expanduser(dir)
    if not os.path.isdir(dir):
        # A bare string cannot be raised in Python 3; raise a real
        # exception so the message actually reaches the caller.
        raise ValueError('{} should be a dir'.format(dir))
    images = []
    for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
        for fname in sorted(fnames):
            path = os.path.join(root, fname)
            if _is_valid_file(path):
                images.append(path)
    return images
def _load_images(self):
    """
    Build one record per image returned by `self._parse()`.

    Image ids are assigned consecutively from 0 in the order the images
    are returned. The id -> path mapping exposed through
    `self._imid2path` is rebuilt on every call, so entries from an
    earlier, longer image list do not linger.

    Returns:
        list[dict]: records with keys 'im_id' (1-element array) and
            'im_file' (path); at most `self.sample_num` of them when
            `self.sample_num > 0`.

    Raises:
        AssertionError: if an image path is empty or not a file, or if no
            record was produced.
    """
    images = self._parse()
    imid2path = {}
    records = []
    for ct, image in enumerate(images):
        assert image != '' and os.path.isfile(image), \
                "Image {} not found".format(image)
        if self.sample_num > 0 and ct >= self.sample_num:
            break
        records.append({'im_id': np.array([ct]), 'im_file': image})
        imid2path[ct] = image
    assert len(records) > 0, "No image file found"
    # Replace (not update) the mapping so it only describes this load.
    self._imid2path = imid2path
    return records
@register
class CommonDataset(object):
    """
    Thin wrapper that instantiates a dataset class looked up by name.

    The keyword arguments must contain "name", the attribute of the
    `source` module to instantiate; all remaining keyword arguments are
    forwarded to that class. Calling the wrapper returns the dataset.
    """

    def __init__(self, **dataset_args):
        super(CommonDataset, self).__init__()
        # Work on a copy so the caller's config is left untouched.
        cfg = copy.deepcopy(dataset_args)
        dataset_cls = getattr(source, cfg.pop("name"))
        self.dataset = dataset_cls(**cfg)

    def __call__(self):
        return self.dataset
def _sample_empty(self, records, num):
    """
    Down-sample the empty records according to `self.empty_ratio`.

    Args:
        records (list): empty records to draw from.
        num (int): number of non-empty records the ratio refers to.

    Returns:
        list: the untouched `records` when the ratio lies outside
            [0., 1.), otherwise a random subset of it.
    """
    # if empty_ratio is out of [0. ,1.), do not sample the records
    sampling_enabled = not (self.empty_ratio < 0. or self.empty_ratio >= 1.)
    if sampling_enabled:
        import random
        quota = int(num * self.empty_ratio / (1 - self.empty_ratio))
        records = random.sample(records, min(quota, len(records)))
    return records
+ records = [] + empty_records = [] + ct = 0 + cname2cid = {} + if self.label_list: + label_path = os.path.join(self.dataset_dir, self.label_list) + if not os.path.exists(label_path): + raise ValueError("label_list {} does not exists".format( + label_path)) + with open(label_path, 'r') as fr: + label_id = 0 + for line in fr.readlines(): + cname2cid[line.strip()] = label_id + label_id += 1 + else: + cname2cid = pascalvoc_label() + + with open(anno_path, 'r') as fr: + while True: + line = fr.readline() + if not line: + break + img_file, xml_file = [os.path.join(image_dir, x) \ + for x in line.strip().split()[:2]] + if not os.path.exists(img_file): + logger.warning( + 'Illegal image file: {}, and it will be ignored'.format( + img_file)) + continue + if not os.path.isfile(xml_file): + logger.warning( + 'Illegal xml file: {}, and it will be ignored'.format( + xml_file)) + continue + tree = ET.parse(xml_file) + if tree.find('id') is None: + im_id = np.array([ct]) + else: + im_id = np.array([int(tree.find('id').text)]) + + objs = tree.findall('object') + im_w = float(tree.find('size').find('width').text) + im_h = float(tree.find('size').find('height').text) + if im_w < 0 or im_h < 0: + logger.warning( + 'Illegal width: {} or height: {} in annotation, ' + 'and {} will be ignored'.format(im_w, im_h, xml_file)) + continue + + num_bbox, i = len(objs), 0 + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + gt_score = np.zeros((num_bbox, 1), dtype=np.float32) + difficult = np.zeros((num_bbox, 1), dtype=np.int32) + for obj in objs: + cname = obj.find('name').text + + # user dataset may not contain difficult field + _difficult = obj.find('difficult') + _difficult = int( + _difficult.text) if _difficult is not None else 0 + + x1 = float(obj.find('bndbox').find('xmin').text) + y1 = float(obj.find('bndbox').find('ymin').text) + x2 = float(obj.find('bndbox').find('xmax').text) + y2 = float(obj.find('bndbox').find('ymax').text) + 
def pascalvoc_label():
    """
    Default Pascal VOC label map.

    Returns:
        dict: category name -> class index for the 20 categories listed
            below, numbered from 0 in the listed order. A new dict is
            built on every call.
    """
    class_names = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )
    return {name: index for index, name in enumerate(class_names)}
-0,0 +1,25 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import operators +from . import batch_operators + + +from .operators import * +from .batch_operators import * + + +__all__ = [] +__all__ += registered_ops + diff --git a/rtdetr_paddle/ppdet/data/transform/batch_operators.py b/rtdetr_paddle/ppdet/data/transform/batch_operators.py new file mode 100644 index 0000000..c381382 --- /dev/null +++ b/rtdetr_paddle/ppdet/data/transform/batch_operators.py @@ -0,0 +1,322 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@register_op
class PadBatch(BaseOperator):
    """
    Pad a batch of samples so they can be divisible by a stride.
    The layout of each image should be 'CHW'.

    Every image is zero-padded at the bottom/right up to the largest height
    and width found in the batch (rounded up to a multiple of
    `pad_to_stride` when that is positive). `semantic` and `gt_segm`
    entries, when present and not None, are padded to the same spatial size.

    Args:
        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
            height and width is divisible by `pad_to_stride`.
    """

    def __init__(self, pad_to_stride=0):
        super(PadBatch, self).__init__()
        self.pad_to_stride = pad_to_stride

    def __call__(self, samples, context=None):
        """
        Args:
            samples (list): a batch of sample, each is dict.
            context: not used by this operator.
        Returns:
            The `samples` object that was passed in; the padded arrays are
            written back into its dicts.
        """
        coarsest_stride = self.pad_to_stride

        # multi scale input is nested list; only the first inner list is
        # padded here.
        # NOTE(review): the remaining scales are left untouched -- confirm
        # this is what the multi-scale reader expects.
        if (isinstance(samples, Sequence) and len(samples) > 0 and
                isinstance(samples[0], Sequence)):
            inner_samples = samples[0]
        else:
            inner_samples = samples

        # An empty batch has nothing to pad, and reducing an empty array
        # with max() would raise ValueError.
        if len(inner_samples) == 0:
            return samples

        # Element-wise maximum over the (C, H, W) shapes of the batch.
        max_shape = np.array(
            [data['image'].shape for data in inner_samples]).max(axis=0)
        if coarsest_stride > 0:
            max_shape[1] = int(
                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
            max_shape[2] = int(
                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)

        for data in inner_samples:
            im = data['image']
            im_c, im_h, im_w = im.shape[:]
            # The padded image is always float32, whatever the input dtype.
            padding_im = np.zeros(
                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
            padding_im[:, :im_h, :im_w] = im
            data['image'] = padding_im
            if 'semantic' in data and data['semantic'] is not None:
                semantic = data['semantic']
                padding_sem = np.zeros(
                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
                padding_sem[:, :im_h, :im_w] = semantic
                data['semantic'] = padding_sem
            if 'gt_segm' in data and data['gt_segm'] is not None:
                gt_segm = data['gt_segm']
                padding_segm = np.zeros(
                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
                    dtype=np.uint8)
                padding_segm[:, :im_h, :im_w] = gt_segm
                data['gt_segm'] = padding_segm

        return samples
@register_op
class PadGT(BaseOperator):
    """
    Pad 0 to `gt_class`, `gt_bbox`, `gt_score`...
    The num_max_boxes is the largest for batch.
    Args:
        return_gt_mask (bool): If true, return `pad_gt_mask`,
            1 means bbox, 0 means no bbox.
        pad_img (bool): If true, also pad every `image` in the batch to the
            largest `im_shape` found in the batch.
        minimum_gtnum (int): lower bound for the padded number of boxes.
    """

    def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0):
        super(PadGT, self).__init__()
        self.return_gt_mask = return_gt_mask
        self.pad_img = pad_img
        self.minimum_gtnum = minimum_gtnum

    def _impad(self,
               img: np.ndarray,
               *,
               shape=None,
               padding=None,
               pad_val=0,
               padding_mode='constant') -> np.ndarray:
        """Pad the given image to a certain shape or pad on all sides with
        specified padding mode and padding value.

        Args:
            img (ndarray): Image to be padded.
            shape (tuple[int]): Expected padding shape (h, w). Default: None.
            padding (int or tuple[int]): Padding on each border. If a single int is
                provided this is used to pad all borders. If tuple of length 2 is
                provided this is the padding on left/right and top/bottom
                respectively. If a tuple of length 4 is provided this is the
                padding for the left, top, right and bottom borders respectively.
                Default: None. Note that `shape` and `padding` can not be both
                set.
            pad_val (Number | Sequence[Number]): Values to be filled in padding
                areas when padding_mode is 'constant'. Default: 0.
            padding_mode (str): Type of padding. Should be: constant, edge,
                reflect or symmetric. Default: constant.
                - constant: pads with a constant value, this value is specified
                  with pad_val.
                - edge: pads with the last value at the edge of the image.
                - reflect: pads with reflection of image without repeating the last
                  value on the edge. For example, padding [1, 2, 3, 4] with 2
                  elements on both sides in reflect mode will result in
                  [3, 2, 1, 2, 3, 4, 3, 2].
                - symmetric: pads with reflection of image repeating the last value
                  on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
                  both sides in symmetric mode will result in
                  [2, 1, 1, 2, 3, 4, 4, 3]

        Returns:
            ndarray: The padded image.

        Raises:
            TypeError: if `pad_val` is neither a number nor a tuple.
            ValueError: if `padding` is not a number or a 2/4 element tuple.
        """

        # Exactly one of `shape` / `padding` must be given.
        assert (shape is not None) ^ (padding is not None)
        if shape is not None:
            # Only grow towards the right/bottom; never crop.
            # NOTE(review): axis 0/1 are read as height/width here, i.e. an
            # HWC (or HW) image is assumed -- confirm against the pipeline.
            width = max(shape[1] - img.shape[1], 0)
            height = max(shape[0] - img.shape[0], 0)
            padding = (0, 0, int(width), int(height))

        # check pad_val
        import numbers
        if isinstance(pad_val, tuple):
            assert len(pad_val) == img.shape[-1]
        elif not isinstance(pad_val, numbers.Number):
            raise TypeError('pad_val must be a int or a tuple. '
                            f'But received {type(pad_val)}')

        # check padding
        if isinstance(padding, tuple) and len(padding) in [2, 4]:
            if len(padding) == 2:
                padding = (padding[0], padding[1], padding[0], padding[1])
        elif isinstance(padding, numbers.Number):
            padding = (padding, padding, padding, padding)
        else:
            # The separating space keeps the two literals from running
            # together in the rendered message.
            raise ValueError('Padding must be a int or a 2, or 4 element tuple. '
                             f'But received {padding}')

        # check padding mode
        assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']

        border_type = {
            'constant': cv2.BORDER_CONSTANT,
            'edge': cv2.BORDER_REPLICATE,
            'reflect': cv2.BORDER_REFLECT_101,
            'symmetric': cv2.BORDER_REFLECT
        }
        # `padding` is (left, top, right, bottom); copyMakeBorder takes
        # top, bottom, left, right.
        img = cv2.copyMakeBorder(
            img,
            padding[1],
            padding[3],
            padding[0],
            padding[2],
            border_type[padding_mode],
            value=pad_val)

        return img

    def checkmaxshape(self, samples):
        """Return (max_h, max_w) over the `im_shape` entries of `samples`;
        (0, 0) for an empty batch."""
        maxh, maxw = 0, 0
        for sample in samples:
            h, w = sample['im_shape']
            if h > maxh:
                maxh = h
            if w > maxw:
                maxw = w
        return (maxh, maxw)

    def __call__(self, samples, context=None):
        """Pad the ground-truth fields of every sample to a common length.

        Args:
            samples (list): a batch of sample dicts; each must hold `gt_bbox`
                and `gt_class`, other gt fields are padded when present.
            context: not used by this operator.
        Returns:
            The `samples` list that was passed in, modified in place.
        """
        # `default=0` keeps an empty batch from raising inside max().
        num_max_boxes = max(
            (len(s['gt_bbox']) for s in samples), default=0)
        num_max_boxes = max(self.minimum_gtnum, num_max_boxes)
        if self.pad_img:
            maxshape = self.checkmaxshape(samples)
        for sample in samples:
            if self.pad_img:
                sample['image'] = self._impad(sample['image'], shape=maxshape)
            if self.return_gt_mask:
                sample['pad_gt_mask'] = np.zeros(
                    (num_max_boxes, 1), dtype=np.float32)
            # Nothing to pad to: the gt fields keep their original arrays.
            if num_max_boxes == 0:
                continue

            num_gt = len(sample['gt_bbox'])
            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
            if num_gt > 0:
                pad_gt_class[:num_gt] = sample['gt_class']
                pad_gt_bbox[:num_gt] = sample['gt_bbox']
            sample['gt_class'] = pad_gt_class
            sample['gt_bbox'] = pad_gt_bbox
            # pad_gt_mask
            if 'pad_gt_mask' in sample:
                sample['pad_gt_mask'][:num_gt] = 1
            # Optional single-column fields that share one padding recipe.
            for key, dtype in (('gt_score', np.float32),
                               ('is_crowd', np.int32),
                               ('difficult', np.int32)):
                if key in sample:
                    padded = np.zeros((num_max_boxes, 1), dtype=dtype)
                    if num_gt > 0:
                        padded[:num_gt] = sample[key]
                    sample[key] = padded
            if 'gt_joints' in sample:
                num_joints = sample['gt_joints'].shape[1]
                pad_gt_joints = np.zeros(
                    (num_max_boxes, num_joints, 3), dtype=np.float32)
                if num_gt > 0:
                    pad_gt_joints[:num_gt] = sample['gt_joints']
                sample['gt_joints'] = pad_gt_joints
            if 'gt_areas' in sample:
                pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
                if num_gt > 0:
                    # gt_areas is written into column 0 of the padded array.
                    pad_gt_areas[:num_gt, 0] = sample['gt_areas']
                sample['gt_areas'] = pad_gt_areas
        return samples
def filter_and_process(sample_bbox, bboxes, labels, scores=None,
                       keypoints=None):
    """Keep the boxes that belong to `sample_bbox` and re-express them in
    coordinates relative to it.

    A box is kept when its centre lies inside `sample_bbox`, it overlaps
    `sample_bbox`, and it still has positive area after being normalised by
    the sample width/height and clipped to [0, 1].

    Args:
        sample_bbox (list): [xmin, ymin, xmax, ymax] of the sampled region.
        bboxes: indexable collection of [xmin, ymin, xmax, ymax] boxes.
        labels: per-box labels; element `[i][0]` is kept for box `i`.
        scores: optional per-box scores, indexed like `labels`.
        keypoints: optional pair `(keypoint_rows, ignore_flags)`; row `i`
            holds interleaved x, y values for box `i`.

    Returns:
        tuple: `(bboxes, labels, scores)` as numpy arrays, or
        `(bboxes, labels, scores, (keypoints, ignore_flags))` when
        `keypoints` is given. `scores` is an empty array when no scores
        were supplied.
    """
    new_bboxes = []
    new_labels = []
    new_scores = []
    new_keypoints = []
    new_kp_ignore = []
    for i in range(len(bboxes)):
        obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]]
        if not meet_emit_constraint(obj_bbox, sample_bbox):
            continue
        if not is_overlap(obj_bbox, sample_bbox):
            continue
        # Computed inside the loop on purpose: with no boxes to process,
        # `sample_bbox` is never indexed.
        sample_width = sample_bbox[2] - sample_bbox[0]
        sample_height = sample_bbox[3] - sample_bbox[1]
        new_bbox = clip_bbox([
            (obj_bbox[0] - sample_bbox[0]) / sample_width,
            (obj_bbox[1] - sample_bbox[1]) / sample_height,
            (obj_bbox[2] - sample_bbox[0]) / sample_width,
            (obj_bbox[3] - sample_bbox[1]) / sample_height,
        ])
        if bbox_area(new_bbox) > 0:
            new_bboxes.append(new_bbox)
            new_labels.append([labels[i][0]])
            if scores is not None:
                new_scores.append([scores[i][0]])
            if keypoints is not None:
                # Work on a copy so the caller's keypoint rows are not
                # overwritten with the normalised values.
                sample_keypoint = keypoints[0][i].copy()
                for j in range(len(sample_keypoint)):
                    # Even slots are x (scaled by width), odd slots are y.
                    kp_len = sample_height if j % 2 else sample_width
                    sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0]
                    sample_keypoint[j] = (
                        sample_keypoint[j] - sample_coord) / kp_len
                    sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0)
                new_keypoints.append(sample_keypoint)
                new_kp_ignore.append(keypoints[1][i])

    bboxes = np.array(new_bboxes)
    labels = np.array(new_labels)
    scores = np.array(new_scores)
    if keypoints is not None:
        keypoints = np.array(new_keypoints)
        new_kp_ignore = np.array(new_kp_ignore)
        return bboxes, labels, scores, (keypoints, new_kp_ignore)
    return bboxes, labels, scores
def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
                         resize_width):
    """Sample a square crop region anchored on one randomly chosen box.

    Args:
        bbox_labels: indexable collection of normalised
            [xmin, ymin, xmax, ymax, ...] boxes.
        image_width: image width in pixels.
        image_height: image height in pixels.
        scale_array: indexable collection of anchor scales.
            presumably sorted ascending -- verify against caller.
        resize_width: target width used to turn the chosen scale into a
            crop size.

    Returns:
        list: [xmin, ymin, xmax, ymax] of the crop, normalised by the image
        size (not clipped to [0, 1]); the integer 0 when `bbox_labels` is
        empty.
    """
    num_gt = len(bbox_labels)
    if num_gt == 0:
        return 0

    # np.random.randint range: [low, high)
    anchor = bbox_labels[np.random.randint(0, num_gt)]

    # Anchor box in pixel units.
    left = anchor[0] * image_width
    top = anchor[1] * image_height
    box_w = image_width * (anchor[2] - anchor[0])
    box_h = image_height * (anchor[3] - anchor[1])
    box_area = box_w * box_h

    # Find the scale interval (strict bounds) that contains the box area.
    range_size = 0
    for scale_ind in range(len(scale_array) - 1):
        if scale_array[scale_ind] ** 2 < box_area < \
                scale_array[scale_ind + 1] ** 2:
            range_size = scale_ind + 1
            break

    if box_area > scale_array[len(scale_array) - 2] ** 2:
        range_size = len(scale_array) - 2

    if range_size == 0:
        scale_idx = 0
    else:
        # np.random.randint range: [low, high)
        scale_idx = np.random.randint(0, range_size + 1)

    # Draw the target scale around the picked anchor scale; the top of the
    # range is additionally capped by the box's own size.
    min_resize_val = scale_array[scale_idx] / 2.0
    max_resize_val = 2.0 * scale_array[scale_idx]
    if scale_idx == range_size:
        max_resize_val = min(max_resize_val, 2 * math.sqrt(box_w * box_h))
    scale_choose = random.uniform(min_resize_val, max_resize_val)

    sample_bbox_size = box_w * resize_width / scale_choose

    if sample_bbox_size < max(image_height, image_width):
        if box_w <= sample_bbox_size:
            x_range = (left + box_w - sample_bbox_size, left)
        else:
            x_range = (left, left + box_w - sample_bbox_size)

        if box_h <= sample_bbox_size:
            y_range = (top + box_h - sample_bbox_size, top)
        else:
            y_range = (top, top + box_h - sample_bbox_size)
    else:
        x_range = (image_width - sample_bbox_size, 0.0)
        y_range = (image_height - sample_bbox_size, 0.0)

    # x offset is drawn before y offset; keep this order.
    w_off_orig = math.floor(np.random.uniform(x_range[0], x_range[1]))
    h_off_orig = math.floor(np.random.uniform(y_range[0], y_range[1]))

    # Figure out top left coordinates.
    w_off = float(w_off_orig / image_width)
    h_off = float(h_off_orig / image_height)

    return [
        w_off, h_off, w_off + float(sample_bbox_size / image_width),
        h_off + float(sample_bbox_size / image_height)
    ]
def satisfy_sample_constraint(sampler,
                              sample_bbox,
                              gt_bboxes,
                              satisfy_all=False):
    """Check a sampled box against the sampler's overlap limits.

    `sampler[6]` and `sampler[7]` are read as the minimum and maximum allowed
    jaccard overlap; a value of 0 disables that bound.

    Args:
        sampler: indexable sampler description.
        sample_bbox (list): candidate [xmin, ymin, xmax, ymax].
        gt_bboxes: indexable collection of ground-truth boxes.
        satisfy_all (bool): require every ground-truth box to pass instead
            of at least one.

    Returns:
        True when both bounds are disabled. Otherwise, with
        `satisfy_all=False`, whether any ground-truth box passes; with
        `satisfy_all=True`, the `np.all` of the per-box results.
    """
    min_overlap, max_overlap = sampler[6], sampler[7]
    if min_overlap == 0 and max_overlap == 0:
        return True

    verdicts = []
    for gt in gt_bboxes:
        overlap = jaccard_overlap(sample_bbox, [gt[0], gt[1], gt[2], gt[3]])
        below = min_overlap != 0 and overlap < min_overlap
        passed = not (below or (max_overlap != 0 and overlap > max_overlap))
        verdicts.append(passed)
        # One passing box is enough unless all of them are required.
        if passed and not satisfy_all:
            return True

    if satisfy_all:
        return np.all(verdicts)
    return False
def gaussian_radius(bbox_size, min_overlap):
    """Return the smallest of three candidate gaussian radii for a box.

    Each candidate comes from one quadratic in the radius built from the box
    height/width and `min_overlap`.

    Args:
        bbox_size: (height, width) of the box.
        min_overlap: overlap threshold used in the three quadratics.

    Returns:
        The minimum of the three candidate radii.
    """
    height, width = bbox_size
    side_sum = height + width
    box_area = width * height

    def disc_root(a, b, c):
        # Square root of the discriminant b^2 - 4ac.
        return np.sqrt(b**2 - 4 * a * c)

    # NOTE(review): the second and third candidates divide by 2, not by
    # 2 * a. Kept exactly as is, since changing it would alter the radii
    # downstream code is tuned to.
    lin1 = side_sum
    const1 = box_area * (1 - min_overlap) / (1 + min_overlap)
    radius1 = (lin1 + disc_root(1, lin1, const1)) / (2 * 1)

    lin2 = 2 * side_sum
    const2 = (1 - min_overlap) * width * height
    radius2 = (lin2 + disc_root(4, lin2, const2)) / 2

    quad3 = 4 * min_overlap
    lin3 = -2 * min_overlap * side_sum
    const3 = (min_overlap - 1) * width * height
    radius3 = (lin3 + disc_root(quad3, lin3, const3)) / 2

    return min(radius1, radius2, radius3)
def gaussian2D(shape, sigma_x=1, sigma_y=1):
    """Build an un-normalised 2-D gaussian kernel centred in `shape`.

    Args:
        shape: (rows, cols) of the kernel.
        sigma_x: standard deviation along the column axis.
        sigma_y: standard deviation along the row axis.

    Returns:
        ndarray: kernel with peak value 1 at the centre; entries below
        `eps * max` of the kernel dtype are set to 0.
    """
    half_rows, half_cols = ((extent - 1.) / 2. for extent in shape)
    rows, cols = np.ogrid[-half_rows:half_rows + 1, -half_cols:half_cols + 1]

    x_term = cols * cols / (2 * sigma_x * sigma_x)
    y_term = rows * rows / (2 * sigma_y * sigma_y)
    kernel = np.exp(-(x_term + y_term))

    # Flush numerically negligible tails to exactly zero.
    cutoff = np.finfo(kernel.dtype).eps * kernel.max()
    kernel[kernel < cutoff] = 0
    return kernel
class BaseOperator(object):
    """Base class of the sample operators.

    Subclasses override `apply`; calling the operator runs `apply` on one
    sample, or on every element of a sequence of samples.
    """

    def __init__(self, name=None):
        # The id is "<name>_<last 6 characters of a random uuid4>"; the
        # class name is used when no name is given.
        op_name = self.__class__.__name__ if name is None else name
        self._id = op_name + '_' + str(uuid.uuid4())[-6:]

    def apply(self, sample, context=None):
        """ Process a sample.
        Args:
            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
            context (dict): info about this sample processing
        Returns:
            result (dict): a processed sample
        """
        return sample

    def __call__(self, sample, context=None):
        """ Process a sample, or each sample of a sequence.
        Args:
            sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx},
                or a sequence of such dicts
            context (dict): info about this sample processing
        Returns:
            result (dict): a processed sample; for a sequence, the same
                sequence object with every element replaced by its result
        """
        if not isinstance(sample, Sequence):
            return self.apply(sample, context)

        for idx, item in enumerate(sample):
            sample[idx] = self.apply(item, context)
        return sample

    def __str__(self):
        return str(self._id)
update sample['w'] by actual " + "image width.".format(im.shape[1], sample['w'])) + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) + return sample + + +def _make_dirs(dirname): + try: + from pathlib import Path + except ImportError: + from pathlib2 import Path + Path(dirname).mkdir(exist_ok=True) + + +@register_op +class DecodeCache(BaseOperator): + def __init__(self, cache_root=None): + '''decode image and caching + ''' + super(DecodeCache, self).__init__() + + self.use_cache = False if cache_root is None else True + self.cache_root = cache_root + + if cache_root is not None: + _make_dirs(cache_root) + + def apply(self, sample, context=None): + + if self.use_cache and os.path.exists( + self.cache_path(self.cache_root, sample['im_file'])): + path = self.cache_path(self.cache_root, sample['im_file']) + im = self.load(path) + + else: + if 'image' not in sample: + with open(sample['im_file'], 'rb') as f: + sample['image'] = f.read() + + im = sample['image'] + data = np.frombuffer(im, dtype='uint8') + im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode + if 'keep_ori_im' in sample and sample['keep_ori_im']: + sample['ori_image'] = im + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + + if self.use_cache and not os.path.exists( + self.cache_path(self.cache_root, sample['im_file'])): + path = self.cache_path(self.cache_root, sample['im_file']) + self.dump(im, path) + + sample['image'] = im + sample['h'] = im.shape[0] + sample['w'] = im.shape[1] + + sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) + sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) + + sample.pop('im_file') + + return sample + + @staticmethod + def cache_path(dir_oot, im_file): + return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') + + @staticmethod + def load(path): + with open(path, 'rb') as f: + im = pickle.load(f) + return im + + @staticmethod + def dump(obj, 
@register_op
class Lighting(BaseOperator):
    """
    Lighting the image by eigenvalues and eigenvectors
    Args:
        eigval (list): eigenvalues
        eigvec (list): eigenvectors
        alphastd (float): random weight of lighting, 0.1 by default
    """

    def __init__(self, eigval, eigvec, alphastd=0.1):
        super(Lighting, self).__init__()
        self.alphastd = alphastd
        # Both tables are stored as float32 arrays.
        self.eigval = np.array(eigval).astype('float32')
        self.eigvec = np.array(eigvec).astype('float32')

    def apply(self, sample, context=None):
        """Add one random colour offset, in place, to `image` and, when
        present, to `pre_image`."""
        weights = np.random.normal(scale=self.alphastd, size=(3, ))
        # A single draw is shared by both images, so the offset is
        # computed once.
        offset = np.dot(self.eigvec, self.eigval * weights)

        sample['image'] += offset
        if 'pre_image' in sample:
            sample['pre_image'] += offset
        return sample
@register_op
class NormalizeImage(BaseOperator):
    """Scale pixel values and standardize them with per-channel mean/std."""

    def __init__(self,
                 mean=None,
                 std=None,
                 is_scale=True,
                 norm_type='mean_std'):
        """
        Args:
            mean (list): the per-channel pixel mean, defaults to
                [0.485, 0.456, 0.406] when omitted
            std (list): the per-channel pixel standard deviation, defaults
                to [0.229, 0.224, 0.225] when omitted
            is_scale (bool): scale the pixel to [0,1]
            norm_type (str): type in ['mean_std', 'none']
        Raises:
            TypeError: if mean/std are not lists, is_scale is not a bool or
                norm_type is not one of the supported values
            ValueError: if the product of the std entries is zero
        """
        super(NormalizeImage, self).__init__()
        # Build the default lists per instance so that mutating one
        # operator's mean/std can never leak into another operator.
        self.mean = [0.485, 0.456, 0.406] if mean is None else mean
        self.std = [0.229, 0.224, 0.225] if std is None else std
        self.is_scale = is_scale
        self.norm_type = norm_type
        if not (isinstance(self.mean, list) and isinstance(self.std, list) and
                isinstance(self.is_scale, bool) and
                self.norm_type in ['mean_std', 'none']):
            raise TypeError("{}: input type is invalid.".format(self))
        from functools import reduce
        if reduce(lambda x, y: x * y, self.std) == 0:
            raise ValueError('{}: std is invalid!'.format(self))

    def _normalize(self, im):
        """Return ``im`` as float32 after the configured scaling steps.

        ``astype(..., copy=False)`` means an array that is already float32
        is modified in place rather than copied.
        """
        im = im.astype(np.float32, copy=False)
        if self.is_scale:
            scale = 1.0 / 255.0
            im *= scale

        if self.norm_type == 'mean_std':
            # Index with two new axes so mean/std broadcast over the last
            # (channel) axis of the image.
            mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
            std = np.array(self.std)[np.newaxis, np.newaxis, :]
            im -= mean
            im /= std
        return im

    def apply(self, sample, context=None):
        """Normalize the image.
        Operators:
            1.(optional) Scale the pixel to [0,1]
            2.(optional) Each pixel minus mean and is divided by std
        The same steps are applied to sample['pre_image'] when present.
        """
        sample['image'] = self._normalize(sample['image'])

        if 'pre_image' in sample:
            sample['pre_image'] = self._normalize(sample['pre_image'])

        return sample
< prob: + return img + + img = img.astype(np.float32) + # it works, but result differ from HSV version + delta = np.random.uniform(low, high) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321], + [0.211, -0.523, 0.311]]) + ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647], + [1.0, -1.107, 1.705]]) + t = np.dot(np.dot(ityiq, bt), tyiq).T + img = np.dot(img, t) + return img + + def apply_saturation(self, img): + low, high, prob = self.saturation + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + # it works, but result differ from HSV version + gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + return img + + def apply_contrast(self, img): + low, high, prob = self.contrast + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + img = img.astype(np.float32) + img *= delta + return img + + def apply_brightness(self, img): + low, high, prob = self.brightness + if np.random.uniform(0., 1.) 
@register_op
class PhotoMetricDistortion(BaseOperator):
    """Apply photometric distortion to image sequentially, each step being
    applied with a probability of 0.5. Random contrast runs either right
    before the HSV round trip or right after it.

    1. random brightness
    2. random contrast (mode 1)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (mode 0)
    8. randomly swap channels

    NOTE(review): the conversion constants assume BGR channel order, while
    DecodeCache in this file emits RGB - confirm the expected input order.

    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        super(PhotoMetricDistortion, self).__init__()
        self.hue_delta = hue_delta
        self.brightness_delta = brightness_delta
        self.saturation_lower, self.saturation_upper = saturation_range
        self.contrast_lower, self.contrast_upper = contrast_range

    def _random_contrast(self, img):
        """With probability 0.5 multiply ``img`` in place by a random gain."""
        if np.random.randint(2):
            gain = np.random.uniform(self.contrast_lower,
                                     self.contrast_upper)
            img *= gain
        return img

    def apply(self, results, context=None):
        """Call function to perform photometric distortion on images.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with images distorted.
        """
        img = results['image'].astype(np.float32)

        # random brightness
        if np.random.randint(2):
            img += np.random.uniform(-self.brightness_delta,
                                     self.brightness_delta)

        # mode == 1 --> random contrast before the HSV round trip
        # mode == 0 --> random contrast after the HSV round trip
        contrast_first = np.random.randint(2) == 1
        if contrast_first:
            img = self._random_contrast(img)

        # convert color from BGR to HSV
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # random saturation (channel 1 is edited through a view)
        if np.random.randint(2):
            saturation = hsv[..., 1]
            saturation *= np.random.uniform(self.saturation_lower,
                                            self.saturation_upper)

        # random hue, wrapped back into [0, 360]
        if np.random.randint(2):
            hue = hsv[..., 0]
            hue += np.random.uniform(-self.hue_delta, self.hue_delta)
            hue[hue > 360] -= 360
            hue[hue < 0] += 360

        # convert color from HSV to BGR
        img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

        # random contrast
        if not contrast_first:
            img = self._random_contrast(img)

        # randomly swap channels
        if np.random.randint(2):
            img = img[..., np.random.permutation(3)]

        results['image'] = img
        return results

    def __repr__(self):
        fields = (
            f'brightness_delta={self.brightness_delta}',
            f'contrast_range={(self.contrast_lower, self.contrast_upper)}',
            f'saturation_range='
            f'{(self.saturation_lower, self.saturation_upper)}',
            f'hue_delta={self.hue_delta}', )
        return self.__class__.__name__ + '(\n' + ',\n'.join(fields) + ')'
segms, height, width): + def _flip_poly(poly, width): + flipped_poly = np.array(poly) + flipped_poly[0::2] = width - np.array(poly[0::2]) + return flipped_poly.tolist() + + def _flip_rle(rle, height, width): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, height, width) + mask = mask_util.decode(rle) + mask = mask[:, ::-1] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + flipped_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + flipped_segms.append([_flip_poly(poly, width) for poly in segm]) + else: + # RLE format + import pycocotools.mask as mask_util + flipped_segms.append(_flip_rle(segm, height, width)) + return flipped_segms + + def apply_keypoint(self, gt_keypoint, width): + for i in range(gt_keypoint.shape[1]): + if i % 2 == 0: + old_x = gt_keypoint[:, i].copy() + gt_keypoint[:, i] = width - old_x + return gt_keypoint + + def apply_image(self, image): + return image[:, ::-1, :] + + def apply_bbox(self, bbox, width): + oldx1 = bbox[:, 0].copy() + oldx2 = bbox[:, 2].copy() + bbox[:, 0] = width - oldx2 + bbox[:, 2] = width - oldx1 + return bbox + + def apply(self, sample, context=None): + """Filp the image and bounding box. + Operators: + 1. Flip the image numpy. + 2. Transform the bboxes' x coordinates. + (Must judge whether the coordinates are normalized!) + 3. Transform the segmentations' x coordinates. + (Must judge whether the coordinates are normalized!) + Output: + sample: the image, bounding box and segmentation part + in sample are flipped. 
+ """ + if np.random.uniform(0, 1) < self.prob: + im = sample['image'] + height, width = im.shape[:2] + im = self.apply_image(im) + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, + width) + if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: + sample['gt_keypoint'] = self.apply_keypoint( + sample['gt_keypoint'], width) + + if 'semantic' in sample and sample['semantic']: + sample['semantic'] = sample['semantic'][:, ::-1] + + if 'gt_segm' in sample and sample['gt_segm'].any(): + sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] + + sample['flipped'] = True + sample['image'] = im + return sample + + +@register_op +class Resize(BaseOperator): + def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): + """ + Resize image to target size. if keep_ratio is True, + resize the image's long side to the maximum of target_size + if keep_ratio is False, resize the image to target size(h, w) + Args: + target_size (int|list): image target size + keep_ratio (bool): whether keep_ratio or not, default true + interp (int): the interpolation method + """ + super(Resize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + if not isinstance(target_size, (Integral, Sequence)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". 
+ format(type(target_size))) + if isinstance(target_size, Integral): + target_size = [target_size, target_size] + self.target_size = target_size + + def apply_image(self, image, scale): + im_scale_x, im_scale_y = scale + + return cv2.resize( + image, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + + def apply_bbox(self, bbox, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + bbox[:, 0::2] *= im_scale_x + bbox[:, 1::2] *= im_scale_y + bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) + bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) + return bbox + + def apply_area(self, area, scale): + im_scale_x, im_scale_y = scale + return area * im_scale_x * im_scale_y + + def apply_joints(self, joints, scale, size): + im_scale_x, im_scale_y = scale + resize_w, resize_h = size + joints[..., 0] *= im_scale_x + joints[..., 1] *= im_scale_y + joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) + joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) + return joints + + def apply_segm(self, segms, im_size, scale): + def _resize_poly(poly, im_scale_x, im_scale_y): + resized_poly = np.array(poly).astype('float32') + resized_poly[0::2] *= im_scale_x + resized_poly[1::2] *= im_scale_y + return resized_poly.tolist() + + def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): + if 'counts' in rle and type(rle['counts']) == list: + rle = mask_util.frPyObjects(rle, im_h, im_w) + + mask = mask_util.decode(rle) + mask = cv2.resize( + mask, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + im_h, im_w = im_size + im_scale_x, im_scale_y = scale + resized_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + resized_segms.append([ + _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm + ]) + else: + # RLE format + import pycocotools.mask as mask_util + resized_segms.append( + 
_resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) + + return resized_segms + + def apply(self, sample, context=None): + """ Resize the image numpy. + """ + im = sample['image'] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + + # apply image + if len(im.shape) == 3: + im_shape = im.shape + else: + im_shape = im[0].shape + + if self.keep_ratio: + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + target_size_min = np.min(self.target_size) + target_size_max = np.max(self.target_size) + + im_scale = min(target_size_min / im_size_min, + target_size_max / im_size_max) + + resize_h = int(im_scale * float(im_shape[0]) + 0.5) + resize_w = int(im_scale * float(im_shape[1]) + 0.5) + + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = self.target_size + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + if len(im.shape) == 3: + im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) + sample['image'] = im.astype(np.float32) + else: + resized_images = [] + for one_im in im: + applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y]) + resized_images.append(applied_im) + + sample['image'] = np.array(resized_images) + + # 2d keypoints resize + if 'kps2d' in sample.keys(): + kps2d = sample['kps2d'] + kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x + kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y + + sample['kps2d'] = kps2d + + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], + [im_scale_x, 
im_scale_y], + [resize_w, resize_h]) + + # apply areas + if 'gt_areas' in sample: + sample['gt_areas'] = self.apply_area(sample['gt_areas'], + [im_scale_x, im_scale_y]) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], + [im_scale_x, im_scale_y]) + + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + + if 'gt_joints' in sample: + sample['gt_joints'] = self.apply_joints(sample['gt_joints'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + return sample + + +@register_op +class RandomResize(BaseOperator): + def __init__(self, + target_size, + keep_ratio=True, + interp=cv2.INTER_LINEAR, + random_range=False, + random_size=True, + random_interp=False): + """ + Resize image to target size randomly. 
random target_size and interpolation method + Args: + target_size (int, list, tuple): image target size, if random size is True, must be list or tuple + keep_ratio (bool): whether keep_raio or not, default true + interp (int): the interpolation method + random_range (bool): whether random select target size of image, the target_size must be + a [[min_short_edge, long_edge], [max_short_edge, long_edge]] + random_size (bool): whether random select target size of image + random_interp (bool): whether random select interpolation method + """ + super(RandomResize, self).__init__() + self.keep_ratio = keep_ratio + self.interp = interp + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] + assert isinstance(target_size, ( + Integral, Sequence)), "target_size must be Integer, List or Tuple" + if (random_range or random_size) and not isinstance(target_size, + Sequence): + raise TypeError( + "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}". + format(type(target_size))) + if random_range and not len(target_size) == 2: + raise TypeError( + "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True." + ) + self.target_size = target_size + self.random_range = random_range + self.random_size = random_size + self.random_interp = random_interp + + def apply(self, sample, context=None): + """ Resize the image numpy. 
+ """ + if self.random_range: + short_edge = np.random.randint(self.target_size[0][0], + self.target_size[1][0] + 1) + long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1) + target_size = [short_edge, long_edge] + else: + if self.random_size: + target_size = random.choice(self.target_size) + else: + target_size = self.target_size + + if self.random_interp: + interp = random.choice(self.interps) + else: + interp = self.interp + + resizer = Resize(target_size, self.keep_ratio, interp) + return resizer(sample, context=context) + + +@register_op +class RandomExpand(BaseOperator): + """Random expand the canvas. + Args: + ratio (float): maximum expansion ratio. + prob (float): probability to expand. + fill_value (list): color value used to fill the canvas. in RGB order. + """ + + def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)): + super(RandomExpand, self).__init__() + assert ratio > 1.01, "expand ratio must be larger than 1.01" + self.ratio = ratio + self.prob = prob + assert isinstance(fill_value, (Number, Sequence)), \ + "fill value must be either float or sequence" + if isinstance(fill_value, Number): + fill_value = (fill_value, ) * 3 + if not isinstance(fill_value, tuple): + fill_value = tuple(fill_value) + self.fill_value = fill_value + + def apply(self, sample, context=None): + if np.random.uniform(0., 1.) 
@register_op
class CropWithSampling(BaseOperator):
    def __init__(self, batch_sampler, satisfy_all=False, avoid_no_bbox=True):
        """
        Args:
            batch_sampler (list): Multiple sets of different
                                  parameters for cropping.
                e.g.[[1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 1.0],
                     [1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0]]
                each entry being
                [max sample, max trial, min scale, max scale,
                 min aspect ratio, max aspect ratio,
                 min overlap, max overlap]
            satisfy_all (bool): whether all boxes must satisfy.
            avoid_no_bbox (bool): whether to avoid the
                                  situation where the box does not appear.
        """
        super(CropWithSampling, self).__init__()
        self.batch_sampler = batch_sampler
        self.satisfy_all = satisfy_all
        self.avoid_no_bbox = avoid_no_bbox

    def apply(self, sample, context=None):
        """
        Crop the image and modify bounding box.
        Operators:
            1. Scale the image width and height.
            2. Crop the image according to a random sample.
            3. Rescale the bounding box.
            4. Determine if the new bbox is satisfied in the new image.
        Returns:
            sample: the image, bounding box are replaced. The sample is
                returned unchanged when no candidate crop is accepted.
        """
        assert 'image' in sample, "image data not found"
        im = sample['image']
        gt_class = sample['gt_class']
        im_height, im_width = im.shape[:2]
        has_score = 'gt_score' in sample
        gt_score = sample['gt_score'] if has_score else None
        gt_bbox = sample['gt_bbox'].tolist()

        # Collect candidate crops: per sampler at most sampler[0] accepted
        # boxes within sampler[1] trials.
        sampled_bbox = []
        for sampler in self.batch_sampler:
            found = 0
            for _ in range(sampler[1]):
                if found >= sampler[0]:
                    break
                sample_bbox = generate_sample_bbox(sampler)
                if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox,
                                             self.satisfy_all):
                    sampled_bbox.append(sample_bbox)
                    found += 1

        im = np.array(im)
        # Try the candidates in random order and keep the first usable one.
        while sampled_bbox:
            idx = int(np.random.uniform(0, len(sampled_bbox)))
            sample_bbox = clip_bbox(sampled_bbox.pop(idx))
            crop_bbox, crop_class, crop_score = \
                filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score)
            if self.avoid_no_bbox and len(crop_bbox) < 1:
                continue
            # The candidate box is multiplied by the image size to get the
            # pixel window that is cut out.
            xmin = int(sample_bbox[0] * im_width)
            xmax = int(sample_bbox[2] * im_width)
            ymin = int(sample_bbox[1] * im_height)
            ymax = int(sample_bbox[3] * im_height)
            sample['image'] = im[ymin:ymax, xmin:xmax]
            sample['gt_bbox'] = crop_bbox
            sample['gt_class'] = crop_class
            # Only write scores back when the input carried them, so that
            # cropped and uncropped samples expose the same keys.
            if has_score:
                sample['gt_score'] = crop_score
            return sample
        return sample
+ e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] + [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], + [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] + [max sample, max trial, min scale, max scale, + min aspect ratio, max aspect ratio, + min overlap, max overlap, min coverage, max coverage] + target_size (int): target image size. + das_anchor_scales (list[float]): a list of anchor scales in data + anchor smapling. + min_size (float): minimum size of sampled bbox. + avoid_no_bbox (bool): whether to avoid the + situation where the box does not appear. + """ + super(CropWithDataAchorSampling, self).__init__() + self.anchor_sampler = anchor_sampler + self.batch_sampler = batch_sampler + self.target_size = target_size + self.sampling_prob = sampling_prob + self.min_size = min_size + self.avoid_no_bbox = avoid_no_bbox + self.das_anchor_scales = np.array(das_anchor_scales) + + def apply(self, sample, context): + """ + Crop the image and modify bounding box. + Operators: + 1. Scale the image width and height. + 2. Crop the image according to a radom sample. + 3. Rescale the bounding box. + 4. Determine if the new bbox is satisfied in the new image. + Returns: + sample: the image, bounding box are replaced. + """ + assert 'image' in sample, "image data not found" + im = sample['image'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + image_height, image_width = im.shape[:2] + gt_bbox[:, 0] /= image_width + gt_bbox[:, 1] /= image_height + gt_bbox[:, 2] /= image_width + gt_bbox[:, 3] /= image_height + gt_score = None + if 'gt_score' in sample: + gt_score = sample['gt_score'] + sampled_bbox = [] + gt_bbox = gt_bbox.tolist() + + prob = np.random.uniform(0., 1.) 
+ if prob > self.sampling_prob: # anchor sampling + assert self.anchor_sampler + for sampler in self.anchor_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = data_anchor_sampling( + gt_bbox, image_width, image_height, + self.das_anchor_scales, self.target_size) + if sample_bbox == 0: + break + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + + if 'gt_keypoint' in sample.keys(): + keypoints = (sample['gt_keypoint'], + sample['keypoint_ignore']) + crop_bbox, crop_class, crop_score, gt_keypoints = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, + scores=gt_score, + keypoints=keypoints) + else: + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, scores=gt_score) + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + im = crop_image_sampling(im, sample_bbox, image_width, + image_height, self.target_size) + height, width = im.shape[:2] + crop_bbox[:, 0] *= width + crop_bbox[:, 1] *= height + crop_bbox[:, 2] *= width + crop_bbox[:, 3] *= height + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + if 'gt_score' in sample: + sample['gt_score'] = crop_score + if 'gt_keypoint' in sample.keys(): + sample['gt_keypoint'] = gt_keypoints[0] + sample['keypoint_ignore'] = gt_keypoints[1] + return sample + return sample + + else: + for sampler in self.batch_sampler: + found = 0 + for i in range(sampler[1]): + if found >= sampler[0]: + break + sample_bbox = generate_sample_bbox_square( + sampler, image_width, image_height) + if satisfy_sample_constraint_coverage(sampler, sample_bbox, + gt_bbox): + 
sampled_bbox.append(sample_bbox) + found = found + 1 + im = np.array(im) + while sampled_bbox: + idx = int(np.random.uniform(0, len(sampled_bbox))) + sample_bbox = sampled_bbox.pop(idx) + sample_bbox = clip_bbox(sample_bbox) + + if 'gt_keypoint' in sample.keys(): + keypoints = (sample['gt_keypoint'], + sample['keypoint_ignore']) + crop_bbox, crop_class, crop_score, gt_keypoints = \ + filter_and_process(sample_bbox, gt_bbox, gt_class, + scores=gt_score, + keypoints=keypoints) + else: + crop_bbox, crop_class, crop_score = filter_and_process( + sample_bbox, gt_bbox, gt_class, scores=gt_score) + # sampling bbox according the bbox area + crop_bbox, crop_class, crop_score = bbox_area_sampling( + crop_bbox, crop_class, crop_score, self.target_size, + self.min_size) + + if self.avoid_no_bbox: + if len(crop_bbox) < 1: + continue + xmin = int(sample_bbox[0] * image_width) + xmax = int(sample_bbox[2] * image_width) + ymin = int(sample_bbox[1] * image_height) + ymax = int(sample_bbox[3] * image_height) + im = im[ymin:ymax, xmin:xmax] + height, width = im.shape[:2] + crop_bbox[:, 0] *= width + crop_bbox[:, 1] *= height + crop_bbox[:, 2] *= width + crop_bbox[:, 3] *= height + sample['image'] = im + sample['gt_bbox'] = crop_bbox + sample['gt_class'] = crop_class + if 'gt_score' in sample: + sample['gt_score'] = crop_score + if 'gt_keypoint' in sample.keys(): + sample['gt_keypoint'] = gt_keypoints[0] + sample['keypoint_ignore'] = gt_keypoints[1] + return sample + return sample + + +@register_op +class RandomCrop(BaseOperator): + """Random crop image and bboxes. + Args: + aspect_ratio (list): aspect ratio of cropped region. + in [min, max] format. + thresholds (list): iou thresholds for decide a valid bbox crop. + scaling (list): ratio between a cropped region and the original image. + in [min, max] format. + num_attempts (int): number of tries before giving up. + allow_no_crop (bool): allow return without actually cropping them. 
+ cover_all_box (bool): ensure all bboxes are covered in the final crop. + is_mask_crop(bool): whether crop the segmentation. + """ + + def __init__(self, + aspect_ratio=[.5, 2.], + thresholds=[.0, .1, .3, .5, .7, .9], + scaling=[.3, 1.], + num_attempts=50, + allow_no_crop=True, + cover_all_box=False, + is_mask_crop=False, + ioumode="iou", + prob=1.0): + super(RandomCrop, self).__init__() + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + self.is_mask_crop = is_mask_crop + self.ioumode = ioumode + self.prob = prob + + def crop_segms(self, segms, valid_ids, crop, height, width): + def _crop_poly(segm, crop): + xmin, ymin, xmax, ymax = crop + crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] + crop_p = np.array(crop_coord).reshape(4, 2) + crop_p = Polygon(crop_p) + + crop_segm = list() + for poly in segm: + poly = np.array(poly).reshape(len(poly) // 2, 2) + polygon = Polygon(poly) + if not polygon.is_valid: + exterior = polygon.exterior + multi_lines = exterior.intersection(exterior) + polygons = shapely.ops.polygonize(multi_lines) + polygon = MultiPolygon(polygons) + multi_polygon = list() + if isinstance(polygon, MultiPolygon): + multi_polygon = copy.deepcopy(polygon) + else: + multi_polygon.append(copy.deepcopy(polygon)) + for per_polygon in multi_polygon: + inter = per_polygon.intersection(crop_p) + if not inter: + continue + if isinstance(inter, (MultiPolygon, GeometryCollection)): + for part in inter: + if not isinstance(part, Polygon): + continue + part = np.squeeze( + np.array(part.exterior.coords[:-1]).reshape(1, + -1)) + part[0::2] -= xmin + part[1::2] -= ymin + crop_segm.append(part.tolist()) + elif isinstance(inter, Polygon): + crop_poly = np.squeeze( + np.array(inter.exterior.coords[:-1]).reshape(1, -1)) + crop_poly[0::2] -= xmin + crop_poly[1::2] -= ymin + crop_segm.append(crop_poly.tolist()) 
def apply(self, sample, context=None):
    """Randomly crop ``sample`` with probability ``self.prob``.

    Args:
        sample (dict): data sample; ``sample['gt_bbox']`` is optional.
        context: unused here.

    Returns:
        dict: the sample, either untouched or processed by
        ``self.random_crop``.
    """
    skip_op = random.random() > self.prob
    if skip_op:
        return sample

    if 'gt_bbox' in sample:
        # Nothing to anchor a crop on when the image has no boxes.
        if len(sample['gt_bbox']) == 0:
            return sample
        return self.random_crop(sample)

    # only used in semi-det as unsup data: there are no annotations, so
    # placeholder boxes are installed to drive the crop search.
    with_fake_boxes = self.set_fake_bboxes(sample)
    return self.random_crop(with_fake_boxes, fake_bboxes=True)
+ # Here a short circuit approach is taken, i.e., randomly choose a + # threshold and attempt to find a valid crop, and simply return the + # first one found. + # The probability is not exactly the same, kinda resembling the + # "Monty Hall" problem. Actually carrying out the attempts will affect + # observability (just like opening doors in the "Monty Hall" game). + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + np.random.shuffle(thresholds) + + for thresh in thresholds: + if thresh == 'no_crop': + return sample + + found = False + for i in range(self.num_attempts): + scale = np.random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = np.random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = np.random.uniform(*self.scaling) + w_scale = np.random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + crop_h = int(crop_h) + crop_w = int(crop_w) + crop_y = np.random.randint(0, h - crop_h) + crop_x = np.random.randint(0, w - crop_w) + crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] + if self.ioumode == "iof": + iou = self._gtcropiou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + elif self.ioumode == "iou": + iou = self._iou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + if self.cover_all_box and iou.min() < thresh: + continue + + cropped_box, valid_ids = self._crop_box_with_center_constraint( + gt_bbox, np.array( + crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + if self.is_mask_crop and 'gt_poly' in sample and len(sample[ + 'gt_poly']) > 0: + crop_polys = self.crop_segms( + sample['gt_poly'], + valid_ids, + 
np.array( + crop_box, dtype=np.int64), + h, + w) + if [] in crop_polys: + delete_id = list() + valid_polys = list() + for id, crop_poly in enumerate(crop_polys): + if crop_poly == []: + delete_id.append(id) + else: + valid_polys.append(crop_poly) + valid_ids = np.delete(valid_ids, delete_id) + if len(valid_polys) == 0: + return sample + sample['gt_poly'] = valid_polys + else: + sample['gt_poly'] = crop_polys + + if 'gt_segm' in sample: + sample['gt_segm'] = self._crop_segm(sample['gt_segm'], + crop_box) + sample['gt_segm'] = np.take( + sample['gt_segm'], valid_ids, axis=0) + + sample['image'] = self._crop_image(sample['image'], crop_box) + if fake_bboxes == True: + return sample + + sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) + sample['gt_class'] = np.take( + sample['gt_class'], valid_ids, axis=0) + if 'gt_score' in sample: + sample['gt_score'] = np.take( + sample['gt_score'], valid_ids, axis=0) + + if 'is_crowd' in sample: + sample['is_crowd'] = np.take( + sample['is_crowd'], valid_ids, axis=0) + + if 'difficult' in sample: + sample['difficult'] = np.take( + sample['difficult'], valid_ids, axis=0) + + if 'gt_joints' in sample: + sample['gt_joints'] = self._crop_joints(sample['gt_joints'], + crop_box) + + return sample + + return sample + + def _iou_matrix(self, a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, np.newaxis] + area_b - area_i) + return area_i / (area_o + 1e-10) + + def _gtcropiou_matrix(self, a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, 
np.newaxis] + area_b - area_i) + return area_i / (area_a + 1e-10) + + def _crop_box_with_center_constraint(self, box, crop): + cropped_box = box.copy() + + cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) + cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) + cropped_box[:, :2] -= crop[:2] + cropped_box[:, 2:] -= crop[:2] + + centers = (box[:, :2] + box[:, 2:]) / 2 + valid = np.logical_and(crop[:2] <= centers, + centers < crop[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return cropped_box, np.where(valid)[0] + + def _crop_image(self, img, crop): + x1, y1, x2, y2 = crop + return img[y1:y2, x1:x2, :] + + def _crop_segm(self, segm, crop): + x1, y1, x2, y2 = crop + return segm[:, y1:y2, x1:x2] + + def _crop_joints(self, joints, crop): + x1, y1, x2, y2 = crop + joints[joints[..., 0] > x2, :] = 0 + joints[joints[..., 1] > y2, :] = 0 + joints[joints[..., 0] < x1, :] = 0 + joints[joints[..., 1] < y1, :] = 0 + joints[..., 0] -= x1 + joints[..., 1] -= y1 + return joints + + +@register_op +class RandomScaledCrop(BaseOperator): + """Resize image and bbox based on long side (with optional random scaling), + then crop or pad image to target size. + Args: + target_size (int|list): target size, "hw" format. + scale_range (list): random scale range. + interp (int): interpolation method, default to `cv2.INTER_LINEAR`. + fill_value (float|list|tuple): color value used to fill the canvas, + in RGB order. 
def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y):
    """Rescale, shift and clip boxes, dropping those left with no area.

    Args:
        gt_bbox (np.ndarray): boxes in ``[x1, y1, x2, y2]`` order.
        gt_class (np.ndarray): per-box labels, indexed like ``gt_bbox``.
        scale (float): resize factor applied to the image.
        offset_x (int): x of the crop origin in the resized image.
        offset_y (int): y of the crop origin in the resized image.

    Returns:
        tuple: (kept boxes, kept labels, indices of the kept boxes).
    """
    target_h, target_w = self.target_size
    crop_origin = np.array(
        [offset_x, offset_y, offset_x, offset_y], dtype=np.float32)
    shifted = gt_bbox * scale - crop_origin

    # Clamp x coordinates to the target width and y to the target height.
    shifted[:, 0::2] = shifted[:, 0::2].clip(0, target_w)
    shifted[:, 1::2] = shifted[:, 1::2].clip(0, target_h)

    # filter boxes with no area
    extent = shifted[..., 2:] - shifted[..., :2]
    keep = np.flatnonzero(extent.prod(axis=1) > 1.)
    return shifted[keep], gt_class[keep], keep
def apply(self, sample, context=None):
    """Randomly rescale the image, then crop/pad it to ``self.target_size``.

    A scale is drawn from ``self.scale_range``, the image is resized with its
    aspect ratio preserved, and a random crop origin is chosen when the
    resized image is larger than the target. Boxes, classes, masks and the
    other per-box fields are transformed/filtered consistently.

    Args:
        sample (dict): must contain ``'image'``; ``'gt_bbox'``/``'gt_class'``,
            ``'gt_segm'``, ``'gt_score'``, ``'is_crowd'``, ``'difficult'`` and
            ``'scale_factor'`` are optional.
        context: unused here.

    Returns:
        dict: the same ``sample`` object, updated in place.
    """
    img = sample['image']
    h, w = img.shape[:2]
    random_scale = np.random.uniform(*self.scale_range)
    target_scale_size = [t * random_scale for t in self.target_size]
    # Compute actual rescaling applied to image.
    scale = min(target_scale_size[0] / h, target_scale_size[1] / w)
    # Keep every side at least 1 px: a side rounded down to 0 cannot be
    # resized to.
    output_size = [
        max(1, int(round(h * scale))), max(1, int(round(w * scale)))
    ]
    # get offset (0 whenever the resized image fits inside the target)
    offset_x = int(
        max(0, np.random.uniform(0., output_size[1] - self.target_size[1])))
    offset_y = int(
        max(0, np.random.uniform(0., output_size[0] - self.target_size[0])))

    # apply to image
    sample['image'] = self.apply_image(img, output_size, offset_x, offset_y)

    # apply to bbox
    valid = None
    if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
        num_boxes = len(sample['gt_bbox'])
        sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox(
            sample['gt_bbox'], sample['gt_class'], scale, offset_x,
            offset_y)
        # Boxes may have been dropped; keep the remaining per-box fields
        # aligned with gt_bbox/gt_class.
        for key in ('gt_score', 'is_crowd', 'difficult'):
            if key in sample and len(sample[key]) == num_boxes:
                sample[key] = np.take(sample[key], valid, axis=0)

    # apply to segm
    if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
        sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size,
                                            offset_x, offset_y, valid)

    sample['im_shape'] = np.asarray(output_size, dtype=np.float32)
    # A sample that has not been resized before carries no scale_factor;
    # treat that as an identity scale instead of failing with KeyError.
    scale_factor = sample.get('scale_factor', (1.0, 1.0))
    sample['scale_factor'] = np.asarray(
        [scale_factor[0] * scale, scale_factor[1] * scale],
        dtype=np.float32)

    return sample
def apply_image(self, img1, img2, factor):
    """Paste a random rectangle of ``img2`` onto ``img1`` (CutMix).

    Both images are placed at the top-left corner of a zero-filled float32
    canvas large enough for either one. The pasted rectangle is centred on a
    uniformly drawn point and its sides are ``sqrt(1 - factor)`` times the
    canvas sides, clipped to the canvas.

    Args:
        img1 (np.ndarray): base image, HWC.
        img2 (np.ndarray): image the patch is taken from, HWC.
        factor (float): mixing factor; 1.0 leaves ``img1`` untouched.

    Returns:
        np.ndarray: float32 canvas holding the mixed image.
    """
    canvas_h = max(img1.shape[0], img2.shape[0])
    canvas_w = max(img1.shape[1], img2.shape[1])

    def _on_canvas(img):
        # Zero-pad `img` to the shared canvas size, anchored top-left.
        padded = np.zeros((canvas_h, canvas_w, img.shape[2]), 'float32')
        padded[:img.shape[0], :img.shape[1], :] = img.astype('float32')
        return padded

    ratio = np.sqrt(1. - factor)
    half_w = np.int32(canvas_w * ratio) // 2
    half_h = np.int32(canvas_h * ratio) // 2

    # uniform centre; x is drawn before y
    center_x = np.random.randint(canvas_w)
    center_y = np.random.randint(canvas_h)

    left, right = np.clip(
        [center_x - half_w, center_x + half_w], 0, canvas_w - 1)
    top, bottom = np.clip(
        [center_y - half_h, center_y + half_h], 0, canvas_h - 1)

    mixed = _on_canvas(img1)
    donor = _on_canvas(img2)
    mixed[top:bottom, left:right, :] = donor[top:bottom, left:right, :]
    return mixed
def apply_image(self, img1, img2, factor):
    """Blend two images as ``factor * img1 + (1 - factor) * img2`` (Mixup).

    Both images are anchored at the top-left corner of a zero-filled canvas
    large enough for either one; regions covered by only one image keep that
    image's weighted contribution.

    Args:
        img1 (np.ndarray): first image, HWC.
        img2 (np.ndarray): second image, HWC.
        factor (float): weight of ``img1``.

    Returns:
        np.ndarray: blended image cast to uint8.
    """
    h1, w1 = img1.shape[:2]
    h2, w2 = img2.shape[:2]
    blended = np.zeros((max(h1, h2), max(w1, w2), img1.shape[2]), 'float32')
    blended[:h1, :w1, :] = img1.astype('float32') * factor
    blended[:h2, :w2, :] += img2.astype('float32') * (1.0 - factor)
    return blended.astype('uint8')
def apply(self, sample, context=None):
    """Normalize box (and keypoint) coordinates by the image size.

    x coordinates are divided by the image width and y coordinates by the
    image height, taken from ``sample['image'].shape``.

    Args:
        sample (dict): must contain ``'image'`` (H, W, C) and ``'gt_bbox'``
            with rows ``[x1, y1, x2, y2]``; ``'gt_keypoint'`` (rows of
            interleaved ``x, y`` values) is optional.
        context: unused here; optional like in the other operators.

    Returns:
        dict: the same ``sample`` object with normalized coordinates.
    """
    height, width, _ = sample['image'].shape

    gt_bbox = sample['gt_bbox']
    if gt_bbox.shape[0] > 0:
        # Integer arrays cannot hold fractions: dividing in place would
        # truncate every coordinate to 0, so switch to float first.
        if not np.issubdtype(gt_bbox.dtype, np.floating):
            gt_bbox = gt_bbox.astype(np.float32)
        gt_bbox[:, 0:4:2] /= width
        gt_bbox[:, 1:4:2] /= height
    sample['gt_bbox'] = gt_bbox

    if 'gt_keypoint' in sample:
        gt_keypoint = sample['gt_keypoint']
        if not np.issubdtype(gt_keypoint.dtype, np.floating):
            gt_keypoint = gt_keypoint.astype(np.float32)
        # Even columns hold x values, odd columns hold y values.
        gt_keypoint[:, 0::2] /= width
        gt_keypoint[:, 1::2] /= height
        sample['gt_keypoint'] = gt_keypoint

    return sample
np.zeros((num_max, ), dtype=np.int32) + if gt_num > 0: + pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] + sample['gt_ide'] = pad_ide + return sample + + +@register_op +class DebugVisibleImage(BaseOperator): + """ + In debug mode, visualize images according to `gt_box`. + (Currently only supported when not cropping and flipping image.) + """ + + def __init__(self, output_dir='output/debug', is_normalized=False): + super(DebugVisibleImage, self).__init__() + self.is_normalized = is_normalized + self.output_dir = output_dir + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + if not isinstance(self.is_normalized, bool): + raise TypeError("{}: input type is invalid.".format(self)) + + def apply(self, sample, context=None): + image = Image.fromarray(sample['image'].astype(np.uint8)) + out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) + width = sample['w'] + height = sample['h'] + gt_bbox = sample['gt_bbox'] + gt_class = sample['gt_class'] + draw = ImageDraw.Draw(image) + for i in range(gt_bbox.shape[0]): + if self.is_normalized: + gt_bbox[i][0] = gt_bbox[i][0] * width + gt_bbox[i][1] = gt_bbox[i][1] * height + gt_bbox[i][2] = gt_bbox[i][2] * width + gt_bbox[i][3] = gt_bbox[i][3] * height + + xmin, ymin, xmax, ymax = gt_bbox[i] + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill='green') + # draw label + text = str(gt_class[i][0]) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + if 'gt_keypoint' in sample.keys(): + gt_keypoint = sample['gt_keypoint'] + if self.is_normalized: + for i in range(gt_keypoint.shape[1]): + if i % 2: + gt_keypoint[:, i] = gt_keypoint[:, i] * height + else: + gt_keypoint[:, i] = gt_keypoint[:, i] * width + for i in range(gt_keypoint.shape[0]): + keypoint = gt_keypoint[i] + for j in range(int(keypoint.shape[0] / 2)): + x1 = round(keypoint[2 * 
@register_op
class Pad(BaseOperator):
    def __init__(self,
                 size=None,
                 size_divisor=32,
                 pad_mode=0,
                 offsets=None,
                 fill_value=(127.5, 127.5, 127.5)):
        """
        Pad image to a specified size or multiple of size_divisor.
        Args:
            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
            size_divisor (int): size divisor, default 32
            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
            fill_value (tuple): rgb value of pad area, default (127.5, 127.5, 127.5)
        Raises:
            TypeError: if size is neither None, an int nor a Sequence.
        """
        super(Pad, self).__init__()

        # None is the documented default ("pad to a multiple of
        # size_divisor") and must be accepted here; `apply` handles it.
        if size is not None and not isinstance(size, (int, Sequence)):
            raise TypeError(
                "Type of size is invalid. Must be int, Sequence or None, "
                "now is {}".format(type(size)))

        if isinstance(size, int):
            size = [size, size]

        assert pad_mode in [
            -1, 0, 1, 2
        ], 'currently only supports four modes [-1, 0, 1, 2]'
        if pad_mode == -1:
            assert offsets, 'if pad_mode is -1, offsets should not be None'

        self.size = size
        self.size_divisor = size_divisor
        self.pad_mode = pad_mode
        self.fill_value = fill_value
        self.offsets = offsets

    def apply_segm(self, segms, offsets, im_size, size):
        """Shift segmentations by ``offsets`` onto the padded canvas.

        Args:
            segms (list): per-instance segmentations, polygons or RLE.
            offsets (list): [offset_x, offset_y] of the image in the canvas.
            im_size (list): [height, width] of the unpadded image.
            size (list): [height, width] of the padded canvas.

        Returns:
            list: segmentations in the same order, expressed on the canvas.
        """

        def _expand_poly(poly, x, y):
            # Flat [x0, y0, x1, y1, ...] polygon: shift x's and y's.
            expanded_poly = np.array(poly)
            expanded_poly[0::2] += x
            expanded_poly[1::2] += y
            return expanded_poly.tolist()

        def _expand_rle(rle, x, y, height, width, h, w):
            # Imported lazily so polygon-only data does not need pycocotools.
            import pycocotools.mask as mask_util
            if 'counts' in rle and isinstance(rle['counts'], list):
                rle = mask_util.frPyObjects(rle, height, width)
            mask = mask_util.decode(rle)
            expanded_mask = np.zeros((h, w), dtype=mask.dtype)
            expanded_mask[y:y + height, x:x + width] = mask
            return mask_util.encode(
                np.array(
                    expanded_mask, order='F', dtype=np.uint8))

        x, y = offsets
        height, width = im_size
        h, w = size
        expanded_segms = []
        for segm in segms:
            if is_poly(segm):
                # Polygon format
                expanded_segms.append(
                    [_expand_poly(poly, x, y) for poly in segm])
            else:
                # RLE format
                expanded_segms.append(
                    _expand_rle(segm, x, y, height, width, h, w))
        return expanded_segms

    def apply_bbox(self, bbox, offsets):
        """Shift ``[x1, y1, x2, y2]`` boxes by ``offsets`` = [dx, dy]."""
        return bbox + np.array(offsets * 2, dtype=np.float32)

    def apply_keypoint(self, keypoints, offsets):
        """Shift rows of interleaved ``x, y`` keypoints by ``offsets``."""
        n = len(keypoints[0]) // 2
        return keypoints + np.array(offsets * n, dtype=np.float32)

    def apply_image(self, image, offsets, im_size, size):
        """Place ``image`` at ``offsets`` on a canvas filled with fill_value.

        Returns:
            np.ndarray: float32 canvas of shape (size[0], size[1], 3).
        """
        x, y = offsets
        im_h, im_w = im_size
        h, w = size
        canvas = np.ones((h, w, 3), dtype=np.float32)
        canvas *= np.array(self.fill_value, dtype=np.float32)
        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
        return canvas

    def apply(self, sample, context=None):
        """Pad ``sample['image']`` and shift its annotations accordingly.

        Args:
            sample (dict): must contain ``'image'``; ``'gt_bbox'``,
                ``'gt_poly'`` and ``'gt_keypoint'`` are shifted when present
                and non-empty.
            context: unused here.

        Returns:
            dict: the same ``sample`` object, updated in place.
        """
        im = sample['image']
        im_h, im_w = im.shape[:2]
        if self.size:
            h, w = self.size
            assert (
                im_h <= h and im_w <= w
            ), '(h, w) of target size should be greater than (im_h, im_w)'
        else:
            # No explicit size: round each side up to a size_divisor multiple.
            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)

        if h == im_h and w == im_w:
            # Already the right size; only the dtype is normalized.
            sample['image'] = im.astype(np.float32)
            return sample

        if self.pad_mode == -1:
            offset_x, offset_y = self.offsets
        elif self.pad_mode == 0:
            offset_y, offset_x = 0, 0
        elif self.pad_mode == 1:
            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
        else:
            offset_y, offset_x = h - im_h, w - im_w

        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]

        sample['image'] = self.apply_image(im, offsets, im_size, size)

        # Padding only on the right/bottom leaves every coordinate unchanged.
        if self.pad_mode == 0:
            return sample
        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
            sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets)

        if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
            sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets,
                                                im_size, size)

        if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0:
            sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'],
                                                        offsets)

        return sample
+ """ + + def __init__(self, del_poly=False): + super(Poly2Mask, self).__init__() + import pycocotools.mask as maskUtils + self.maskutils = maskUtils + self.del_poly = del_poly + + def _poly2mask(self, mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + rle = self.maskutils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = self.maskutils.decode(rle) + return mask + + def apply(self, sample, context=None): + assert 'gt_poly' in sample + im_h, im_w = sample['im_shape'] + masks = [ + self._poly2mask(gt_poly, im_h, im_w) + for gt_poly in sample['gt_poly'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + if self.del_poly: + del (sample['gt_poly']) + + return sample + + +@register_op +class AugmentHSV(BaseOperator): + """ + Augment the SV channel of image data. + Args: + fraction (float): the fraction for augment. Default: 0.5. + is_bgr (bool): whether the image is BGR mode. Default: True. 
+ hgain (float): H channel gains + sgain (float): S channel gains + vgain (float): V channel gains + """ + + def __init__(self, + fraction=0.50, + is_bgr=True, + hgain=None, + sgain=None, + vgain=None): + super(AugmentHSV, self).__init__() + self.fraction = fraction + self.is_bgr = is_bgr + self.hgain = hgain + self.sgain = sgain + self.vgain = vgain + self.use_hsvgain = False if hgain is None else True + + def apply(self, sample, context=None): + img = sample['image'] + if self.is_bgr: + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + else: + img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV) + + if self.use_hsvgain: + hsv_augs = np.random.uniform( + -1, 1, 3) * [self.hgain, self.sgain, self.vgain] + # random selection of h, s, v + hsv_augs *= np.random.randint(0, 2, 3) + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255) + + else: + S = img_hsv[:, :, 1].astype(np.float32) + V = img_hsv[:, :, 2].astype(np.float32) + + a = (random.random() * 2 - 1) * self.fraction + 1 + S *= a + if a > 1: + np.clip(S, a_min=0, a_max=255, out=S) + + a = (random.random() * 2 - 1) * self.fraction + 1 + V *= a + if a > 1: + np.clip(V, a_min=0, a_max=255, out=V) + + img_hsv[:, :, 1] = S.astype(np.uint8) + img_hsv[:, :, 2] = V.astype(np.uint8) + + if self.is_bgr: + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) + else: + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img) + + sample['image'] = img.astype(np.float32) + return sample + + +@register_op +class Norm2PixelBbox(BaseOperator): + """ + Transform the bounding box's coornidates which is in [0,1] to pixels. 
def apply(self, sample, context=None):
    """Convert ``sample['gt_bbox']`` from CXCYWH to XYXY.

    ``[center_x, center_y, width, height] -> [x0, y0, x1, y1]``. The
    conversion is written into a copy; the input array is left untouched.

    Args:
        sample (dict): must contain ``'gt_bbox'``.
        context: unused here.

    Returns:
        dict: the same ``sample`` object with the converted boxes.
    """
    assert 'gt_bbox' in sample
    source = sample['gt_bbox']
    centers = source[:, :2]
    half_extent = source[:, 2:4] / 2.

    converted = source.copy()
    converted[:, :2] = centers - half_extent
    converted[:, 2:4] = centers + half_extent
    sample['gt_bbox'] = converted
    return sample
def __init__(self,
             resizes,
             cropsizes,
             prob=0.5,
             mode='short',
             keep_ratio=True,
             interp=cv2.INTER_LINEAR,
             num_attempts=3,
             cover_all_box=False,
             allow_no_crop=False,
             thresholds=[0.3, 0.5, 0.7],
             is_mask_crop=False,
             ioumode="iou"):
    """Store the options and build the helper resize/crop operators.

    Args:
        resizes (list): candidate resize targets, one is picked per sample.
        cropsizes (list): candidate crop sizes, one is picked per sample.
        prob (float): probability of applying this op.
        mode (str): resize mode, `long` or `short`.
        keep_ratio (bool): forwarded to the helper ``Resize``.
        interp (int): interpolation method, forwarded to ``Resize``.
        num_attempts (int): crop tries per threshold.
        cover_all_box (bool): require every box to pass the threshold.
        allow_no_crop (bool): forwarded to the helper ``RandomCrop``.
        thresholds (list): overlap thresholds for accepting a crop.
        is_mask_crop (bool): whether to crop the segmentation as well.
        ioumode (str): overlap measure used by the crop search, "iou" or
            "iof".
    """
    super(RandomResizeCrop, self).__init__()

    self.resizes = resizes
    self.cropsizes = cropsizes
    self.prob = prob
    self.mode = mode
    self.ioumode = ioumode

    self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp)
    # `_random_crop` reads its settings (including `ioumode`) from the
    # helper crop operator, so the mode has to be forwarded; otherwise the
    # helper silently keeps its own default. The thresholds are copied so
    # the shared default list is never aliased.
    self.croper = RandomCrop(
        num_attempts=num_attempts,
        cover_all_box=cover_all_box,
        thresholds=list(thresholds),
        allow_no_crop=allow_no_crop,
        is_mask_crop=is_mask_crop,
        ioumode=ioumode)
dtype=np.float32)) + elif self.ioumode == "iou": + iou = self._iou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + if self.cover_all_box and iou.min() < thresh: + continue + + cropped_box, valid_ids = self._crop_box_with_center_constraint( + gt_bbox, np.array( + crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + if self.is_mask_crop and 'gt_poly' in sample and len(sample[ + 'gt_poly']) > 0: + crop_polys = self.crop_segms( + sample['gt_poly'], + valid_ids, + np.array( + crop_box, dtype=np.int64), + h, + w) + if [] in crop_polys: + delete_id = list() + valid_polys = list() + for id, crop_poly in enumerate(crop_polys): + if crop_poly == []: + delete_id.append(id) + else: + valid_polys.append(crop_poly) + valid_ids = np.delete(valid_ids, delete_id) + if len(valid_polys) == 0: + return sample + sample['gt_poly'] = valid_polys + else: + sample['gt_poly'] = crop_polys + + if 'gt_segm' in sample: + sample['gt_segm'] = self._crop_segm(sample['gt_segm'], + crop_box) + sample['gt_segm'] = np.take( + sample['gt_segm'], valid_ids, axis=0) + + sample['image'] = self._crop_image(sample['image'], crop_box) + sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) + sample['gt_class'] = np.take( + sample['gt_class'], valid_ids, axis=0) + if 'gt_score' in sample: + sample['gt_score'] = np.take( + sample['gt_score'], valid_ids, axis=0) + + if 'is_crowd' in sample: + sample['is_crowd'] = np.take( + sample['is_crowd'], valid_ids, axis=0) + + if 'gt_areas' in sample: + sample['gt_areas'] = np.take( + sample['gt_areas'], valid_ids, axis=0) + + if 'gt_joints' in sample: + gt_joints = self._crop_joints(sample['gt_joints'], crop_box) + sample['gt_joints'] = gt_joints[valid_ids] + return sample + + return sample + + @staticmethod + def _resize(resizer, sample, size, mode='short', context=None): + self = resizer + im = sample['image'] + target_size = size + + if not isinstance(im, np.ndarray): 
+ raise TypeError("{}: image type is not numpy.".format(self)) + if len(im.shape) != 3: + raise ImageError('{}: image is not 3-dimensional.'.format(self)) + + # apply image + im_shape = im.shape + if self.keep_ratio: + + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + target_size_min = np.min(target_size) + target_size_max = np.max(target_size) + + if mode == 'long': + im_scale = min(target_size_min / im_size_min, + target_size_max / im_size_max) + else: + im_scale = max(target_size_min / im_size_min, + target_size_max / im_size_max) + + resize_h = int(im_scale * float(im_shape[0]) + 0.5) + resize_w = int(im_scale * float(im_shape[1]) + 0.5) + + im_scale_x = im_scale + im_scale_y = im_scale + else: + resize_h, resize_w = target_size + im_scale_y = resize_h / im_shape[0] + im_scale_x = resize_w / im_shape[1] + + im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) + sample['image'] = im + sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) + if 'scale_factor' in sample: + scale_factor = sample['scale_factor'] + sample['scale_factor'] = np.asarray( + [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], + dtype=np.float32) + else: + sample['scale_factor'] = np.asarray( + [im_scale_y, im_scale_x], dtype=np.float32) + + # apply bbox + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], + [im_scale_x, im_scale_y]) + + # apply semantic + if 'semantic' in sample and sample['semantic']: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + 
sample['semantic'] = semantic + + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + + if 'gt_joints' in sample: + sample['gt_joints'] = self.apply_joints(sample['gt_joints'], + [im_scale_x, im_scale_y], + [resize_w, resize_h]) + + return sample + + +@register_op +class RandomSelect(BaseOperator): + """ + Randomly choose a transformation between transforms1 and transforms2, + and the probability of choosing transforms1 is p. + + The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py + + """ + + def __init__(self, transforms1, transforms2, p=0.5): + super(RandomSelect, self).__init__() + self.transforms1 = Compose(transforms1) + self.transforms2 = Compose(transforms2) + self.p = p + + def apply(self, sample, context=None): + if random.random() < self.p: + return self.transforms1(sample) + return self.transforms2(sample) + + +@register_op +class RandomShortSideResize(BaseOperator): + def __init__(self, + short_side_sizes, + max_size=None, + interp=cv2.INTER_LINEAR, + random_interp=False): + """ + Resize the image randomly according to the short side. If max_size is not None, + the long side is scaled according to max_size. The whole process will be keep ratio. + Args: + short_side_sizes (list|tuple): Image target short side size. + max_size (int): The size of the longest side of image after resize. + interp (int): The interpolation method. + random_interp (bool): Whether random select interpolation method. 
def get_size_with_aspect_ratio(self, image_shape, size, max_size=None):
    """Compute the (width, height) that puts the short side at `size`.

    Args:
        image_shape (tuple): current (height, width).
        size (int): requested length of the short side.
        max_size (int, optional): cap for the long side. When the scaled
            long side would exceed it, the short side is reduced and the
            long side is set to `max_size`.

    Returns:
        tuple: (width, height) of the resized image.
    """
    height, width = image_shape
    short_side = min((width, height))
    long_side = max((width, height))

    clipped = False
    if max_size is not None:
        short_f = float(short_side)
        long_f = float(long_side)
        if long_f / short_f * size > max_size:
            # Shrink the short side so the long side lands on max_size.
            size = int(max_size * short_f / long_f)
            clipped = True

    # The short side already has the requested length: nothing to do.
    if short_side == size:
        return (width, height)

    if width < height:
        out_w = size
        out_h = max_size if clipped else int(round(size * height / width))
    else:
        out_h = size
        out_w = max_size if clipped else int(round(size * width / height))

    return (out_w, out_h)
def apply_bbox(self, bbox, scale, size):
    """Scale boxes per axis and clip them to the resized image.

    Args:
        bbox (np.ndarray): 2-D array; even columns are scaled and clipped
            as x coordinates, odd columns as y coordinates.
        scale (sequence): (im_scale_x, im_scale_y).
        size (sequence): (resize_w, resize_h), the clipping bounds.

    Returns:
        np.ndarray: a new float32 array; `bbox` itself is left unchanged.
    """
    im_scale_x, im_scale_y = scale
    resize_w, resize_h = size

    # Work on a floating-point copy: in-place scaling of an integer array
    # raises a casting error, and the caller's array should not be modified
    # as a side effect. result_type keeps float inputs in their own
    # precision for the arithmetic.
    boxes = np.asarray(bbox)
    scaled = boxes.astype(np.result_type(boxes.dtype, np.float32))
    scaled[:, 0::2] *= im_scale_x
    scaled[:, 1::2] *= im_scale_y
    scaled[:, 0::2] = np.clip(scaled[:, 0::2], 0, resize_w)
    scaled[:, 1::2] = np.clip(scaled[:, 1::2], 0, resize_h)
    return scaled.astype('float32')
def apply_segm(self, segms, im_size, scale):
    """Rescale a list of segmentations by per-axis factors.

    Args:
        segms (list): one entry per instance. Entries accepted by
            `is_poly` are treated as lists of flat polygons; every other
            entry is treated as an RLE mask.
        im_size (tuple): (height, width) of the image before resizing,
            used to convert uncompressed RLE.
        scale (sequence): (im_scale_x, im_scale_y).

    Returns:
        list: resized segmentations in the input order.
    """
    im_h, im_w = im_size
    im_scale_x, im_scale_y = scale

    def _resize_poly(poly):
        # Flat polygon: even entries are x, odd entries are y.
        resized_poly = np.array(poly).astype('float32')
        resized_poly[0::2] *= im_scale_x
        resized_poly[1::2] *= im_scale_y
        return resized_poly.tolist()

    def _resize_rle(rle):
        # Imported only when an RLE entry is actually met, so polygon-only
        # data does not require pycocotools.
        import pycocotools.mask as mask_util

        if 'counts' in rle and isinstance(rle['counts'], list):
            # A list of counts is uncompressed RLE; convert it first.
            rle = mask_util.frPyObjects(rle, im_h, im_w)

        mask = mask_util.decode(rle)
        mask = cv2.resize(
            mask,
            None,
            None,
            fx=im_scale_x,
            fy=im_scale_y,
            interpolation=self.interp)
        return mask_util.encode(np.array(mask, order='F', dtype=np.uint8))

    resized_segms = []
    for segm in segms:
        if is_poly(segm):
            # Polygon format
            resized_segms.append([_resize_poly(poly) for poly in segm])
        else:
            # RLE format
            resized_segms.append(_resize_rle(segm))

    return resized_segms
+ """ + super(RandomShortSideRangeResize, self).__init__(scales, None, interp, + random_interp) + + assert isinstance(scales, + Sequence), "short_side_sizes must be List or Tuple" + + self.scales = scales + + def random_sample(self, img_scales): + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale + + def apply(self, sample, context=None): + long_edge, short_edge = self.random_sample(self.short_side_sizes) + # print("target size:{}".format((long_edge, short_edge))) + interp = random.choice( + self.interps) if self.random_interp else self.interp + + return self.resize(sample, short_edge, long_edge, interp) + + +@register_op +class RandomSizeCrop(BaseOperator): + """ + Cut the image randomly according to `min_size` and `max_size` + Args: + min_size (int): Min size for edges of cropped image. + max_size (int): Max size for edges of cropped image. If it + is set to larger than length of the input image, + the output will keep the origin length. + keep_empty (bool): Whether to keep the cropped result with no object. + If it is set to False, the no-object result will not + be returned, replaced by the original input. + """ + + def __init__(self, min_size, max_size, keep_empty=True): + super(RandomSizeCrop, self).__init__() + self.min_size = min_size + self.max_size = max_size + self.keep_empty = keep_empty + + from paddle.vision.transforms.functional import crop as paddle_crop + self.paddle_crop = paddle_crop + + @staticmethod + def get_crop_params(img_shape, output_size): + """Get parameters for ``crop`` for a random crop. + Args: + img_shape (list|tuple): Image's height and width. + output_size (list|tuple): Expected output size of the crop. 
+ Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + h, w = img_shape + th, tw = output_size + + if h + 1 < th or w + 1 < tw: + raise ValueError( + "Required crop size {} is larger then input image size {}". + format((th, tw), (h, w))) + + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th + 1) + j = random.randint(0, w - tw + 1) + return i, j, th, tw + + def crop(self, sample, region): + keep_index = None + # apply bbox and check whether the cropped result is valid + if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: + croped_bbox = self.apply_bbox(sample['gt_bbox'], region) + bbox = croped_bbox.reshape([-1, 2, 2]) + area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1) + keep_index = np.where(area > 0)[0] + + if not self.keep_empty and len(keep_index) == 0: + # When keep_empty is set to False, cropped with no-object will + # not be used and return the origin content. + return sample + + sample['gt_bbox'] = croped_bbox[keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 4], dtype=np.float32) + sample['gt_class'] = sample['gt_class'][keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 1], dtype=np.float32) + if 'gt_score' in sample: + sample['gt_score'] = sample['gt_score'][keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 1], dtype=np.float32) + if 'is_crowd' in sample: + sample['is_crowd'] = sample['is_crowd'][keep_index] if len( + keep_index) > 0 else np.zeros( + [0, 1], dtype=np.float32) + if 'gt_areas' in sample: + sample['gt_areas'] = np.take( + sample['gt_areas'], keep_index, axis=0) + + image_shape = sample['image'].shape[:2] + sample['image'] = self.paddle_crop(sample['image'], *region) + sample['im_shape'] = np.array( + sample['image'].shape[:2], dtype=np.float32) + + # apply polygon + if 'gt_poly' in sample and len(sample['gt_poly']) > 0: + sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region, + image_shape) + sample['gt_poly'] = np.array(sample['gt_poly']) + 
if keep_index is not None and len(keep_index) > 0: + sample['gt_poly'] = sample['gt_poly'][keep_index] + sample['gt_poly'] = sample['gt_poly'].tolist() + # apply gt_segm + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + i, j, h, w = region + sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w] + if keep_index is not None and len(keep_index) > 0: + sample['gt_segm'] = sample['gt_segm'][keep_index] + + if 'gt_joints' in sample: + gt_joints = self._crop_joints(sample['gt_joints'], region) + sample['gt_joints'] = gt_joints + if keep_index is not None: + sample['gt_joints'] = sample['gt_joints'][keep_index] + + return sample + + def apply_bbox(self, bbox, region): + i, j, h, w = region + region_size = np.asarray([w, h]) + crop_bbox = bbox - np.asarray([j, i, j, i]) + crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), region_size) + crop_bbox = crop_bbox.clip(min=0) + return crop_bbox.reshape([-1, 4]).astype('float32') + + def _crop_joints(self, joints, region): + y1, x1, h, w = region + x2 = x1 + w + y2 = y1 + h + # x1, y1, x2, y2 = crop + joints[..., 0] -= x1 + joints[..., 1] -= y1 + joints[joints[..., 0] > w, :] = 0 + joints[joints[..., 1] > h, :] = 0 + joints[joints[..., 0] < 0, :] = 0 + joints[joints[..., 1] < 0, :] = 0 + return joints + + def apply_segm(self, segms, region, image_shape): + def _crop_poly(segm, crop): + xmin, ymin, xmax, ymax = crop + crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] + crop_p = np.array(crop_coord).reshape(4, 2) + crop_p = Polygon(crop_p) + + crop_segm = list() + for poly in segm: + poly = np.array(poly).reshape(len(poly) // 2, 2) + polygon = Polygon(poly) + if not polygon.is_valid: + exterior = polygon.exterior + multi_lines = exterior.intersection(exterior) + polygons = shapely.ops.polygonize(multi_lines) + polygon = MultiPolygon(polygons) + multi_polygon = list() + if isinstance(polygon, MultiPolygon): + multi_polygon = copy.deepcopy(polygon) + else: + multi_polygon.append(copy.deepcopy(polygon)) + 
@register_op
class CenterRandColor(BaseOperator):
    """Random color for CenterNet series models.

    Applies brightness, contrast and saturation jitter in a random order.
    The image arrays are modified in place.

    Args:
        saturation (float): saturation settings.
        contrast (float): contrast settings.
        brightness (float): brightness settings.
    """

    def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4):
        super(CenterRandColor, self).__init__()
        # Each value is the half-width of the uniform range the matching
        # blend factor is drawn from, i.e. alpha in [1 - v, 1 + v].
        self.saturation = saturation
        self.contrast = contrast
        self.brightness = brightness

    def apply_saturation(self, img, img_gray):
        """Blend `img` with its per-pixel gray value; returns `img`."""
        alpha = 1. + np.random.uniform(
            low=-self.saturation, high=self.saturation)
        # NOTE: `img_gray[:, :, None]` is a view, so the in-place scaling
        # done by `_blend` also rescales `img_gray` itself; a distortion
        # applied after this one sees the rescaled gray image.
        self._blend(alpha, img, img_gray[:, :, None])
        return img

    def apply_contrast(self, img, img_gray):
        """Blend `img` with the mean of `img_gray`; returns `img`."""
        alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast)
        img_mean = img_gray.mean()
        self._blend(alpha, img, img_mean)
        return img

    def apply_brightness(self, img, img_gray):
        """Scale `img` in place by a random factor; `img_gray` is unused."""
        alpha = 1 + np.random.uniform(
            low=-self.brightness, high=self.brightness)
        # NOTE(review): the in-place multiply by a float presumably expects
        # a floating-point image -- confirm against the upstream ops.
        img *= alpha
        return img

    def _blend(self, alpha, img, img_mean):
        """Compute img = alpha * img + (1 - alpha) * img_mean, in place."""
        img *= alpha
        # In place as well: when `img_mean` is an array this changes the
        # caller's data (see apply_saturation).
        img_mean *= (1 - alpha)
        img += img_mean

    def apply(self, sample, context=None):
        """Jitter `sample['image']` and, if present, `sample['pre_image']`.

        Each image gets its own random ordering of the three distortions.
        """
        functions = [
            self.apply_brightness,
            self.apply_contrast,
            self.apply_saturation,
        ]

        img = sample['image']
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Shuffle the order in which the distortions are applied.
        distortions = np.random.permutation(functions)
        for func in distortions:
            img = func(img, img_gray)
        sample['image'] = img

        if 'pre_image' in sample:
            pre_img = sample['pre_image']
            pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY)
            pre_distortions = np.random.permutation(functions)
            for func in pre_distortions:
                pre_img = func(pre_img, pre_img_gray)
            sample['pre_image'] = pre_img

        return sample
def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w):
    """Locate one tile on the mosaic canvas and the matching source crop.

    The canvas is (2 * input_h, 2 * input_w) and the four tiles meet at
    (xc, yc).

    Args:
        mosaic_idx (int): 0 top-left, 1 top-right, 2 bottom-left,
            3 bottom-right.
        xc (int): x of the point where the tiles meet.
        yc (int): y of the point where the tiles meet.
        w (int): width of the tile image.
        h (int): height of the tile image.
        input_h (int): half of the canvas height.
        input_w (int): half of the canvas width.

    Returns:
        tuple: ((x1, y1, x2, y2), (sx1, sy1, sx2, sy2)) -- the destination
            rectangle on the canvas and the source rectangle in the tile.

    Raises:
        ValueError: if `mosaic_idx` is not one of 0, 1, 2, 3.
    """
    # (x1, y1, x2, y2) means coords in large image,
    # small_coords means coords in small image in mosaic aug.
    if mosaic_idx == 0:
        # top left
        x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
        small_coords = w - (x2 - x1), h - (y2 - y1), w, h
    elif mosaic_idx == 1:
        # top right
        x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
        small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h
    elif mosaic_idx == 2:
        # bottom left
        x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
        small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h)
    elif mosaic_idx == 3:
        # bottom right
        x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2,
                                                               yc + h)
        small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "mosaic_idx must be 0, 1, 2 or 3, got {}".format(mosaic_idx))

    return (x1, y1, x2, y2), small_coords
+ R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) + M = np.ones([2, 3]) + + # random shear + shear = random.uniform(shears[0], shears[1]) + shear_x = math.tan(shear * math.pi / 180) + shear_y = math.tan(shear * math.pi / 180) + M[0] = R[0] + shear_y * R[1] + M[1] = R[1] + shear_x * R[0] + + # random translation + translate = random.uniform(translates[0], translates[1]) + translation_x = translate * input_dim[0] + translation_y = translate * input_dim[1] + M[0, 2] = translation_x + M[1, 2] = translation_y + + # warpAffine + img = cv2.warpAffine( + img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) + + num_gts = len(labels) + if num_gts > 0: + # warp corner points + corner_points = np.ones((4 * num_gts, 3)) + corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 + # apply affine transform + corner_points = corner_points @M.T + corner_points = corner_points.reshape(num_gts, 8) + + # create new boxes + corner_xs = corner_points[:, 0::2] + corner_ys = corner_points[:, 1::2] + new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), + corner_xs.max(1), corner_ys.max(1))) + new_bboxes = new_bboxes.reshape(4, num_gts).T + + # clip boxes + new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) + new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) + labels[:, :4] = new_bboxes + + return img, labels + + def __call__(self, sample, context=None): + if not isinstance(sample, Sequence): + return sample + + assert len( + sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." + if np.random.uniform(0., 1.) 
> self.prob: + return sample[0] + + mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] + input_h, input_w = self.input_dim + yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) + xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) + mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) + + # 1. get mosaic coords + for mosaic_idx, sp in enumerate(sample[:4]): + img = sp['image'] + gt_bbox = sp['gt_bbox'] + h0, w0 = img.shape[:2] + scale = min(1. * input_h / h0, 1. * input_w / w0) + img = cv2.resize( + img, (int(w0 * scale), int(h0 * scale)), + interpolation=cv2.INTER_LINEAR) + (h, w, c) = img.shape[:3] + + # suffix l means large image, while s means small image in mosaic aug. + (l_x1, l_y1, l_x2, l_y2), ( + s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords( + mosaic_idx, xc, yc, w, h, input_h, input_w) + + mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] + padw, padh = l_x1 - s_x1, l_y1 - s_y1 + + # Normalized xywh to pixel xyxy format + _gt_bbox = gt_bbox.copy() + if len(gt_bbox) > 0: + _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw + _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh + _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw + _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh + + mosaic_gt_bbox.append(_gt_bbox) + mosaic_gt_class.append(sp['gt_class']) + if 'is_crowd' in sp: + mosaic_is_crowd.append(sp['is_crowd']) + if 'difficult' in sp: + mosaic_difficult.append(sp['difficult']) + + # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd]) + if len(mosaic_gt_bbox): + mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0) + mosaic_gt_class = np.concatenate(mosaic_gt_class, 0) + if mosaic_is_crowd: + mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0) + mosaic_labels = np.concatenate([ + mosaic_gt_bbox, + mosaic_gt_class.astype(mosaic_gt_bbox.dtype), + mosaic_is_crowd.astype(mosaic_gt_bbox.dtype) + ], 1) + elif mosaic_difficult: + mosaic_difficult = np.concatenate(mosaic_difficult, 0) + mosaic_labels = np.concatenate([ + mosaic_gt_bbox, + mosaic_gt_class.astype(mosaic_gt_bbox.dtype), + mosaic_difficult.astype(mosaic_gt_bbox.dtype) + ], 1) + else: + mosaic_labels = np.concatenate([ + mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype) + ], 1) + if self.remove_outside_box: + # for MOT dataset + flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w + flag2 = mosaic_gt_bbox[:, 2] > 0 + flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h + flag4 = mosaic_gt_bbox[:, 3] > 0 + flag_all = flag1 * flag2 * flag3 * flag4 + mosaic_labels = mosaic_labels[flag_all] + else: + mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0, + 2 * input_w) + mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0, + 2 * input_h) + mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0, + 2 * input_w) + mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0, + 2 * input_h) + else: + mosaic_labels = np.zeros((1, 6)) + + # 3. random_affine augment + mosaic_img, mosaic_labels = self.random_affine_augment( + mosaic_img, + mosaic_labels, + input_dim=self.input_dim, + degrees=self.degrees, + translates=self.translate, + scales=self.scale, + shears=self.shear) + + # 4. 
Mixup augment as copypaste, https://arxiv.org/abs/2012.07177 + # optinal, not used(enable_mixup=False) in tiny/nano + if (self.enable_mixup and not len(mosaic_labels) == 0 and + random.random() < self.mixup_prob): + sample_mixup = sample[4] + mixup_img = sample_mixup['image'] + if 'is_crowd' in sample_mixup: + cp_labels = np.concatenate([ + sample_mixup['gt_bbox'], + sample_mixup['gt_class'].astype(mosaic_labels.dtype), + sample_mixup['is_crowd'].astype(mosaic_labels.dtype) + ], 1) + elif 'difficult' in sample_mixup: + cp_labels = np.concatenate([ + sample_mixup['gt_bbox'], + sample_mixup['gt_class'].astype(mosaic_labels.dtype), + sample_mixup['difficult'].astype(mosaic_labels.dtype) + ], 1) + else: + cp_labels = np.concatenate([ + sample_mixup['gt_bbox'], + sample_mixup['gt_class'].astype(mosaic_labels.dtype) + ], 1) + mosaic_img, mosaic_labels = self.mixup_augment( + mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img) + + sample0 = sample[0] + sample0['image'] = mosaic_img.astype(np.uint8) # can not be float32 + sample0['h'] = float(mosaic_img.shape[0]) + sample0['w'] = float(mosaic_img.shape[1]) + sample0['im_shape'][0] = sample0['h'] + sample0['im_shape'][1] = sample0['w'] + sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32) + sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32) + if 'is_crowd' in sample[0]: + sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32) + if 'difficult' in sample[0]: + sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32) + return sample0 + + def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels, + img): + jit_factor = random.uniform(*self.mixup_scale) + FLIP = random.uniform(0, 1) > 0.5 + if len(img.shape) == 3: + cp_img = np.ones( + (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 + else: + cp_img = np.ones(input_dim, dtype=np.uint8) * 114 + + cp_scale_ratio = min(input_dim[0] / img.shape[0], + input_dim[1] / img.shape[1]) + resized_img = cv2.resize( + img, 
(int(img.shape[1] * cp_scale_ratio), + int(img.shape[0] * cp_scale_ratio)), + interpolation=cv2.INTER_LINEAR) + + cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[ + 1] * cp_scale_ratio)] = resized_img + + cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor), + int(cp_img.shape[0] * jit_factor))) + cp_scale_ratio *= jit_factor + + if FLIP: + cp_img = cp_img[:, ::-1, :] + + origin_h, origin_w = cp_img.shape[:2] + target_h, target_w = origin_img.shape[:2] + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3), + dtype=np.uint8) + padded_img[:origin_h, :origin_w] = cp_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset: + x_offset + target_w] + + # adjust boxes + cp_bboxes_origin_np = cp_labels[:, :4].copy() + cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] * + cp_scale_ratio, 0, origin_w) + cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] * + cp_scale_ratio, 0, origin_h) + + if FLIP: + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) + cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() + if self.remove_outside_box: + # for MOT dataset + cp_bboxes_transformed_np[:, 0::2] -= x_offset + cp_bboxes_transformed_np[:, 1::2] -= y_offset + else: + cp_bboxes_transformed_np[:, 0::2] = np.clip( + cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) + cp_bboxes_transformed_np[:, 1::2] = np.clip( + cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) + + cls_labels = cp_labels[:, 4:5].copy() + box_labels = cp_bboxes_transformed_np + if cp_labels.shape[-1] == 6: + crd_labels = cp_labels[:, 5:6].copy() + labels = np.hstack((box_labels, cls_labels, crd_labels)) + else: + labels = 
@register_op
class PadResize(BaseOperator):
    """Resize an image to fit ``target_size`` and pad the remainder.

    The image is scaled with a single ratio (aspect ratio kept) so that both
    sides fit inside ``target_size``; it is then placed in the top-left corner
    of a ``target_size`` canvas filled with ``fill_value``. ``gt_bbox`` is
    scaled by the same ratio and boxes whose shorter side is not larger than
    one pixel are dropped together with their ``gt_class`` entries.

    Args:
        target_size (int|list[int]): output (height, width); an int is
            expanded to a square.
        fill_value (float): pixel value of the padded area.
    """

    def __init__(self, target_size, fill_value=114):
        super(PadResize, self).__init__()
        if isinstance(target_size, Integral):
            target_size = [target_size, target_size]
        self.target_size = target_size
        self.fill_value = fill_value

    def _resize(self, img, bboxes, labels):
        """Scale ``img`` and ``bboxes`` by the ratio that fits ``target_size``.

        Returns:
            tuple: (resized image, kept boxes, kept labels).
        """
        ratio = min(self.target_size[0] / img.shape[0],
                    self.target_size[1] / img.shape[1])
        w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio)
        resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR)

        if len(bboxes) > 0:
            # Build a new array instead of ``bboxes *= ratio`` so the caller's
            # array is never modified behind its back.
            bboxes = bboxes * ratio
            short_side = np.minimum(bboxes[:, 2] - bboxes[:, 0],
                                    bboxes[:, 3] - bboxes[:, 1])
            mask = short_side > 1
            bboxes = bboxes[mask]
            labels = labels[mask]
        return resized_img, bboxes, labels

    def _pad(self, img):
        """Return ``img`` on a ``target_size`` canvas (uint8, 3 channels)."""
        h, w, _ = img.shape
        if h == self.target_size[0] and w == self.target_size[1]:
            return img
        padded_img = np.full(
            (self.target_size[0], self.target_size[1], 3),
            self.fill_value,
            dtype=np.uint8)
        padded_img[:h, :w] = img
        return padded_img

    def apply(self, sample, context=None):
        """Resize and pad ``sample['image']``; rescale and filter its boxes."""
        image, bboxes, labels = self._resize(
            sample['image'], sample['gt_bbox'], sample['gt_class'])
        sample['image'] = self._pad(image).astype(np.float32)
        sample['gt_bbox'] = bboxes
        sample['gt_class'] = labels
        return sample


@register_op
class RandomShift(BaseOperator):
    """
    Randomly shift image

    Args:
        prob (float): probability to do random shift.
        max_shift (int): max shift pixels
        filter_thr (int): filter gt bboxes if one side is smaller than this
    """

    def __init__(self, prob=0.5, max_shift=32, filter_thr=1):
        super(RandomShift, self).__init__()
        self.prob = prob
        self.max_shift = max_shift
        self.filter_thr = filter_thr

    def calc_shift_coor(self, im_h, im_w, shift_h, shift_w):
        """Return [x1, y1, x2, y2] of the image region that stays visible
        when the content is moved by (``shift_w``, ``shift_h``)."""
        return [
            max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w),
            min(im_h, im_h + shift_h)
        ]

    def apply(self, sample, context=None):
        """Shift image and boxes by a random offset in [-max_shift, max_shift].

        The sample is returned untouched when the operator is skipped by
        ``prob`` or when no box would survive the shift.
        """
        if random.random() > self.prob:
            return sample

        im = sample['image']
        # Work on a copy: the original code shifted ``sample['gt_bbox']`` in
        # place and could then return early, leaving shifted boxes paired
        # with an unshifted image.
        gt_bbox = sample['gt_bbox'].copy()
        gt_class = sample['gt_class']
        im_h, im_w = im.shape[:2]
        shift_h = random.randint(-self.max_shift, self.max_shift)
        shift_w = random.randint(-self.max_shift, self.max_shift)

        gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2] + shift_w, 0, im_w)
        gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2] + shift_h, 0, im_h)
        gt_bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0]
        gt_bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1]
        keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr)
        if not keep.any():
            return sample

        gt_bbox = gt_bbox[keep]
        gt_class = gt_class[keep]

        # shift image
        coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w)
        # shift frame to the opposite direction
        coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w)
        canvas = np.zeros_like(im)
        canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \
            = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]]

        sample['image'] = canvas
        sample['gt_bbox'] = gt_bbox
        sample['gt_class'] = gt_class
        return sample


@register_op
class StrongAugImage(BaseOperator):
    """Run a composed transform pipeline on the sample and keep its image.

    Args:
        transforms: forwarded unchanged to ``Compose``.
            NOTE(review): ``Compose`` is defined outside this chunk; the
            expected element type of ``transforms`` should be confirmed there.
    """

    def __init__(self, transforms):
        super(StrongAugImage, self).__init__()
        self.transforms = Compose(transforms)

    def apply(self, sample, context=None):
        """Cast the image to uint8, run the pipeline, store the result."""
        # ``im`` is the same dict as ``sample`` (no copy is made).
        im = sample
        im['image'] = sample['image'].astype('uint8')
        results = self.transforms(im)
        sample['image'] = results['image'].astype('uint8')
        return sample


@register_op
class RandomColorJitter(BaseOperator):
    """Apply ``paddle.vision.transforms.ColorJitter`` with probability ``prob``.

    Args:
        prob (float): probability of applying the jitter.
        brightness, contrast, saturation, hue (float): passed positionally
            to ``ColorJitter`` in this order.
    """

    def __init__(self,
                 prob=0.8,
                 brightness=0.4,
                 contrast=0.4,
                 saturation=0.4,
                 hue=0.1):
        super(RandomColorJitter, self).__init__()
        self.prob = prob
        self.brightness = brightness
        self.contrast = contrast
        self.saturation = saturation
        self.hue = hue

    def apply(self, sample, context=None):
        """Jitter the image on a uint8 view and store it back as float32."""
        if np.random.uniform(0, 1) < self.prob:
            # Imported lazily so paddle.vision is only needed when used.
            from paddle.vision.transforms import ColorJitter
            transform = ColorJitter(self.brightness, self.contrast,
                                    self.saturation, self.hue)
            sample['image'] = transform(sample['image'].astype(np.uint8))
            sample['image'] = sample['image'].astype(np.float32)
        return sample


@register_op
class RandomGrayscale(BaseOperator):
    """Convert the image to 3-channel grayscale with probability ``prob``."""

    def __init__(self, prob=0.2):
        super(RandomGrayscale, self).__init__()
        self.prob = prob

    def apply(self, sample, context=None):
        if np.random.uniform(0, 1) < self.prob:
            # Imported lazily so paddle.vision is only needed when used.
            from paddle.vision.transforms import Grayscale
            transform = Grayscale(num_output_channels=3)
            sample['image'] = transform(sample['image'])
        return sample


@register_op
class RandomGaussianBlur(BaseOperator):
    """Blur the image with a 23x23 Gaussian kernel with probability ``prob``.

    Args:
        prob (float): probability of applying the blur.
        sigma (sequence[float]): (min, max) range the kernel sigma is drawn
            from uniformly.
    """

    # The default is a tuple rather than a list so it cannot be mutated and
    # shared between instances.
    def __init__(self, prob=0.5, sigma=(0.1, 2.0)):
        super(RandomGaussianBlur, self).__init__()
        self.prob = prob
        self.sigma = sigma

    def apply(self, sample, context=None):
        if np.random.uniform(0, 1) < self.prob:
            sigma = np.random.uniform(self.sigma[0], self.sigma[1])
            im = cv2.GaussianBlur(sample['image'], (23, 23), sigma)
            sample['image'] = im
        return sample


@register_op
class RandomErasing(BaseOperator):
    """Erase one random rectangle of the image with probability ``prob``.

    Args:
        prob (float): probability of erasing.
        scale (tuple|list): (min, max) erased area as a fraction of the
            image area, within [0, 1].
        ratio (tuple|list): (min, max) value of the aspect-ratio factor
            (drawn log-uniformly).
        value (number|str|tuple|list): fill value. A number or a sequence of
            length 1 or 3 is written as-is; the string ``"random"`` fills the
            rectangle with normally distributed noise scaled by 255.
        inplace (bool): erase in the input array instead of a copy.

    Raises:
        AssertionError: if ``scale``, ``ratio`` or ``value`` are malformed.
        ValueError: if ``value`` is a string other than ``"random"``.
    """

    def __init__(self,
                 prob=0.5,
                 scale=(0.02, 0.33),
                 ratio=(0.3, 3.3),
                 value=0,
                 inplace=False):
        super(RandomErasing, self).__init__()
        assert isinstance(scale,
                          (tuple, list)), "scale should be a tuple or list"
        assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1]
                ), "scale should be of kind (min, max) and in range [0, 1]"
        assert isinstance(ratio,
                          (tuple, list)), "ratio should be a tuple or list"
        assert (ratio[0] >= 0 and
                ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
        assert isinstance(
            value, (Number, str, tuple,
                    list)), "value should be a number, tuple, list or str"
        if isinstance(value, str) and value != "random":
            raise ValueError("value must be 'random' when type is str")
        self.prob = prob
        self.scale = scale
        self.ratio = ratio
        self.value = value
        self.inplace = inplace

    def _erase(self, img, i, j, h, w, v, inplace=False):
        """Write ``v`` into rows ``i:i+h`` and columns ``j:j+w`` of ``img``."""
        if not inplace:
            img = img.copy()
        img[i:i + h, j:j + w, ...] = v
        return img

    def _get_param(self, img, scale, ratio, value):
        """Draw the rectangle to erase and the fill content.

        Returns:
            tuple: (top, left, height, width, fill). When the drawn rectangle
            does not fit strictly inside the image, the whole image is
            returned with ``img`` itself as fill, which makes the erase a
            no-op.
        """
        # Only the shape is needed, so no dtype conversion/copy is made.
        shape = np.asarray(img).shape
        h, w, c = shape[-3], shape[-2], shape[-1]
        img_area = h * w
        log_ratio = np.log(ratio)

        # A single attempt is made.
        erase_area = np.random.uniform(*scale) * img_area
        aspect_ratio = np.exp(np.random.uniform(*log_ratio))
        erase_h = int(round(np.sqrt(erase_area * aspect_ratio)))
        erase_w = int(round(np.sqrt(erase_area / aspect_ratio)))
        if erase_h >= h or erase_w >= w:
            return 0, 0, h, w, img

        if value is None:
            v = np.random.normal(size=[erase_h, erase_w, c]) * 255
        else:
            # Shape (1, 1, k) so the fill broadcasts over the rectangle.
            v = np.array(value)[None, None, :]
        top = np.random.randint(0, h - erase_h + 1)
        left = np.random.randint(0, w - erase_w + 1)
        return top, left, erase_h, erase_w, v

    def apply(self, sample, context=None):
        """Erase a random rectangle of ``sample['image']``.

        Raises:
            ValueError: if the fill sequence has a length other than 1 or 3.
        """
        if random.random() < self.prob:
            if isinstance(self.value, Number):
                value = [self.value]
            elif isinstance(self.value, str):
                # "random": _get_param generates noise when value is None.
                value = None
            else:
                value = self.value
            if value is not None and not (len(value) == 1 or len(value) == 3):
                raise ValueError(
                    "Value should be a single number or a sequence with length equals to image's channel."
                )
            im = sample['image']
            top, left, erase_h, erase_w, v = self._get_param(im, self.scale,
                                                             self.ratio, value)
            im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace)
            sample['image'] = im
        return sample


@register_op
class RandomErasingCrop(BaseOperator):
    """Chain three ``RandomErasing`` ops with random-noise fill.

    The three ops use decreasing probability (0.7, 0.5, 0.3) and widening
    aspect-ratio ranges; all erase 5%-20% of the image area.
    """

    def __init__(self):
        super(RandomErasingCrop, self).__init__()
        self.transform1 = RandomErasing(
            prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random")
        self.transform2 = RandomErasing(
            prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random")
        self.transform3 = RandomErasing(
            prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random")

    def apply(self, sample, context=None):
        sample = self.transform1(sample)
        sample = self.transform2(sample)
        sample = self.transform3(sample)
        return sample
import numbers

import numpy as np

try:
    from collections.abc import Sequence, Mapping
except ImportError:
    # Older interpreters expose these ABCs directly in ``collections``.
    from collections import Sequence, Mapping


def default_collate_fn(batch):
    """
    Default batch collating function for :code:`paddle.io.DataLoader`.

    ``batch`` is a non-empty list of samples. Each sample may be a number,
    a numpy array, a string, or a (possibly nested) dict / sequence of
    those. The type of the first sample decides how the whole batch is
    merged:

    - numpy arrays are stacked along a new leading axis;
    - numbers are packed into one numpy array;
    - strings / bytes are returned as the input container, unchanged;
    - mappings are collated key by key (keys taken from the first sample);
    - other sequences are collated field by field.

    e.g. for following input data:
        [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
         {'image': np.array(shape=[3, 224, 224]), 'label': 3},
         {'image': np.array(shape=[3, 224, 224]), 'label': 4},
         {'image': np.array(shape=[3, 224, 224]), 'label': 5},]

    the result is:
        {'image': np.array(shape=[4, 3, 224, 224]),
         'label': np.array([1, 3, 4, 5])}

    Args:
        batch(list of sample data): batch should be a list of sample data.

    Returns:
        Batched data with the same nesting structure as one sample.

    Raises:
        RuntimeError: if sequence samples do not all have the same length.
        TypeError: if a sample has an unsupported type.
    """
    sample = batch[0]
    if isinstance(sample, np.ndarray):
        return np.stack(batch, axis=0)
    if isinstance(sample, numbers.Number):
        return np.array(batch)
    # Must be tested before ``Sequence``: str is itself a Sequence.
    if isinstance(sample, (str, bytes)):
        return batch
    if isinstance(sample, Mapping):
        return {
            key: default_collate_fn([d[key] for d in batch])
            for key in sample
        }
    if isinstance(sample, Sequence):
        sample_fields_num = len(sample)
        if not all(len(item) == sample_fields_num for item in batch):
            raise RuntimeError(
                "fields number not same among samples in a batch")
        return [default_collate_fn(fields) for fields in zip(*batch)]

    raise TypeError("batch data can only contain: numpy.ndarray, dict, "
                    "list, number, str, bytes, but got {}".format(
                        type(sample)))
class Callback(object):
    """Base class of training/evaluation hooks.

    Every hook receives the shared ``status`` dict and does nothing by
    default; subclasses override the ones they need.

    Args:
        model: the object driving training; stored as ``self.model``.
    """

    def __init__(self, model):
        self.model = model

    def on_step_begin(self, status):
        pass

    def on_step_end(self, status):
        pass

    def on_epoch_begin(self, status):
        pass

    def on_epoch_end(self, status):
        pass

    def on_train_begin(self, status):
        pass

    def on_train_end(self, status):
        pass


class ComposeCallback(object):
    """Fan every hook out to a list of callbacks, in order.

    Args:
        callbacks (iterable): ``Callback`` instances; ``None`` entries are
            dropped.

    Raises:
        AssertionError: if an entry is not a ``Callback``.
    """

    def __init__(self, callbacks):
        callbacks = [c for c in list(callbacks) if c is not None]
        for c in callbacks:
            assert isinstance(
                c, Callback), "callback should be subclass of Callback"
        self._callbacks = callbacks

    def on_step_begin(self, status):
        for c in self._callbacks:
            c.on_step_begin(status)

    def on_step_end(self, status):
        for c in self._callbacks:
            c.on_step_end(status)

    def on_epoch_begin(self, status):
        for c in self._callbacks:
            c.on_epoch_begin(status)

    def on_epoch_end(self, status):
        for c in self._callbacks:
            c.on_epoch_end(status)

    def on_train_begin(self, status):
        for c in self._callbacks:
            c.on_train_begin(status)

    def on_train_end(self, status):
        for c in self._callbacks:
            c.on_train_end(status)


class LogPrinter(Callback):
    """Log training progress and evaluation throughput on rank 0.

    Uses the module-level ``dist`` and ``logger`` objects.
    """

    def __init__(self, model):
        super(LogPrinter, self).__init__(model)

    def on_step_end(self, status):
        """Log a progress line every ``cfg.log_iter`` training steps, or an
        iteration counter every 100 evaluation steps."""
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return
        mode = status['mode']
        if mode == 'train':
            epoch_id = status['epoch_id']
            step_id = status['step_id']
            steps_per_epoch = status['steps_per_epoch']
            training_status = status['training_status']
            batch_time = status['batch_time']
            data_time = status['data_time']

            epoches = self.model.cfg.epoch
            batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
            ))]['batch_size']

            logs = training_status.log()
            # Pad the step counter to the width of ``steps_per_epoch``.
            space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
            if step_id % self.model.cfg.log_iter == 0:
                eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id
                eta_sec = eta_steps * batch_time.global_avg
                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                ips = float(batch_size) / batch_time.avg
                fmt = ' '.join([
                    'Epoch: [{}]',
                    '[{' + space_fmt + '}/{}]',
                    'learning_rate: {lr:.6f}',
                    '{meters}',
                    'eta: {eta}',
                    'batch_cost: {btime}',
                    'data_cost: {dtime}',
                    'ips: {ips:.4f} images/s',
                ])
                fmt = fmt.format(
                    epoch_id,
                    step_id,
                    steps_per_epoch,
                    lr=status['learning_rate'],
                    meters=logs,
                    eta=eta_str,
                    btime=str(batch_time),
                    dtime=str(data_time),
                    ips=ips)
                logger.info(fmt)
        if mode == 'eval':
            step_id = status['step_id']
            if step_id % 100 == 0:
                logger.info("Eval iter: {}".format(step_id))

    def on_epoch_end(self, status):
        """Log sample count and average FPS after an evaluation epoch."""
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            mode = status['mode']
            if mode == 'eval':
                sample_num = status['sample_num']
                cost_time = status['cost_time']
                logger.info('Total sample number: {}, average FPS: {}'.format(
                    sample_num, sample_num / cost_time))


class Checkpointer(Callback):
    """Save periodic, final and best-metric checkpoints on rank 0.

    Checkpoints go to ``<cfg.save_dir>/<cfg.filename>`` through the
    module-level ``save_model`` helper. When the wrapped network has a
    ``student_model`` attribute, that sub-model's weights are saved.
    """

    def __init__(self, model):
        super(Checkpointer, self).__init__(model)
        self.best_ap = -1000.
        self.save_dir = os.path.join(self.model.cfg.save_dir,
                                     self.model.cfg.filename)
        if hasattr(self.model.model, 'student_model'):
            self.weight = self.model.model.student_model
        else:
            self.weight = self.model.model

    def on_epoch_end(self, status):
        """Decide whether a checkpoint is due and write it.

        In ``train`` mode a snapshot is taken every ``cfg.snapshot_epoch``
        epochs and at the last epoch (named ``model_final``). In ``eval``
        mode with ``status['save_best_model']`` set, ``best_model`` is
        written whenever the first value of the selected metric is not
        worse than the best seen so far.
        """
        mode = status['mode']
        epoch_id = status['epoch_id']
        weight = None
        save_name = None
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return

        if mode == 'train':
            end_epoch = self.model.cfg.epoch
            is_last = epoch_id == end_epoch - 1
            if (epoch_id + 1) % self.model.cfg.snapshot_epoch == 0 or is_last:
                save_name = "model_final" if is_last else str(epoch_id)
                weight = self.weight.state_dict()
        elif mode == 'eval':
            if 'save_best_model' in status and status['save_best_model']:
                for metric in self.model._metrics:
                    map_res = metric.get_results()
                    eval_func = "ap"
                    if 'pose3d' in map_res:
                        key = 'pose3d'
                        eval_func = "mpjpe"
                    elif 'bbox' in map_res:
                        key = 'bbox'
                    elif 'keypoint' in map_res:
                        key = 'keypoint'
                    else:
                        key = 'mask'
                    if key not in map_res:
                        logger.warning("Evaluation results empty, this may be due to " \
                                       "training iterations being too few or not " \
                                       "loading the correct weights.")
                        return
                    if map_res[key][0] >= self.best_ap:
                        self.best_ap = map_res[key][0]
                        save_name = 'best_model'
                        weight = self.weight.state_dict()
                    logger.info("Best test {} {} is {:0.3f}.".format(
                        key, eval_func, abs(self.best_ap)))

        if weight:
            if self.model.use_ema:
                exchange_save_model = status.get('exchange_save_model',
                                                 False)
                if not exchange_save_model:
                    # save model and ema_model
                    save_model(
                        status['weight'],
                        self.model.optimizer,
                        self.save_dir,
                        save_name,
                        epoch_id + 1,
                        ema_model=weight)
                else:
                    # save model(student model) and ema_model(teacher model)
                    # in DenseTeacher SSOD, the teacher model will be higher,
                    # so exchange when saving pdparams
                    student_model = status['weight']  # model
                    teacher_model = weight  # ema_model
                    save_model(
                        teacher_model,
                        self.model.optimizer,
                        self.save_dir,
                        save_name,
                        epoch_id + 1,
                        ema_model=student_model)
                    del teacher_model
                    del student_model
            else:
                save_model(weight, self.model.optimizer, self.save_dir,
                           save_name, epoch_id + 1)


class WiferFaceEval(Callback):
    """Feed the network to every metric at epoch begin, then exit.

    NOTE: ``on_epoch_begin`` terminates the process with ``sys.exit()``.
    """

    def __init__(self, model):
        super(WiferFaceEval, self).__init__(model)

    def on_epoch_begin(self, status):
        assert self.model.mode == 'eval', \
            "WiferFaceEval can only be set during evaluation"
        for metric in self.model._metrics:
            metric.update(self.model.model)
        sys.exit()


class VisualDLWriter(Callback):
    """
    Use VisualDL to log data or image
    """

    def __init__(self, model):
        super(VisualDLWriter, self).__init__(model)

        assert six.PY3, "VisualDL requires Python >= 3.5"
        try:
            from visualdl import LogWriter
        except Exception:
            logger.error('visualdl not found, please install visualdl. '
                         'for example: `pip install visualdl`.')
            raise
        self.vdl_writer = LogWriter(
            model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar'))
        self.vdl_loss_step = 0
        self.vdl_mAP_step = 0
        self.vdl_image_step = 0
        self.vdl_image_frame = 0

    def on_step_end(self, status):
        """Log training scalars, or original/result images in test mode."""
        mode = status['mode']
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return
        if mode == 'train':
            training_status = status['training_status']
            for loss_name, loss_value in training_status.get().items():
                self.vdl_writer.add_scalar(loss_name, loss_value,
                                           self.vdl_loss_step)
            self.vdl_loss_step += 1
        elif mode == 'test':
            ori_image = status['original_image']
            result_image = status['result_image']
            self.vdl_writer.add_image(
                "original/frame_{}".format(self.vdl_image_frame), ori_image,
                self.vdl_image_step)
            self.vdl_writer.add_image(
                "result/frame_{}".format(self.vdl_image_frame),
                result_image, self.vdl_image_step)
            self.vdl_image_step += 1
            # each frame can display ten pictures at most.
            if self.vdl_image_step % 10 == 0:
                self.vdl_image_step = 0
                self.vdl_image_frame += 1

    def on_epoch_end(self, status):
        """Log the first value of every metric result after evaluation."""
        mode = status['mode']
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            if mode == 'eval':
                for metric in self.model._metrics:
                    for key, map_value in metric.get_results().items():
                        self.vdl_writer.add_scalar("{}-mAP".format(key),
                                                   map_value[0],
                                                   self.vdl_mAP_step)
                self.vdl_mAP_step += 1


class WandbCallback(Callback):
    """Log metrics and checkpoint artifacts to Weights & Biases on rank 0.

    ``wandb.init`` arguments come from ``cfg['wandb']`` (if present) plus
    every top-level ``wandb_<name>`` config entry, which is passed as
    ``<name>``.
    """

    _CFG_PREFIX = "wandb_"

    def __init__(self, model):
        super(WandbCallback, self).__init__(model)

        try:
            import wandb
            self.wandb = wandb
        except Exception:
            logger.error('wandb not found, please install wandb. '
                         'Use: `pip install wandb`.')
            raise

        self.save_dir = os.path.join(self.model.cfg.save_dir,
                                     self.model.cfg.filename)
        self.wandb_params = self._collect_wandb_params(
            model.cfg.get('wandb', None), model.cfg.items())

        self._run = None
        if dist.get_world_size() < 2 or dist.get_rank() == 0:
            _ = self.run
            self.run.config.update(self.model.cfg)
            self.run.define_metric("epoch")
            self.run.define_metric("eval/*", step_metric="epoch")

        self.best_ap = -1000.
        self.fps = []

    @staticmethod
    def _collect_wandb_params(params, cfg_items):
        """Merge ``wandb_<name>`` config entries into a ``wandb.init`` dict.

        Args:
            params (dict|None): base parameters (copied, not modified).
            cfg_items (iterable): (key, value) pairs of the config.

        Returns:
            dict: ``params`` plus ``{<name>: value}`` for each prefixed key.
        """
        merged = dict(params) if params else {}
        prefix = WandbCallback._CFG_PREFIX
        for k, v in cfg_items:
            if k.startswith(prefix):
                # Slice the prefix off. ``str.lstrip`` strips a *set of
                # characters* and would turn e.g. "wandb_name" into "me".
                merged[k[len(prefix):]] = v
        return merged

    @property
    def run(self):
        """The wandb run, reusing an ongoing one or creating it lazily."""
        if self._run is None:
            if self.wandb.run is not None:
                logger.info(
                    "There is an ongoing wandb run which will be used "
                    "for logging. Please use `wandb.finish()` to end that "
                    "if the behaviour is not intended")
                self._run = self.wandb.run
            else:
                self._run = self.wandb.init(**self.wandb_params)
        return self._run

    def save_model(self,
                   optimizer,
                   save_dir,
                   save_name,
                   last_epoch,
                   ema_model=None,
                   ap=None,
                   fps=None,
                   tags=None):
        """Upload ``<save_dir>/<save_name>.pdparams`` as a model artifact.

        Args:
            optimizer: unused; kept for signature compatibility.
            save_dir (str): directory holding the checkpoint files.
            save_name (str): checkpoint base name (no extension).
            last_epoch (int): stored in the artifact metadata.
            ema_model: when set (callers in this class pass the ``use_ema``
                flag), ``<save_name>.pdema`` is uploaded as a second
                artifact. NOTE(review): assumes the checkpoint writer
                produced that file — confirm against ``save_model`` in
                ``ppdet.utils.checkpoint``.
            ap, fps: optional metadata, stored when truthy.
            tags (list[str]|None): artifact aliases.
        """
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return
        model_path = os.path.join(save_dir, save_name)
        metadata = {"last_epoch": last_epoch}
        if ap:
            metadata["ap"] = ap
        if fps:
            metadata["fps"] = fps

        # The original tested ``ema_model is None``, i.e. it tried to upload
        # EMA weights exactly when there were none, and its call sites pass
        # a bool, so the branch never ran.
        has_ema = ema_model is not None and ema_model is not False
        if has_ema:
            ema_artifact = self.wandb.Artifact(
                name="ema_model-{}".format(self.run.id),
                type="model",
                metadata=metadata)
            ema_artifact.add_file(model_path + ".pdema", name="model_ema")
            self.run.log_artifact(ema_artifact, aliases=tags)

        model_artifact = self.wandb.Artifact(
            name="model-{}".format(self.run.id),
            type="model",
            metadata=metadata)
        model_artifact.add_file(model_path + ".pdparams", name="model")
        self.run.log_artifact(model_artifact, aliases=tags)

    def on_step_end(self, status):
        """Log training scalars, throughput and timing for one step."""
        mode = status['mode']
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return
        if mode != 'train':
            return

        # Build a fresh dict; the stats object's own dict is left untouched.
        metrics = {
            "train/" + k: float(v)
            for k, v in status['training_status'].get().items()
        }

        # calculate ips, data_cost, batch_cost
        batch_time = status['batch_time']
        data_time = status['data_time']
        batch_size = self.model.cfg['{}Reader'.format(mode.capitalize(
        ))]['batch_size']

        ips = float(batch_size) / float(batch_time.avg)
        metrics["train/ips"] = ips
        metrics["train/data_cost"] = float(data_time.avg)
        metrics["train/batch_cost"] = float(batch_time.avg)

        self.fps.append(ips)
        self.run.log(metrics)

    def on_epoch_end(self, status):
        """Upload checkpoints after training epochs and log eval metrics."""
        mode = status['mode']
        epoch_id = status['epoch_id']
        save_name = None
        if not (dist.get_world_size() < 2 or dist.get_rank() == 0):
            return

        if mode == 'train':
            # No step may have been recorded for this epoch.
            fps = sum(self.fps) / len(self.fps) if self.fps else None
            self.fps = []

            end_epoch = self.model.cfg.epoch
            is_last = epoch_id == end_epoch - 1
            if (epoch_id + 1) % self.model.cfg.snapshot_epoch == 0 or is_last:
                save_name = "model_final" if is_last else str(epoch_id)
                tags = ["latest", "epoch_{}".format(epoch_id)]
                self.save_model(
                    self.model.optimizer,
                    self.save_dir,
                    save_name,
                    epoch_id + 1,
                    self.model.use_ema,
                    fps=fps,
                    tags=tags)
        if mode == 'eval':
            sample_num = status['sample_num']
            cost_time = status['cost_time']

            fps = sample_num / cost_time

            merged_dict = {}
            for metric in self.model._metrics:
                for key, map_value in metric.get_results().items():
                    merged_dict["eval/{}-mAP".format(key)] = map_value[0]
            merged_dict["epoch"] = status["epoch_id"]
            merged_dict["eval/fps"] = fps

            self.run.log(merged_dict)

            if 'save_best_model' in status and status['save_best_model']:
                for metric in self.model._metrics:
                    map_res = metric.get_results()
                    if 'pose3d' in map_res:
                        key = 'pose3d'
                    elif 'bbox' in map_res:
                        key = 'bbox'
                    elif 'keypoint' in map_res:
                        key = 'keypoint'
                    else:
                        key = 'mask'
                    if key not in map_res:
                        logger.warning("Evaluation results empty, this may be due to " \
                                       "training iterations being too few or not " \
                                       "loading the correct weights.")
                        return
                    if map_res[key][0] >= self.best_ap:
                        self.best_ap = map_res[key][0]
                        save_name = 'best_model'
                        tags = ["best", "epoch_{}".format(epoch_id)]

                        self.save_model(
                            self.model.optimizer,
                            self.save_dir,
                            save_name,
                            last_epoch=epoch_id + 1,
                            ema_model=self.model.use_ema,
                            ap=abs(self.best_ap),
                            fps=fps,
                            tags=tags)

    def on_train_end(self, status):
        """Finish the wandb run if this process started or adopted one."""
        # Going through ``self.run`` here would create a run on ranks that
        # never logged anything, only to close it again.
        if self._run is not None:
            self._run.finish()


class SniperProposalsGenerator(Callback):
    """Run inference on cropped chips after training and dump proposals.

    The dataset is deep-copied and its roidbs are replaced by the inference
    chips produced by its ``anno_cropper``; results are written as JSON to
    ``cfg.proposals_path``.
    """

    def __init__(self, model):
        super(SniperProposalsGenerator, self).__init__(model)
        ori_dataset = self.model.dataset
        self.dataset = self._create_new_dataset(ori_dataset)
        self.loader = self.model.loader
        self.cfg = self.model.cfg
        self.infer_model = self.model.model

    def _create_new_dataset(self, ori_dataset):
        """Return a copy of ``ori_dataset`` whose roidbs are infer chips."""
        dataset = copy.deepcopy(ori_dataset)
        # init anno_cropper
        dataset.init_anno_cropper()
        # generate infer roidbs
        ori_roidbs = dataset.get_ori_roidbs()
        roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs)
        # set new roidbs
        dataset.set_roidbs(roidbs)

        return dataset

    def _eval_with_loader(self, loader):
        """Run the network over ``loader`` and return per-batch outputs.

        ``im_shape``, ``scale_factor`` and ``im_id`` are copied from the
        input batch into each output dict, and every value exposing
        ``numpy()`` is converted with it.
        """
        results = []
        with paddle.no_grad():
            self.infer_model.eval()
            for step_id, data in enumerate(loader):
                outs = self.infer_model(data)
                for key in ['im_shape', 'scale_factor', 'im_id']:
                    outs[key] = data[key]
                for key, value in outs.items():
                    if hasattr(value, 'numpy'):
                        outs[key] = value.numpy()

                results.append(outs)

        return results

    @staticmethod
    def _collect_bbox_results(batch_res, bbox_num, num_images):
        """Gather the 'bbox' results of the first ``num_images`` images.

        ``batch_res['bbox']`` is a flat list in which image ``i`` owns the
        next ``bbox_num[i]`` entries.

        Returns:
            list: the collected entries; empty when 'bbox' is missing.
        """
        collected = []
        if 'bbox' not in batch_res:
            return collected
        start = 0
        for i in range(num_images):
            end = start + bbox_num[i]
            bbox_res = batch_res['bbox'][start:end]
            if bbox_res:
                collected += bbox_res
            # Advance the window. The original never did, so every image
            # after the first re-read the entries starting at index 0.
            start = end
        return collected

    def on_train_end(self, status):
        """Infer on the chip dataset and save the proposals as JSON."""
        self.loader.dataset = self.dataset
        results = self._eval_with_loader(self.loader)
        results = self.dataset.anno_cropper.aggregate_chips_detections(results)
        # sniper
        proposals = []
        clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()}
        for outs in results:
            batch_res = get_infer_results(outs, clsid2catid)
            proposals += self._collect_bbox_results(
                batch_res, outs['bbox_num'], len(outs['im_id']))
        logger.info("save proposals in {}".format(self.cfg.proposals_path))
        with open(self.cfg.proposals_path, 'w') as f:
            json.dump(proposals, f)
def init_fleet_env(find_unused_parameters=False):
    """Initialise fleet in collective mode.

    Args:
        find_unused_parameters (bool): stored on the ``DistributedStrategy``
            handed to ``fleet.init``.
    """
    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.find_unused_parameters = find_unused_parameters
    fleet.init(is_collective=True, strategy=dist_strategy)


def init_parallel_env():
    """Seed the Python/NumPy RNGs per trainer, then start paddle's
    parallel environment.

    Seeding only happens when both ``PADDLE_TRAINER_ID`` and
    ``PADDLE_TRAINERS_NUM`` are set; the seed is ``99 + trainer id`` so
    each trainer gets a different one.
    """
    environ = os.environ
    launched_distributed = ('PADDLE_TRAINER_ID' in environ and
                            'PADDLE_TRAINERS_NUM' in environ)
    if launched_distributed:
        local_seed = 99 + int(environ['PADDLE_TRAINER_ID'])
        random.seed(local_seed)
        np.random.seed(local_seed)

    paddle.distributed.init_parallel_env()


def set_random_seed(seed):
    """Seed paddle, Python's ``random`` and NumPy with the same ``seed``."""
    paddle.seed(seed)
    random.seed(seed)
    np.random.seed(seed)
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import yaml +from collections import OrderedDict + +import paddle +from ppdet.data.source.category import get_categories + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +# Global dictionary +TRT_MIN_SUBGRAPH = { + 'YOLO': 3, + 'PPYOLOE': 3, + 'SSD': 60, + 'RCNN': 40, + 'RetinaNet': 40, + 'S2ANet': 80, + 'EfficientDet': 40, + 'Face': 3, + 'TTFNet': 60, + 'FCOS': 16, + 'SOLOv2': 60, + 'HigherHRNet': 3, + 'HRNet': 3, + 'DeepSORT': 3, + 'ByteTrack': 10, + 'CenterTrack': 5, + 'JDE': 10, + 'FairMOT': 5, + 'GFL': 16, + 'PicoDet': 3, + 'CenterNet': 5, + 'TOOD': 5, + 'YOLOX': 8, + 'YOLOF': 40, + 'METRO_Body': 3, + 'DETR': 3, +} + +KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet'] +MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] + +TO_STATIC_SPEC = { + 'yolov3_darknet53_270e_coco': [{ + 'im_id': paddle.static.InputSpec( + name='im_id', shape=[-1, 1], dtype='float32'), + 'is_crowd': paddle.static.InputSpec( + name='is_crowd', shape=[-1, 50], dtype='float32'), + 'gt_bbox': paddle.static.InputSpec( + name='gt_bbox', shape=[-1, 50, 4], dtype='float32'), + 'curr_iter': paddle.static.InputSpec( + name='curr_iter', shape=[-1], dtype='float32'), + 'image': paddle.static.InputSpec( + name='image', shape=[-1, 3, -1, -1], dtype='float32'), + 'im_shape': paddle.static.InputSpec( + name='im_shape', shape=[-1, 2], dtype='float32'), + 'scale_factor': paddle.static.InputSpec( + name='scale_factor', shape=[-1, 2], dtype='float32'), + 'target0': paddle.static.InputSpec( + name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'), + 'target1': paddle.static.InputSpec( + name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'), + 'target2': paddle.static.InputSpec( + name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'), + }], + 'tinypose_128x96': [{ + 'center': paddle.static.InputSpec( + 
name='center', shape=[-1, 2], dtype='float32'), + 'scale': paddle.static.InputSpec( + name='scale', shape=[-1, 2], dtype='float32'), + 'im_id': paddle.static.InputSpec( + name='im_id', shape=[-1, 1], dtype='float32'), + 'image': paddle.static.InputSpec( + name='image', shape=[-1, 3, 128, 96], dtype='float32'), + 'score': paddle.static.InputSpec( + name='score', shape=[-1], dtype='float32'), + 'rotate': paddle.static.InputSpec( + name='rotate', shape=[-1], dtype='float32'), + 'target': paddle.static.InputSpec( + name='target', shape=[-1, 17, 32, 24], dtype='float32'), + 'target_weight': paddle.static.InputSpec( + name='target_weight', shape=[-1, 17, 1], dtype='float32'), + }], + 'fcos_r50_fpn_1x_coco': [{ + 'im_id': paddle.static.InputSpec( + name='im_id', shape=[-1, 1], dtype='float32'), + 'curr_iter': paddle.static.InputSpec( + name='curr_iter', shape=[-1], dtype='float32'), + 'image': paddle.static.InputSpec( + name='image', shape=[-1, 3, -1, -1], dtype='float32'), + 'im_shape': paddle.static.InputSpec( + name='im_shape', shape=[-1, 2], dtype='float32'), + 'scale_factor': paddle.static.InputSpec( + name='scale_factor', shape=[-1, 2], dtype='float32'), + 'reg_target0': paddle.static.InputSpec( + name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'), + 'labels0': paddle.static.InputSpec( + name='labels0', shape=[-1, 160, 160, 1], dtype='int32'), + 'centerness0': paddle.static.InputSpec( + name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'), + 'reg_target1': paddle.static.InputSpec( + name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'), + 'labels1': paddle.static.InputSpec( + name='labels1', shape=[-1, 80, 80, 1], dtype='int32'), + 'centerness1': paddle.static.InputSpec( + name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'), + 'reg_target2': paddle.static.InputSpec( + name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'), + 'labels2': paddle.static.InputSpec( + name='labels2', shape=[-1, 40, 40, 1], dtype='int32'), + 
'centerness2': paddle.static.InputSpec( + name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'), + 'reg_target3': paddle.static.InputSpec( + name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'), + 'labels3': paddle.static.InputSpec( + name='labels3', shape=[-1, 20, 20, 1], dtype='int32'), + 'centerness3': paddle.static.InputSpec( + name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'), + 'reg_target4': paddle.static.InputSpec( + name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'), + 'labels4': paddle.static.InputSpec( + name='labels4', shape=[-1, 10, 10, 1], dtype='int32'), + 'centerness4': paddle.static.InputSpec( + name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'), + }], + 'picodet_s_320_coco_lcnet': [{ + 'im_id': paddle.static.InputSpec( + name='im_id', shape=[-1, 1], dtype='float32'), + 'is_crowd': paddle.static.InputSpec( + name='is_crowd', shape=[-1, -1, 1], dtype='float32'), + 'gt_class': paddle.static.InputSpec( + name='gt_class', shape=[-1, -1, 1], dtype='int32'), + 'gt_bbox': paddle.static.InputSpec( + name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), + 'curr_iter': paddle.static.InputSpec( + name='curr_iter', shape=[-1], dtype='float32'), + 'image': paddle.static.InputSpec( + name='image', shape=[-1, 3, -1, -1], dtype='float32'), + 'im_shape': paddle.static.InputSpec( + name='im_shape', shape=[-1, 2], dtype='float32'), + 'scale_factor': paddle.static.InputSpec( + name='scale_factor', shape=[-1, 2], dtype='float32'), + 'pad_gt_mask': paddle.static.InputSpec( + name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), + }], + 'ppyoloe_crn_s_300e_coco': [{ + 'im_id': paddle.static.InputSpec( + name='im_id', shape=[-1, 1], dtype='float32'), + 'is_crowd': paddle.static.InputSpec( + name='is_crowd', shape=[-1, -1, 1], dtype='float32'), + 'gt_class': paddle.static.InputSpec( + name='gt_class', shape=[-1, -1, 1], dtype='int32'), + 'gt_bbox': paddle.static.InputSpec( + name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), + 
'curr_iter': paddle.static.InputSpec( + name='curr_iter', shape=[-1], dtype='float32'), + 'image': paddle.static.InputSpec( + name='image', shape=[-1, 3, -1, -1], dtype='float32'), + 'im_shape': paddle.static.InputSpec( + name='im_shape', shape=[-1, 2], dtype='float32'), + 'scale_factor': paddle.static.InputSpec( + name='scale_factor', shape=[-1, 2], dtype='float32'), + 'pad_gt_mask': paddle.static.InputSpec( + name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), + }], +} + + +def apply_to_static(config, model): + filename = config.get('filename', None) + spec = TO_STATIC_SPEC.get(filename, None) + model = paddle.jit.to_static(model, input_spec=spec) + logger.info("Successfully to apply @to_static with specs: {}".format(spec)) + return model + + +def _prune_input_spec(input_spec, program, targets): + # try to prune static program to figure out pruned input spec + # so we perform following operations in static mode + device = paddle.get_device() + paddle.enable_static() + paddle.set_device(device) + pruned_input_spec = [{}] + program = program.clone() + program = program._prune(targets=targets) + global_block = program.global_block() + for name, spec in input_spec[0].items(): + try: + v = global_block.var(name) + pruned_input_spec[0][name] = spec + except Exception: + pass + paddle.disable_static(place=device) + return pruned_input_spec + + +def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): + preprocess_list = [] + + anno_file = dataset_cfg.get_anno() + + clsid2catid, catid2name = get_categories(metric, anno_file, arch) + + label_list = [str(cat) for cat in catid2name.values()] + + fuse_normalize = reader_cfg.get('fuse_normalize', False) + sample_transforms = reader_cfg['sample_transforms'] + for st in sample_transforms[1:]: + for key, value in st.items(): + p = {'type': key} + if key == 'Resize': + if int(image_shape[1]) != -1: + value['target_size'] = image_shape[1:] + value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR + if 
fuse_normalize and key == 'NormalizeImage': + continue + p.update(value) + preprocess_list.append(p) + batch_transforms = reader_cfg.get('batch_transforms', None) + if batch_transforms: + for bt in batch_transforms: + for key, value in bt.items(): + # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride) + if key == 'PadBatch': + preprocess_list.append({ + 'type': 'PadStride', + 'stride': value['pad_to_stride'] + }) + break + + return preprocess_list, label_list + + +def _parse_tracker(tracker_cfg): + tracker_params = {} + for k, v in tracker_cfg.items(): + tracker_params.update({k: v}) + return tracker_params + + +def _dump_infer_config(config, path, image_shape, model): + arch_state = False + from ppdet.core.config.yaml_helpers import setup_orderdict + setup_orderdict() + use_dynamic_shape = True if image_shape[2] == -1 else False + infer_cfg = OrderedDict({ + 'mode': 'paddle', + 'draw_threshold': 0.5, + 'metric': config['metric'], + 'use_dynamic_shape': use_dynamic_shape + }) + export_onnx = config.get('export_onnx', False) + export_eb = config.get('export_eb', False) + + infer_arch = config['architecture'] + if 'RCNN' in infer_arch and export_onnx: + logger.warning( + "Exporting RCNN model to ONNX only support batch_size = 1") + infer_cfg['export_onnx'] = True + infer_cfg['export_eb'] = export_eb + + if infer_arch in MOT_ARCH: + if infer_arch == 'DeepSORT': + tracker_cfg = config['DeepSORTTracker'] + elif infer_arch == 'CenterTrack': + tracker_cfg = config['CenterTracker'] + else: + tracker_cfg = config['JDETracker'] + infer_cfg['tracker'] = _parse_tracker(tracker_cfg) + + for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items(): + if arch in infer_arch: + infer_cfg['arch'] = arch + infer_cfg['min_subgraph_size'] = min_subgraph_size + arch_state = True + break + + if infer_arch == 'PPYOLOEWithAuxHead': + infer_arch = 'PPYOLOE' + + if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']: + infer_cfg['arch'] = infer_arch + infer_cfg['min_subgraph_size'] = 
TRT_MIN_SUBGRAPH[infer_arch] + arch_state = True + + if not arch_state: + logger.error( + 'Architecture: {} is not supported for exporting model now.\n'. + format(infer_arch) + + 'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py') + os._exit(0) + if 'mask_head' in config[config['architecture']] and config[config[ + 'architecture']]['mask_head']: + infer_cfg['mask'] = True + label_arch = 'detection_arch' + if infer_arch in KEYPOINT_ARCH: + label_arch = 'keypoint_arch' + + if infer_arch in MOT_ARCH: + if config['metric'] in ['COCO', 'VOC']: + # MOT model run as Detector + reader_cfg = config['TestReader'] + dataset_cfg = config['TestDataset'] + else: + # 'metric' in ['MOT', 'MCMOT', 'KITTI'] + label_arch = 'mot_arch' + reader_cfg = config['TestMOTReader'] + dataset_cfg = config['TestMOTDataset'] + else: + reader_cfg = config['TestReader'] + dataset_cfg = config['TestDataset'] + + infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader( + reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) + + if infer_arch == 'PicoDet': + if hasattr(config, 'export') and config['export'].get( + 'post_process', + False) and not config['export'].get('benchmark', False): + infer_cfg['arch'] = 'GFL' + head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead' + infer_cfg['NMS'] = config[head_name]['nms'] + # In order to speed up the prediction, the threshold of nms + # is adjusted here, which can be changed in infer_cfg.yml + config[head_name]['nms']["score_threshold"] = 0.3 + config[head_name]['nms']["nms_threshold"] = 0.5 + infer_cfg['fpn_stride'] = config[head_name]['fpn_stride'] + + yaml.dump(infer_cfg, open(path, 'w')) + logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/rtdetr_paddle/ppdet/engine/trainer.py b/rtdetr_paddle/ppdet/engine/trainer.py new file mode 100644 index 0000000..6c3f229 --- /dev/null +++ b/rtdetr_paddle/ppdet/engine/trainer.py @@ -0,0 +1,966 @@ +# Copyright (c) 2020 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import copy +import time +from tqdm import tqdm + +import numpy as np +import typing +from PIL import Image, ImageOps, ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + +import paddle +import paddle.nn as nn +import paddle.distributed as dist +from paddle.distributed import fleet +from paddle.static import InputSpec +from ppdet.optimizer import ModelEMA + +from ppdet.core.workspace import create +from ppdet.utils.checkpoint import load_weight, load_pretrain_weight +from ppdet.utils.visualizer import visualize_results, save_result +from ppdet.metrics import Metric, COCOMetric, VOCMetric, get_infer_results +from ppdet.data.source.category import get_categories +import ppdet.utils.stats as stats +from ppdet.utils.fuse_utils import fuse_conv_bn +from ppdet.utils import profiler +from ppdet.modeling.post_process import multiclass_nms + +from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, VisualDLWriter, WandbCallback +from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static + +from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients + +from ppdet.utils.logger import setup_logger +logger = setup_logger('ppdet.engine') + +__all__ = ['Trainer'] + +class Trainer(object): + def 
__init__(self, cfg, mode='train'): + self.cfg = cfg.copy() + assert mode.lower() in ['train', 'eval', 'test'], \ + "mode should be 'train', 'eval' or 'test'" + self.mode = mode.lower() + self.optimizer = None + self.is_loaded_weights = False + self.use_amp = self.cfg.get('amp', False) + self.amp_level = self.cfg.get('amp_level', 'O1') + self.custom_white_list = self.cfg.get('custom_white_list', None) + self.custom_black_list = self.cfg.get('custom_black_list', None) + + # build data loader + capital_mode = self.mode.capitalize() + self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( + '{}Dataset'.format(capital_mode))() + + if self.mode == 'train': + self.loader = create('{}Reader'.format(capital_mode))( + self.dataset, cfg.worker_num) + + # build model + if 'model' not in self.cfg: + self.model = create(cfg.architecture) + else: + self.model = self.cfg.model + self.is_loaded_weights = True + + # EvalDataset build with BatchSampler to evaluate in single device + # TODO: multi-device evaluate + if self.mode == 'eval': + self._eval_batch_sampler = paddle.io.BatchSampler( + self.dataset, batch_size=self.cfg.EvalReader['batch_size']) + reader_name = '{}Reader'.format(self.mode.capitalize()) + # If metric is VOC, need to be set collate_batch=False. 
+ if cfg.metric == 'VOC': + self.cfg[reader_name]['collate_batch'] = False + self.loader = create(reader_name)(self.dataset, cfg.worker_num, + self._eval_batch_sampler) + + # TestDataset build after user set images, skip loader creation here + + # get Params + print_params = self.cfg.get('print_params', False) + if print_params: + params = sum([ + p.numel() for n, p in self.model.named_parameters() + if all([x not in n for x in ['_mean', '_variance', 'aux_']]) + ]) # exclude BatchNorm running status + logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[ + 0])) + + # build optimizer in train mode + if self.mode == 'train': + steps_per_epoch = len(self.loader) + if steps_per_epoch < 1: + logger.warning( + "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." + ) + self.lr = create('LearningRate')(steps_per_epoch) + self.optimizer = create('OptimizerBuilder')(self.lr, self.model) + + if self.use_amp and self.amp_level == 'O2': + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=self.amp_level) + self.use_ema = ('use_ema' in cfg and cfg['use_ema']) + if self.use_ema: + ema_decay = self.cfg.get('ema_decay', 0.9998) + ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') + cycle_epoch = self.cfg.get('cycle_epoch', -1) + ema_black_list = self.cfg.get('ema_black_list', None) + ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False) + self.ema = ModelEMA( + self.model, + decay=ema_decay, + ema_decay_type=ema_decay_type, + cycle_epoch=cycle_epoch, + ema_black_list=ema_black_list, + ema_filter_no_grad=ema_filter_no_grad) + + self._nranks = dist.get_world_size() + self._local_rank = dist.get_rank() + + self.status = {} + + self.start_epoch = 0 + self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch + + # initial default callbacks + self._init_callbacks() + + # initial default metrics + self._init_metrics() + self._reset_metrics() + + def 
_init_callbacks(self): + if self.mode == 'train': + self._callbacks = [LogPrinter(self), Checkpointer(self)] + if self.cfg.get('use_vdl', False): + self._callbacks.append(VisualDLWriter(self)) + if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: + self._callbacks.append(WandbCallback(self)) + self._compose_callback = ComposeCallback(self._callbacks) + elif self.mode == 'eval': + self._callbacks = [LogPrinter(self)] + self._compose_callback = ComposeCallback(self._callbacks) + elif self.mode == 'test' and self.cfg.get('use_vdl', False): + self._callbacks = [VisualDLWriter(self)] + self._compose_callback = ComposeCallback(self._callbacks) + else: + self._callbacks = [] + self._compose_callback = None + + def _init_metrics(self, validate=False): + if self.mode == 'test' or (self.mode == 'train' and not validate): + self._metrics = [] + return + classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False + if self.cfg.metric == 'COCO': + # TODO: bias should be unified + bias = 1 if self.cfg.get('bias', False) else 0 + output_eval = self.cfg['output_eval'] \ + if 'output_eval' in self.cfg else None + save_prediction_only = self.cfg.get('save_prediction_only', False) + + # pass clsid2catid info to metric instance to avoid multiple loading + # annotation file + clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ + if self.mode == 'eval' else None + + # when do validation in train, annotation file should be get from + # EvalReader instead of self.dataset(which is TrainReader) + if self.mode == 'train' and validate: + eval_dataset = self.cfg['EvalDataset'] + eval_dataset.check_or_download_dataset() + anno_file = eval_dataset.get_anno() + dataset = eval_dataset + else: + dataset = self.dataset + anno_file = dataset.get_anno() + + IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' + self._metrics = [ + COCOMetric( + anno_file=anno_file, + clsid2catid=clsid2catid, + classwise=classwise, + output_eval=output_eval, + 
bias=bias, + IouType=IouType, + save_prediction_only=save_prediction_only) + ] + + elif self.cfg.metric == 'VOC': + output_eval = self.cfg['output_eval'] \ + if 'output_eval' in self.cfg else None + save_prediction_only = self.cfg.get('save_prediction_only', False) + self._metrics = [ + VOCMetric( + label_list=self.dataset.get_label_list(), + class_num=self.cfg.num_classes, + map_type=self.cfg.map_type, + classwise=classwise, + output_eval=output_eval, + save_prediction_only=save_prediction_only) + ] + else: + logger.warning("Metric not support for metric type {}".format( + self.cfg.metric)) + self._metrics = [] + + def _reset_metrics(self): + for metric in self._metrics: + metric.reset() + + def register_callbacks(self, callbacks): + callbacks = [c for c in list(callbacks) if c is not None] + for c in callbacks: + assert isinstance(c, Callback), \ + "metrics shoule be instances of subclass of Metric" + self._callbacks.extend(callbacks) + self._compose_callback = ComposeCallback(self._callbacks) + + def register_metrics(self, metrics): + metrics = [m for m in list(metrics) if m is not None] + for m in metrics: + assert isinstance(m, Metric), \ + "metrics shoule be instances of subclass of Metric" + self._metrics.extend(metrics) + + def load_weights(self, weights, ARSL_eval=False): + if self.is_loaded_weights: + return + self.start_epoch = 0 + load_pretrain_weight(self.model, weights, ARSL_eval) + logger.debug("Load weights {} to start training".format(weights)) + + def resume_weights(self, weights): + self.start_epoch = load_weight(self.model, weights, self.optimizer, + self.ema if self.use_ema else None) + logger.debug("Resume weights of epoch {}".format(self.start_epoch)) + + def train(self, validate=False): + assert self.mode == 'train', "Model not in 'train' mode" + Init_mark = False + if validate: + self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( + "EvalDataset")() + + model = self.model + if self.cfg.get('to_static', False): + model = 
apply_to_static(self.cfg, model) + sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and + (self.cfg.use_gpu or self.cfg.use_mlu) and self._nranks > 1) + if sync_bn: + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + + # enabel auto mixed precision mode + if self.use_amp: + scaler = paddle.amp.GradScaler( + enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, + init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) + # get distributed model + if self.cfg.get('fleet', False): + model = fleet.distributed_model(model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + model = paddle.DataParallel( + model, find_unused_parameters=find_unused_parameters) + + self.status.update({ + 'epoch_id': self.start_epoch, + 'step_id': 0, + 'steps_per_epoch': len(self.loader) + }) + + self.status['batch_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['data_time'] = stats.SmoothedValue( + self.cfg.log_iter, fmt='{avg:.4f}') + self.status['training_status'] = stats.TrainingStats(self.cfg.log_iter) + + profiler_options = self.cfg.get('profiler_options', None) + + self._compose_callback.on_train_begin(self.status) + + use_fused_allreduce_gradients = self.cfg[ + 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False + + for epoch_id in range(self.start_epoch, self.cfg.epoch): + self.status['mode'] = 'train' + self.status['epoch_id'] = epoch_id + self._compose_callback.on_epoch_begin(self.status) + self.loader.dataset.set_epoch(epoch_id) + model.train() + iter_tic = time.time() + for step_id, data in enumerate(self.loader): + self.status['data_time'].update(time.time() - iter_tic) + self.status['step_id'] = step_id + profiler.add_profiler_step(profiler_options) + self._compose_callback.on_step_begin(self.status) + 
data['epoch_id'] = epoch_id + if self.cfg.get('to_static', + False) and 'image_file' in data.keys(): + data.pop('image_file') + + if self.use_amp: + if isinstance( + model, paddle. + DataParallel) and use_fused_allreduce_gradients: + with model.no_sync(): + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or + self.cfg.use_npu or self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + fused_allreduce_gradients( + list(model.parameters()), None) + else: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_npu or + self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + # in dygraph mode, optimizer.minimize is equal to optimizer.step + scaler.minimize(self.optimizer, scaled_loss) + else: + if isinstance( + model, paddle. 
+ DataParallel) and use_fused_allreduce_gradients: + with model.no_sync(): + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + loss.backward() + fused_allreduce_gradients( + list(model.parameters()), None) + else: + # model forward + outputs = model(data) + loss = outputs['loss'] + # model backward + loss.backward() + self.optimizer.step() + curr_lr = self.optimizer.get_lr() + self.lr.step() + self.optimizer.clear_grad() + self.status['learning_rate'] = curr_lr + + if self._nranks < 2 or self._local_rank == 0: + self.status['training_status'].update(outputs) + + self.status['batch_time'].update(time.time() - iter_tic) + self._compose_callback.on_step_end(self.status) + if self.use_ema: + self.ema.update() + iter_tic = time.time() + + is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \ + and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) + if is_snapshot and self.use_ema: + # apply ema weight on model + weight = copy.deepcopy(self.model.state_dict()) + self.model.set_dict(self.ema.apply()) + self.status['weight'] = weight + + self._compose_callback.on_epoch_end(self.status) + + if validate and is_snapshot: + if not hasattr(self, '_eval_loader'): + # build evaluation dataset and loader + self._eval_dataset = self.cfg.EvalDataset + self._eval_batch_sampler = \ + paddle.io.BatchSampler( + self._eval_dataset, + batch_size=self.cfg.EvalReader['batch_size']) + # If metric is VOC, need to be set collate_batch=False. 
+ if self.cfg.metric == 'VOC': + self.cfg['EvalReader']['collate_batch'] = False + else: + self._eval_loader = create('EvalReader')( + self._eval_dataset, + self.cfg.worker_num, + batch_sampler=self._eval_batch_sampler) + # if validation in training is enabled, metrics should be re-init + # Init_mark makes sure this code will only execute once + if validate and Init_mark == False: + Init_mark = True + self._init_metrics(validate=validate) + self._reset_metrics() + + with paddle.no_grad(): + self.status['save_best_model'] = True + self._eval_with_loader(self._eval_loader) + + if is_snapshot and self.use_ema: + # reset original weight + self.model.set_dict(weight) + self.status.pop('weight') + + self._compose_callback.on_train_end(self.status) + + def _eval_with_loader(self, loader): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + + self.model.eval() + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + # forward + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_npu or + self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = self.model(data) + else: + outs = self.model(data) + + # update metrics + for metric in self._metrics: + metric.update(data, outs) + + # multi-scale inputs: all inputs have same im_id + if isinstance(data, typing.Sequence): + sample_num += data[0]['im_id'].numpy().shape[0] + else: + sample_num += data['im_id'].numpy().shape[0] + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple 
times + self._reset_metrics() + + def evaluate(self): + # get distributed model + if self.cfg.get('fleet', False): + self.model = fleet.distributed_model(self.model) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + elif self._nranks > 1: + find_unused_parameters = self.cfg[ + 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False + self.model = paddle.DataParallel( + self.model, find_unused_parameters=find_unused_parameters) + with paddle.no_grad(): + self._eval_with_loader(self.loader) + + def _eval_with_loader_slice(self, + loader, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou'): + sample_num = 0 + tic = time.time() + self._compose_callback.on_epoch_begin(self.status) + self.status['mode'] = 'eval' + self.model.eval() + merged_bboxs = [] + for step_id, data in enumerate(loader): + self.status['step_id'] = step_id + self._compose_callback.on_step_begin(self.status) + # forward + if self.use_amp: + with paddle.amp.auto_cast( + enable=self.cfg.use_gpu or self.cfg.use_npu or + self.cfg.use_mlu, + custom_white_list=self.custom_white_list, + custom_black_list=self.custom_black_list, + level=self.amp_level): + outs = self.model(data) + else: + outs = self.model(data) + + shift_amount = data['st_pix'] + outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount + outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount + merged_bboxs.append(outs['bbox']) + + if data['is_last'] > 0: + # merge matching predictions + merged_results = {'bbox': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), self.cfg.num_classes, + match_threshold, match_metric) + merged_results['bbox'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['bbox'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." 
+ ) + merged_results['im_id'] = np.array([[0]]) + merged_results['bbox_num'] = np.array( + [len(merged_results['bbox'])]) + + merged_bboxs = [] + data['im_id'] = data['ori_im_id'] + # update metrics + for metric in self._metrics: + metric.update(data, merged_results) + + # multi-scale inputs: all inputs have same im_id + if isinstance(data, typing.Sequence): + sample_num += data[0]['im_id'].numpy().shape[0] + else: + sample_num += data['im_id'].numpy().shape[0] + + self._compose_callback.on_step_end(self.status) + + self.status['sample_num'] = sample_num + self.status['cost_time'] = time.time() - tic + + # accumulate metric to log out + for metric in self._metrics: + metric.accumulate() + metric.log() + self._compose_callback.on_epoch_end(self.status) + # reset metric states for metric may performed multiple times + self._reset_metrics() + + def evaluate_slice(self, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou'): + with paddle.no_grad(): + self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, + combine_method, match_threshold, + match_metric) + + def slice_predict(self, + images, + slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method='nms', + match_threshold=0.6, + match_metric='iou', + draw_threshold=0.5, + output_dir='output', + save_results=False, + visualize=True): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + self.dataset.set_slice_images(images, slice_size, overlap_ratio) + loader = create('TestReader')(self.dataset, 0) + imid2path = self.dataset.get_imid2path() + + def setup_metrics_for_loader(): + # mem + metrics = copy.deepcopy(self._metrics) + mode = self.mode + save_prediction_only = self.cfg[ + 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None + output_eval = self.cfg[ + 'output_eval'] if 'output_eval' in self.cfg else None + + # modify + self.mode = '_test' + self.cfg['save_prediction_only'] = True + 
self.cfg['output_eval'] = output_dir + self.cfg['imid2path'] = imid2path + self._init_metrics() + + # restore + self.mode = mode + self.cfg.pop('save_prediction_only') + if save_prediction_only is not None: + self.cfg['save_prediction_only'] = save_prediction_only + + self.cfg.pop('output_eval') + if output_eval is not None: + self.cfg['output_eval'] = output_eval + + self.cfg.pop('imid2path') + + _metrics = copy.deepcopy(self._metrics) + self._metrics = metrics + + return _metrics + + if save_results: + metrics = setup_metrics_for_loader() + else: + metrics = [] + + anno_file = self.dataset.get_anno() + clsid2catid, catid2name = get_categories( + self.cfg.metric, anno_file=anno_file) + + # Run Infer + self.status['mode'] = 'test' + self.model.eval() + + results = [] # all images + merged_bboxs = [] # single image + for step_id, data in enumerate(tqdm(loader)): + self.status['step_id'] = step_id + # forward + with paddle.no_grad(): + outs = self.model(data) + + outs['bbox'] = outs['bbox'].numpy() # only in test mode + shift_amount = data['st_pix'] + outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() + outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() + merged_bboxs.append(outs['bbox']) + + if data['is_last'] > 0: + # merge matching predictions + merged_results = {'bbox': []} + if combine_method == 'nms': + final_boxes = multiclass_nms( + np.concatenate(merged_bboxs), self.cfg.num_classes, + match_threshold, match_metric) + merged_results['bbox'] = np.concatenate(final_boxes) + elif combine_method == 'concat': + merged_results['bbox'] = np.concatenate(merged_bboxs) + else: + raise ValueError( + "Now only support 'nms' or 'concat' to fuse detection results." 
+ ) + merged_results['im_id'] = np.array([[0]]) + merged_results['bbox_num'] = np.array( + [len(merged_results['bbox'])]) + + merged_bboxs = [] + data['im_id'] = data['ori_im_id'] + + for _m in metrics: + _m.update(data, merged_results) + + for key in ['im_shape', 'scale_factor', 'im_id']: + if isinstance(data, typing.Sequence): + merged_results[key] = data[0][key] + else: + merged_results[key] = data[key] + for key, value in merged_results.items(): + if hasattr(value, 'numpy'): + merged_results[key] = value.numpy() + results.append(merged_results) + + for _m in metrics: + _m.accumulate() + _m.reset() + + if visualize: + for outs in results: + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + image = ImageOps.exif_transpose(image) + self.status['original_image'] = np.array(image.copy()) + + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res = batch_res['mask'][start:end] \ + if 'mask' in batch_res else None + segm_res = batch_res['segm'][start:end] \ + if 'segm' in batch_res else None + keypoint_res = batch_res['keypoint'][start:end] \ + if 'keypoint' in batch_res else None + pose3d_res = batch_res['pose3d'][start:end] \ + if 'pose3d' in batch_res else None + image = visualize_results( + image, bbox_res, mask_res, segm_res, keypoint_res, + pose3d_res, int(im_id), catid2name, draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = self._get_save_image_name(output_dir, + image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + + start = end + + def predict(self, + images, + draw_threshold=0.5, + output_dir='output', + 
save_results=False, + visualize=True): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + self.dataset.set_images(images) + loader = create('TestReader')(self.dataset, 0) + + imid2path = self.dataset.get_imid2path() + + def setup_metrics_for_loader(): + # mem + metrics = copy.deepcopy(self._metrics) + mode = self.mode + save_prediction_only = self.cfg[ + 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None + output_eval = self.cfg[ + 'output_eval'] if 'output_eval' in self.cfg else None + + # modify + self.mode = '_test' + self.cfg['save_prediction_only'] = True + self.cfg['output_eval'] = output_dir + self.cfg['imid2path'] = imid2path + self._init_metrics() + + # restore + self.mode = mode + self.cfg.pop('save_prediction_only') + if save_prediction_only is not None: + self.cfg['save_prediction_only'] = save_prediction_only + + self.cfg.pop('output_eval') + if output_eval is not None: + self.cfg['output_eval'] = output_eval + + self.cfg.pop('imid2path') + + _metrics = copy.deepcopy(self._metrics) + self._metrics = metrics + + return _metrics + + if save_results: + metrics = setup_metrics_for_loader() + else: + metrics = [] + + anno_file = self.dataset.get_anno() + clsid2catid, catid2name = get_categories( + self.cfg.metric, anno_file=anno_file) + + # Run Infer + self.status['mode'] = 'test' + self.model.eval() + + results = [] + for step_id, data in enumerate(tqdm(loader)): + self.status['step_id'] = step_id + # forward + with paddle.no_grad(): + if hasattr(self.model, 'modelTeacher'): + outs = self.model.modelTeacher(data) + else: + outs = self.model(data) + + for _m in metrics: + _m.update(data, outs) + + for key in ['im_shape', 'scale_factor', 'im_id']: + if isinstance(data, typing.Sequence): + outs[key] = data[0][key] + else: + outs[key] = data[key] + for key, value in outs.items(): + if hasattr(value, 'numpy'): + outs[key] = value.numpy() + results.append(outs) + + for _m in metrics: + _m.accumulate() + _m.reset() + + if 
visualize: + for outs in results: + batch_res = get_infer_results(outs, clsid2catid) + bbox_num = outs['bbox_num'] + + start = 0 + for i, im_id in enumerate(outs['im_id']): + image_path = imid2path[int(im_id)] + image = Image.open(image_path).convert('RGB') + image = ImageOps.exif_transpose(image) + self.status['original_image'] = np.array(image.copy()) + + end = start + bbox_num[i] + bbox_res = batch_res['bbox'][start:end] \ + if 'bbox' in batch_res else None + mask_res = batch_res['mask'][start:end] \ + if 'mask' in batch_res else None + segm_res = batch_res['segm'][start:end] \ + if 'segm' in batch_res else None + keypoint_res = batch_res['keypoint'][start:end] \ + if 'keypoint' in batch_res else None + pose3d_res = batch_res['pose3d'][start:end] \ + if 'pose3d' in batch_res else None + image = visualize_results( + image, bbox_res, mask_res, segm_res, keypoint_res, + pose3d_res, int(im_id), catid2name, draw_threshold) + self.status['result_image'] = np.array(image.copy()) + if self._compose_callback: + self._compose_callback.on_step_end(self.status) + # save image with detection + save_name = self._get_save_image_name(output_dir, + image_path) + logger.info("Detection bbox results save in {}".format( + save_name)) + image.save(save_name, quality=95) + + start = end + return results + + def _get_save_image_name(self, output_dir, image_path): + """ + Get save image name from source image path. 
+ """ + image_name = os.path.split(image_path)[-1] + name, ext = os.path.splitext(image_name) + return os.path.join(output_dir, "{}".format(name)) + ext + + def _get_infer_cfg_and_input_spec(self, + save_dir, + prune_input=True, + kl_quant=False): + image_shape = None + im_shape = [None, 2] + scale_factor = [None, 2] + test_reader_name = 'TestReader' + if 'inputs_def' in self.cfg[test_reader_name]: + inputs_def = self.cfg[test_reader_name]['inputs_def'] + image_shape = inputs_def.get('image_shape', None) + # set image_shape=[None, 3, -1, -1] as default + if image_shape is None: + image_shape = [None, 3, -1, -1] + + if len(image_shape) == 3: + image_shape = [None] + image_shape + else: + im_shape = [image_shape[0], 2] + scale_factor = [image_shape[0], 2] + + if hasattr(self.model, 'deploy'): + self.model.deploy = True + + for layer in self.model.sublayers(): + if hasattr(layer, 'convert_to_deploy'): + layer.convert_to_deploy() + + if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ + 'export'] and self.cfg['export']['fuse_conv_bn']: + self.model = fuse_conv_bn(self.model) + + export_post_process = self.cfg['export'].get( + 'post_process', False) if hasattr(self.cfg, 'export') else True + export_nms = self.cfg['export'].get('nms', False) if hasattr( + self.cfg, 'export') else True + export_benchmark = self.cfg['export'].get( + 'benchmark', False) if hasattr(self.cfg, 'export') else False + if hasattr(self.model, 'export_post_process'): + self.model.export_post_process = export_post_process if not export_benchmark else False + if hasattr(self.model, 'export_nms'): + self.model.export_nms = export_nms if not export_benchmark else False + if export_post_process and not export_benchmark: + image_shape = [None] + image_shape[1:] + + # Save infer cfg + _dump_infer_config(self.cfg, + os.path.join(save_dir, 'infer_cfg.yml'), image_shape, + self.model) + + input_spec = [{ + "image": InputSpec( + shape=image_shape, name='image'), + "im_shape": InputSpec( + 
shape=im_shape, name='im_shape'), + "scale_factor": InputSpec( + shape=scale_factor, name='scale_factor') + }] + + if prune_input: + static_model = paddle.jit.to_static( + self.model, input_spec=input_spec, full_graph=True) + # NOTE: dy2st do not pruned program, but jit.save will prune program + # input spec, prune input spec here and save with pruned input spec + pruned_input_spec = _prune_input_spec( + input_spec, static_model.forward.main_program, + static_model.forward.outputs) + else: + static_model = None + pruned_input_spec = input_spec + + return static_model, pruned_input_spec + + def export(self, output_dir='output_inference'): + if hasattr(self.model, 'aux_neck'): + self.model.__delattr__('aux_neck') + if hasattr(self.model, 'aux_head'): + self.model.__delattr__('aux_head') + self.model.eval() + + model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] + save_dir = os.path.join(output_dir, model_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec( + save_dir) + + # dy2st and save model + paddle.jit.save( + static_model, + os.path.join(save_dir, 'model'), + input_spec=pruned_input_spec) + + logger.info("Export model and saved in {}".format(save_dir)) diff --git a/rtdetr_paddle/ppdet/metrics/__init__.py b/rtdetr_paddle/ppdet/metrics/__init__.py new file mode 100644 index 0000000..afca8d0 --- /dev/null +++ b/rtdetr_paddle/ppdet/metrics/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import metrics + +from .metrics import * +from .pose3d_metrics import * + +from . import mot_metrics +from .mot_metrics import * +__all__ = metrics.__all__ + mot_metrics.__all__ + +from . import mcmot_metrics +from .mcmot_metrics import * +__all__ = metrics.__all__ + mcmot_metrics.__all__ \ No newline at end of file diff --git a/rtdetr_paddle/ppdet/metrics/coco_utils.py b/rtdetr_paddle/ppdet/metrics/coco_utils.py new file mode 100644 index 0000000..b7a4d7e --- /dev/null +++ b/rtdetr_paddle/ppdet/metrics/coco_utils.py @@ -0,0 +1,188 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import numpy as np +import itertools + +from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res +from ppdet.metrics.map_utils import draw_pr_curve + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + + +def get_infer_results(outs, catid, bias=0): + """ + Get result at the stage of inference. + The output format is dictionary containing bbox or mask result. + + For example, bbox result is a list and each element contains + image_id, category_id, bbox and score. + """ + if outs is None or len(outs) == 0: + raise ValueError( + 'The number of valid detection result if zero. Please use reasonable model and check input data.' + ) + + im_id = outs['im_id'] + + infer_res = {} + if 'bbox' in outs: + if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6: + infer_res['bbox'] = get_det_poly_res( + outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) + else: + infer_res['bbox'] = get_det_res( + outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) + + if 'mask' in outs: + # mask post process + infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'], + outs['bbox_num'], im_id, catid) + + if 'segm' in outs: + infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid) + + if 'keypoint' in outs: + infer_res['keypoint'] = get_keypoint_res(outs, im_id) + outs['bbox_num'] = [len(infer_res['keypoint'])] + + if 'pose3d' in outs: + infer_res['pose3d'] = get_pose3d_res(outs, im_id) + outs['bbox_num'] = [len(infer_res['pose3d'])] + + return infer_res + + +def cocoapi_eval(jsonfile, + style, + coco_gt=None, + anno_file=None, + max_dets=(100, 300, 1000), + classwise=False, + sigmas=None, + use_area=True): + """ + Args: + jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. 
+ style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`. + coco_gt (str): Whether to load COCOAPI through anno_file, + eg: coco_gt = COCO(anno_file) + anno_file (str): COCO annotations file. + max_dets (tuple): COCO evaluation maxDets. + classwise (bool): Whether per-category AP and draw P-R Curve or not. + sigmas (nparray): keypoint labelling sigmas. + use_area (bool): If gt annotations (eg. CrowdPose, AIC) + do not have 'area', please set use_area=False. + """ + assert coco_gt != None or anno_file != None + if style == 'keypoints_crowd': + #please install xtcocotools==1.6 + from xtcocotools.coco import COCO + from xtcocotools.cocoeval import COCOeval + else: + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + if coco_gt == None: + coco_gt = COCO(anno_file) + logger.info("Start evaluate...") + coco_dt = coco_gt.loadRes(jsonfile) + if style == 'proposal': + coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') + coco_eval.params.useCats = 0 + coco_eval.params.maxDets = list(max_dets) + elif style == 'keypoints_crowd': + coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area) + else: + coco_eval = COCOeval(coco_gt, coco_dt, style) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if classwise: + # Compute per-category AP and PR curve + try: + from terminaltables import AsciiTable + except Exception as e: + logger.error( + 'terminaltables not found, plaese install terminaltables. 
' + 'for example: `pip install terminaltables`.') + raise e + precisions = coco_eval.eval['precision'] + cat_ids = coco_gt.getCatIds() + # precision: (iou, recall, cls, area range, max dets) + assert len(cat_ids) == precisions.shape[2] + results_per_category = [] + for idx, catId in enumerate(cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = coco_gt.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (str(nm["name"]), '{:0.3f}'.format(float(ap)))) + pr_array = precisions[0, :, idx, 0, 2] + recall_array = np.arange(0.0, 1.01, 0.01) + draw_pr_curve( + pr_array, + recall_array, + out_dir=style + '_pr_curve', + file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest( + * [results_flatten[i::num_columns] for i in range(num_columns)]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) + logger.info("per-category PR curve has output to {} folder.".format( + style + '_pr_curve')) + # flush coco evaluation result + sys.stdout.flush() + return coco_eval.stats + + +def json_eval_results(metric, json_directory, dataset): + """ + cocoapi eval with already exists proposal.json, bbox.json or mask.json + """ + assert metric == 'COCO' + anno_file = dataset.get_anno() + json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] + if json_directory: + assert os.path.exists( + json_directory), "The json directory:{} does not exist".format( + json_directory) + for k, v in enumerate(json_file_list): + json_file_list[k] = 
os.path.join(str(json_directory), v) + + coco_eval_style = ['proposal', 'bbox', 'segm'] + for i, v_json in enumerate(json_file_list): + if os.path.exists(v_json): + cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) + else: + logger.info("{} not exists!".format(v_json)) diff --git a/rtdetr_paddle/ppdet/metrics/json_results.py b/rtdetr_paddle/ppdet/metrics/json_results.py new file mode 100755 index 0000000..d2575af --- /dev/null +++ b/rtdetr_paddle/ppdet/metrics/json_results.py @@ -0,0 +1,175 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import six +import numpy as np + + +def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, xmin, ymin, xmax, ymax = dt.tolist() + if int(num_id) < 0: + continue + category_id = label_to_cat_id_map[int(num_id)] + w = xmax - xmin + bias + h = ymax - ymin + bias + bbox = [xmin, ymin, w, h] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': bbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): + det_res = [] + k = 0 + for i in range(len(bbox_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = bbox_nums[i] + for j in range(det_nums): + dt = bboxes[k] + k = k + 1 + num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() + if int(num_id) < 0: + continue + category_id = label_to_cat_id_map[int(num_id)] + rbox = [x1, y1, x2, y2, x3, y3, x4, y4] + dt_res = { + 'image_id': cur_image_id, + 'category_id': category_id, + 'bbox': rbox, + 'score': score + } + det_res.append(dt_res) + return det_res + + +def strip_mask(mask): + row = mask[0, 0, :] + col = mask[0, :, 0] + im_h = len(col) - np.count_nonzero(col == -1) + im_w = len(row) - np.count_nonzero(row == -1) + return mask[:, :im_h, :im_w] + + +def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): + import pycocotools.mask as mask_util + seg_res = [] + k = 0 + for i in range(len(mask_nums)): + cur_image_id = int(image_id[i][0]) + det_nums = mask_nums[i] + mask_i = masks[k:k + det_nums] + mask_i = strip_mask(mask_i) + for j in range(det_nums): + mask = mask_i[j].astype(np.uint8) + score = float(bboxes[k][1]) + label = int(bboxes[k][0]) + k = k + 1 + if label == -1: + continue + cat_id = label_to_cat_id_map[label] + rle = mask_util.encode( + np.array( + mask[:, 
:, None], order="F", dtype="uint8"))[0] + if six.PY3: + if 'counts' in rle: + rle['counts'] = rle['counts'].decode("utf8") + sg_res = { + 'image_id': cur_image_id, + 'category_id': cat_id, + 'segmentation': rle, + 'score': score + } + seg_res.append(sg_res) + return seg_res + + +def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): + import pycocotools.mask as mask_util + segm_res = [] + # for each batch + segms = results['segm'].astype(np.uint8) + clsid_labels = results['cate_label'] + clsid_scores = results['cate_score'] + lengths = segms.shape[0] + im_id = int(image_id[0][0]) + if lengths == 0 or segms is None: + return None + # for each sample + for i in range(lengths - 1): + clsid = int(clsid_labels[i]) + catid = num_id_to_cat_id_map[clsid] + score = float(clsid_scores[i]) + mask = segms[i] + segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] + segm['counts'] = segm['counts'].decode('utf8') + coco_res = { + 'image_id': im_id, + 'category_id': catid, + 'segmentation': segm, + 'score': score + } + segm_res.append(coco_res) + return segm_res + + +def get_keypoint_res(results, im_id): + anns = [] + preds = results['keypoint'] + for idx in range(im_id.shape[0]): + image_id = im_id[idx].item() + kpts, scores = preds[idx] + for kpt, score in zip(kpts, scores): + kpt = kpt.flatten() + ann = { + 'image_id': image_id, + 'category_id': 1, # XXX hard code + 'keypoints': kpt.tolist(), + 'score': float(score) + } + x = kpt[0::3] + y = kpt[1::3] + x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), np.min(y).item( + ), np.max(y).item() + ann['area'] = (x1 - x0) * (y1 - y0) + ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] + anns.append(ann) + return anns + + +def get_pose3d_res(results, im_id): + anns = [] + preds = results['pose3d'] + for idx in range(im_id.shape[0]): + image_id = im_id[idx].item() + pose3d = preds[idx] + ann = { + 'image_id': image_id, + 'category_id': 1, # XXX hard code + 'pose3d': pose3d.tolist(), + 'score': float(1.) 
+ } + anns.append(ann) + return anns diff --git a/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py b/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py new file mode 100644 index 0000000..cbd52d0 --- /dev/null +++ b/rtdetr_paddle/ppdet/metrics/keypoint_metrics.py @@ -0,0 +1,410 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +from collections import defaultdict, OrderedDict +import numpy as np +import paddle +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from ..modeling.keypoint_utils import oks_nms +from scipy.io import loadmat, savemat +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['KeyPointTopDownCOCOEval', 'KeyPointTopDownMPIIEval'] + + +class KeyPointTopDownCOCOEval(object): + """refer to + https://github.com/leoxiaobin/deep-high-resolution-net.pytorch + Copyright (c) Microsoft, under the MIT License. 
+ """ + + def __init__(self, + anno_file, + num_samples, + num_joints, + output_eval, + iou_type='keypoints', + in_vis_thre=0.2, + oks_thre=0.9, + save_prediction_only=False): + super(KeyPointTopDownCOCOEval, self).__init__() + self.coco = COCO(anno_file) + self.num_samples = num_samples + self.num_joints = num_joints + self.iou_type = iou_type + self.in_vis_thre = in_vis_thre + self.oks_thre = oks_thre + self.output_eval = output_eval + self.res_file = os.path.join(output_eval, "keypoints_results.json") + self.save_prediction_only = save_prediction_only + self.reset() + + def reset(self): + self.results = { + 'all_preds': np.zeros( + (self.num_samples, self.num_joints, 3), dtype=np.float32), + 'all_boxes': np.zeros((self.num_samples, 6)), + 'image_path': [] + } + self.eval_results = {} + self.idx = 0 + + def update(self, inputs, outputs): + kpts, _ = outputs['keypoint'][0] + + num_images = inputs['image'].shape[0] + self.results['all_preds'][self.idx:self.idx + num_images, :, 0: + 3] = kpts[:, :, 0:3] + self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[ + 'center'].numpy()[:, 0:2] if isinstance( + inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2] + self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[ + 'scale'].numpy()[:, 0:2] if isinstance( + inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2] + self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod( + inputs['scale'].numpy() * 200, + 1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod( + inputs['scale'] * 200, 1) + self.results['all_boxes'][ + self.idx:self.idx + num_images, + 5] = np.squeeze(inputs['score'].numpy()) if isinstance( + inputs['score'], paddle.Tensor) else np.squeeze(inputs['score']) + if isinstance(inputs['im_id'], paddle.Tensor): + self.results['image_path'].extend(inputs['im_id'].numpy()) + else: + self.results['image_path'].extend(inputs['im_id']) + self.idx += num_images + + def 
_write_coco_keypoint_results(self, keypoints): + data_pack = [{ + 'cat_id': 1, + 'cls': 'person', + 'ann_type': 'keypoints', + 'keypoints': keypoints + }] + results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) + if not os.path.exists(self.output_eval): + os.makedirs(self.output_eval) + with open(self.res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + logger.info(f'The keypoint result is saved to {self.res_file}.') + try: + json.load(open(self.res_file)) + except Exception: + content = [] + with open(self.res_file, 'r') as f: + for line in f: + content.append(line) + content[-1] = ']' + with open(self.res_file, 'w') as f: + for c in content: + f.write(c) + + def _coco_keypoint_results_one_category_kernel(self, data_pack): + cat_id = data_pack['cat_id'] + keypoints = data_pack['keypoints'] + cat_results = [] + + for img_kpts in keypoints: + if len(img_kpts) == 0: + continue + + _key_points = np.array( + [img_kpts[k]['keypoints'] for k in range(len(img_kpts))]) + _key_points = _key_points.reshape(_key_points.shape[0], -1) + + result = [{ + 'image_id': img_kpts[k]['image'], + 'category_id': cat_id, + 'keypoints': _key_points[k].tolist(), + 'score': img_kpts[k]['score'], + 'center': list(img_kpts[k]['center']), + 'scale': list(img_kpts[k]['scale']) + } for k in range(len(img_kpts))] + cat_results.extend(result) + + return cat_results + + def get_final_results(self, preds, all_boxes, img_path): + _kpts = [] + for idx, kpt in enumerate(preds): + _kpts.append({ + 'keypoints': kpt, + 'center': all_boxes[idx][0:2], + 'scale': all_boxes[idx][2:4], + 'area': all_boxes[idx][4], + 'score': all_boxes[idx][5], + 'image': int(img_path[idx]) + }) + # image x person x (keypoints) + kpts = defaultdict(list) + for kpt in _kpts: + kpts[kpt['image']].append(kpt) + + # rescoring and oks nms + num_joints = preds.shape[1] + in_vis_thre = self.in_vis_thre + oks_thre = self.oks_thre + oks_nmsed_kpts = [] + for img in kpts.keys(): + img_kpts = kpts[img] 
+ for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > in_vis_thre: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))], + oks_thre) + + if len(keep) == 0: + oks_nmsed_kpts.append(img_kpts) + else: + oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep]) + + self._write_coco_keypoint_results(oks_nmsed_kpts) + + def accumulate(self): + self.get_final_results(self.results['all_preds'], + self.results['all_boxes'], + self.results['image_path']) + if self.save_prediction_only: + logger.info(f'The keypoint result is saved to {self.res_file} ' + 'and do not evaluate the mAP.') + return + coco_dt = self.coco.loadRes(self.res_file) + coco_eval = COCOeval(self.coco, coco_dt, 'keypoints') + coco_eval.params.useSegm = None + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + keypoint_stats = [] + for ind in range(len(coco_eval.stats)): + keypoint_stats.append((coco_eval.stats[ind])) + self.eval_results['keypoint'] = keypoint_stats + + def log(self): + if self.save_prediction_only: + return + stats_names = [ + 'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] + num_values = len(stats_names) + print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') + print('|---' * (num_values + 1) + '|') + + print(' '.join([ + '| {:.3f}'.format(value) for value in self.eval_results['keypoint'] + ]) + ' |') + + def get_results(self): + return self.eval_results + + +class KeyPointTopDownMPIIEval(object): + def __init__(self, + anno_file, + num_samples, + num_joints, + output_eval, + oks_thre=0.9, + save_prediction_only=False): + super(KeyPointTopDownMPIIEval, self).__init__() + self.ann_file = anno_file + self.res_file = 
os.path.join(output_eval, "keypoints_results.json") + self.save_prediction_only = save_prediction_only + self.reset() + + def reset(self): + self.results = [] + self.eval_results = {} + self.idx = 0 + + def update(self, inputs, outputs): + kpts, _ = outputs['keypoint'][0] + + num_images = inputs['image'].shape[0] + results = {} + results['preds'] = kpts[:, :, 0:3] + results['boxes'] = np.zeros((num_images, 6)) + results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2] + results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2] + results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1) + results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy()) + results['image_path'] = inputs['image_file'] + + self.results.append(results) + + def accumulate(self): + self._mpii_keypoint_results_save() + if self.save_prediction_only: + logger.info(f'The keypoint result is saved to {self.res_file} ' + 'and do not evaluate the mAP.') + return + + self.eval_results = self.evaluate(self.results) + + def _mpii_keypoint_results_save(self): + results = [] + for res in self.results: + if len(res) == 0: + continue + result = [{ + 'preds': res['preds'][k].tolist(), + 'boxes': res['boxes'][k].tolist(), + 'image_path': res['image_path'][k], + } for k in range(len(res))] + results.extend(result) + with open(self.res_file, 'w') as f: + json.dump(results, f, sort_keys=True, indent=4) + logger.info(f'The keypoint result is saved to {self.res_file}.') + + def log(self): + if self.save_prediction_only: + return + for item, value in self.eval_results.items(): + print("{} : {}".format(item, value)) + + def get_results(self): + return self.eval_results + + def evaluate(self, outputs, savepath=None): + """Evaluate PCKh for MPII dataset. refer to + https://github.com/leoxiaobin/deep-high-resolution-net.pytorch + Copyright (c) Microsoft, under the MIT License. 
+ + Args: + outputs(list(preds, boxes)): + + * preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + + Returns: + dict: PCKh for each joint + """ + + kpts = [] + for output in outputs: + preds = output['preds'] + batch_size = preds.shape[0] + for i in range(batch_size): + kpts.append({'keypoints': preds[i]}) + + preds = np.stack([kpt['keypoints'] for kpt in kpts]) + + # convert 0-based index to 1-based index, + # and get the first two dimensions. + preds = preds[..., :2] + 1.0 + + if savepath is not None: + pred_file = os.path.join(savepath, 'pred.mat') + savemat(pred_file, mdict={'preds': preds}) + + SC_BIAS = 0.6 + threshold = 0.5 + + gt_file = os.path.join( + os.path.dirname(self.ann_file), 'mpii_gt_val.mat') + gt_dict = loadmat(gt_file) + dataset_joints = gt_dict['dataset_joints'] + jnt_missing = gt_dict['jnt_missing'] + pos_gt_src = gt_dict['pos_gt_src'] + headboxes_src = gt_dict['headboxes_src'] + + pos_pred_src = np.transpose(preds, [1, 2, 0]) + + head = np.where(dataset_joints == 'head')[1][0] + lsho = np.where(dataset_joints == 'lsho')[1][0] + lelb = np.where(dataset_joints == 'lelb')[1][0] + lwri = np.where(dataset_joints == 'lwri')[1][0] + lhip = np.where(dataset_joints == 'lhip')[1][0] + lkne = np.where(dataset_joints == 'lkne')[1][0] + lank = np.where(dataset_joints == 'lank')[1][0] + + rsho = np.where(dataset_joints == 'rsho')[1][0] + relb = np.where(dataset_joints == 'relb')[1][0] + rwri = np.where(dataset_joints == 'rwri')[1][0] + rkne = np.where(dataset_joints == 'rkne')[1][0] + rank = np.where(dataset_joints == 'rank')[1][0] + rhip = np.where(dataset_joints == 'rhip')[1][0] + + jnt_visible = 1 - jnt_missing + uv_error = pos_pred_src - pos_gt_src + uv_err = np.linalg.norm(uv_error, axis=1) + headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] + headsizes = np.linalg.norm(headsizes, axis=0) + 
headsizes *= SC_BIAS + scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) + scaled_uv_err = uv_err / scale + scaled_uv_err = scaled_uv_err * jnt_visible + jnt_count = np.sum(jnt_visible, axis=1) + less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible + PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count + + # save + rng = np.arange(0, 0.5 + 0.01, 0.01) + pckAll = np.zeros((len(rng), 16), dtype=np.float32) + + for r, threshold in enumerate(rng): + less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible + pckAll[r, :] = 100. * np.sum(less_than_threshold, + axis=1) / jnt_count + + PCKh = np.ma.array(PCKh, mask=False) + PCKh.mask[6:8] = True + + jnt_count = np.ma.array(jnt_count, mask=False) + jnt_count.mask[6:8] = True + jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) + + name_value = [ #noqa + ('Head', PCKh[head]), + ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), + ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), + ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), + ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), + ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), + ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), + ('PCKh', np.sum(PCKh * jnt_ratio)), + ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio)) + ] + name_value = OrderedDict(name_value) + + return name_value + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/rtdetr_paddle/ppdet/metrics/map_utils.py b/rtdetr_paddle/ppdet/metrics/map_utils.py new file mode 100644 index 0000000..77ccf5e --- /dev/null +++ b/rtdetr_paddle/ppdet/metrics/map_utils.py @@ -0,0 +1,397 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
def draw_pr_curve(precision,
                  recall,
                  iou=0.5,
                  out_dir='pr_curve',
                  file_name='precision_recall_curve.jpg'):
    """
    Plot a precision/recall curve and save it as an image file.

    Args:
        precision (list): Precision values, drawn on the Y axis.
        recall (list): Recall values, drawn on the X axis.
        iou (float): IoU threshold shown in the figure title. Default 0.5.
        out_dir (str): Output directory, created when it does not exist.
        file_name (str): Name of the image file written into ``out_dir``.

    Raises:
        Exception: Whatever the matplotlib import raised, after logging
            an installation hint.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)
    output_path = os.path.join(out_dir, file_name)
    try:
        import matplotlib.pyplot as plt
    except Exception:
        logger.error('Matplotlib not found, plaese install matplotlib.'
                     'for example: `pip install matplotlib`.')
        raise
    # Activate the named figure first and clear it afterwards: clearing
    # before selecting would create an extra, never-used default figure on
    # the first call.
    plt.figure('P-R Curve')
    plt.cla()
    plt.title('Precision/Recall Curve(IoU={})'.format(iou))
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.grid(True)
    plt.plot(recall, precision)
    plt.savefig(output_path)
def bbox_area(bbox, is_bbox_normalized):
    """
    Return the area of a box given as [xmin, ymin, xmax, ymax].

    For boxes that are not normalized, one unit is added to both the width
    and the height before multiplying.
    """
    offset = 1. - float(is_bbox_normalized)
    box_w = bbox[2] - bbox[0] + offset
    box_h = bbox[3] - bbox[1] + offset
    return box_w * box_h


def jaccard_overlap(pred, gt, is_bbox_normalized=False):
    """
    Return the jaccard overlap (IoU) of two [xmin, ymin, xmax, ymax] boxes.

    Boxes that do not intersect, or only touch, give 0.
    """
    separated = (pred[0] >= gt[2] or pred[2] <= gt[0] or
                 pred[1] >= gt[3] or pred[3] <= gt[1])
    if separated:
        return 0.

    intersection = [
        max(pred[0], gt[0]),
        max(pred[1], gt[1]),
        min(pred[2], gt[2]),
        min(pred[3], gt[3]),
    ]
    inter_size = bbox_area(intersection, is_bbox_normalized)
    pred_size = bbox_area(pred, is_bbox_normalized)
    gt_size = bbox_area(gt, is_bbox_normalized)
    return float(inter_size) / (pred_size + gt_size - inter_size)


def prune_zero_padding(gt_box, gt_label, difficult=None):
    """
    Cut ground-truth arrays at the first all-zero box.

    Returns a tuple (boxes, labels, difficult) holding only the leading
    entries before that box; ``difficult`` stays None when it was not given.
    """
    valid_cnt = 0
    for box in gt_box:
        if (box == 0).all():
            break
        valid_cnt += 1

    kept_difficult = None
    if difficult is not None:
        kept_difficult = difficult[:valid_cnt]
    return (gt_box[:valid_cnt], gt_label[:valid_cnt], kept_difficult)
class DetectionMAP(object):
    """
    Calculate detection mean average precision.
    Currently support two types: 11point and integral

    Args:
        class_num (int): The class number.
        overlap_thresh (float): The threshold of overlap
            ratio between prediction bounding box and
            ground truth bounding box for deciding
            true/false positive. Default 0.5.
        map_type (str): Calculation method of mean average
            precision, currently support '11point' and
            'integral'. Default '11point'.
        is_bbox_normalized (bool): Whether bounding boxes
            is normalized to range[0, 1]. Default False.
        evaluate_difficult (bool): Whether to evaluate
            difficult bounding boxes. Default False.
        catid2name (dict): Mapping between category id and category name.
            May be None, in which case classes are reported by index.
        classwise (bool): Whether per-category AP and draw
            P-R Curve or not.
    """

    def __init__(self,
                 class_num,
                 overlap_thresh=0.5,
                 map_type='11point',
                 is_bbox_normalized=False,
                 evaluate_difficult=False,
                 catid2name=None,
                 classwise=False):
        self.class_num = class_num
        self.overlap_thresh = overlap_thresh
        assert map_type in ['11point', 'integral'], \
            "map_type currently only support '11point' "\
            "and 'integral'"
        self.map_type = map_type
        self.is_bbox_normalized = is_bbox_normalized
        self.evaluate_difficult = evaluate_difficult
        self.classwise = classwise
        # catid2name defaults to None, so it must not be dereferenced
        # unconditionally.
        self.classes = list(catid2name.values()) if catid2name else []
        self.reset()

    @staticmethod
    def _to_int(value):
        """Convert a scalar or a one-element array to a Python int."""
        # .item() avoids int() on an array with ndim > 0, which NumPy
        # deprecates.
        return int(np.asarray(value).item())

    def update(self, bbox, score, label, gt_box, gt_label, difficult=None):
        """
        Update metric statics from given prediction and ground
        truth infomations.

        Args:
            bbox: Predicted boxes, one per prediction.
            score: Confidence of each prediction.
            label: Class index of each prediction.
            gt_box: Ground-truth boxes.
            gt_label: Class index of each ground-truth box.
            difficult: Optional per-ground-truth "difficult" flags;
                all boxes are treated as not difficult when None.

        Raises:
            NotImplementedError: If a ground-truth box has 8 values
                (rotated box); no rotated IoU is defined in this module.
        """
        if difficult is None:
            difficult = np.zeros_like(gt_label)

        # record class gt count
        for gtl, diff in zip(gt_label, difficult):
            if self.evaluate_difficult or self._to_int(diff) == 0:
                self.class_gt_counts[self._to_int(gtl)] += 1

        # record class score positive
        visited = [False] * len(gt_label)
        for b, s, l in zip(bbox, score, label):
            pred = b.tolist() if isinstance(b, np.ndarray) else b
            pred_label = self._to_int(l)
            max_idx = -1
            max_overlap = -1.0
            for i, gl in enumerate(gt_label):
                if self._to_int(gl) != pred_label:
                    continue
                if len(gt_box[i]) == 8:
                    # The original code called calc_rbox_iou here, a name
                    # that this module neither defines nor imports.
                    raise NotImplementedError(
                        'Rotated box IoU (8-value boxes) is not available '
                        'in this module.')
                overlap = jaccard_overlap(pred, gt_box[i],
                                          self.is_bbox_normalized)
                if overlap > max_overlap:
                    max_overlap = overlap
                    max_idx = i

            if max_overlap > self.overlap_thresh:
                # A match with a "difficult" box is ignored entirely unless
                # difficult boxes are evaluated.
                if self.evaluate_difficult or \
                        self._to_int(difficult[max_idx]) == 0:
                    if not visited[max_idx]:
                        self.class_score_poss[pred_label].append([s, 1.0])
                        visited[max_idx] = True
                    else:
                        # a ground-truth box can be matched only once
                        self.class_score_poss[pred_label].append([s, 0.0])
            else:
                self.class_score_poss[pred_label].append([s, 0.0])

    def reset(self):
        """
        Reset metric statics
        """
        self.class_score_poss = [[] for _ in range(self.class_num)]
        self.class_gt_counts = [0] * self.class_num
        self.mAP = 0.0
        # Defined here so get_map() can run before accumulate().
        self.eval_results = []

    def accumulate(self):
        """
        Accumulate metric results and calculate mAP
        """
        mAP = 0.
        valid_cnt = 0
        eval_results = []
        for cls_idx, (score_pos, count) in enumerate(
                zip(self.class_score_poss, self.class_gt_counts)):
            if count == 0:
                continue
            if len(score_pos) == 0:
                # class has ground truth but no prediction: AP is 0
                valid_cnt += 1
                continue

            accum_tp_list, accum_fp_list = \
                self._get_tp_fp_accum(score_pos)
            precision = []
            recall = []
            for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list):
                precision.append(float(ac_tp) / (ac_tp + ac_fp))
                recall.append(float(ac_tp) / count)

            one_class_ap = 0.0
            if self.map_type == '11point':
                max_precisions = [0.] * 11
                start_idx = len(precision) - 1
                for j in range(10, -1, -1):
                    for i in range(start_idx, -1, -1):
                        if recall[i] < float(j) / 10.:
                            start_idx = i
                            if j > 0:
                                max_precisions[j - 1] = max_precisions[j]
                                break
                        else:
                            if max_precisions[j] < precision[i]:
                                max_precisions[j] = precision[i]
                one_class_ap = sum(max_precisions) / 11.
            elif self.map_type == 'integral':
                prev_recall = 0.
                for prec, rec in zip(precision, recall):
                    recall_gap = abs(rec - prev_recall)
                    if recall_gap > 1e-6:
                        one_class_ap += prec * recall_gap
                        prev_recall = rec
            else:
                logger.error("Unspported mAP type {}".format(self.map_type))
                sys.exit(1)
            mAP += one_class_ap
            valid_cnt += 1

            # Name the result by the class index itself. Indexing with the
            # running count of valid classes mislabels every class that
            # follows one without ground truth.
            if cls_idx < len(self.classes):
                class_name = self.classes[cls_idx]
            else:
                class_name = str(cls_idx)
            eval_results.append({
                'class': class_name,
                'ap': one_class_ap,
                'precision': precision,
                'recall': recall,
            })
        self.eval_results = eval_results
        self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP

    def get_map(self):
        """
        Get mAP result

        When ``classwise`` is set, a per-category AP table is logged and one
        P-R curve per category is written to the 'voc_pr_curve' folder.
        """
        if self.mAP is None:
            logger.error("mAP is not calculated.")
        if self.classwise:
            # Compute per-category AP and PR curve
            try:
                from terminaltables import AsciiTable
            except Exception as e:
                logger.error(
                    'terminaltables not found, plaese install terminaltables. '
                    'for example: `pip install terminaltables`.')
                raise e
            results_per_category = []
            for eval_result in self.eval_results:
                results_per_category.append(
                    (str(eval_result['class']),
                     '{:0.3f}'.format(float(eval_result['ap']))))
                draw_pr_curve(
                    eval_result['precision'],
                    eval_result['recall'],
                    out_dir='voc_pr_curve',
                    file_name='{}_precision_recall_curve.jpg'.format(
                        eval_result['class']))

            # lay the (category, AP) pairs out in up to three column pairs
            num_columns = min(6, len(results_per_category) * 2)
            results_flatten = list(itertools.chain(*results_per_category))
            headers = ['category', 'AP'] * (num_columns // 2)
            results_2d = itertools.zip_longest(* [
                results_flatten[i::num_columns] for i in range(num_columns)
            ])
            table_data = [headers]
            table_data += [result for result in results_2d]
            table = AsciiTable(table_data)
            logger.info('Per-category of VOC AP: \n{}'.format(table.table))
            logger.info(
                "per-category PR curve has output to voc_pr_curve folder.")
        return self.mAP

    def _get_tp_fp_accum(self, score_pos_list):
        """
        Calculate accumulating true/false positive results from
        [score, pos] records

        Returns:
            tuple(list, list): Running true-positive and false-positive
                counts, ordered by descending score.
        """
        sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True)
        accum_tp = 0
        accum_fp = 0
        accum_tp_list = []
        accum_fp_list = []
        for (score, pos) in sorted_list:
            accum_tp += int(pos)
            accum_tp_list.append(accum_tp)
            accum_fp += 1 - int(pos)
            accum_fp_list.append(accum_fp)
        return accum_tp_list, accum_fp_list
def ap_per_class(tp, conf, pred_cls, target_cls):
    """
    Computes the average precision, recall and precision for every class.
    Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics.

    Args:
        tp (list): True positives.
        conf (list): Objectness value from 0-1.
        pred_cls (list): Predicted object classes.
        target_cls (list): Target object classes.

    Returns:
        tuple: (ap, classes, recall, precision). ``classes`` holds every
            class seen in the predictions or targets as int32; the other
            three arrays hold one value per class.
    """
    tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(
        pred_cls), np.array(target_cls)

    # Sort by objectness
    order = np.argsort(-conf)
    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]

    # Find unique classes
    unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0))

    # Create Precision-Recall curve and compute AP for each class
    ap, p, r = [], [], []
    for c in unique_classes:
        mask = pred_cls == c
        # count_nonzero runs inside NumPy; the built-in sum() iterates the
        # array element by element in Python.
        n_gt = np.count_nonzero(target_cls == c)  # ground truth objects
        n_p = np.count_nonzero(mask)  # predicted objects

        if (n_p == 0) and (n_gt == 0):
            continue
        elif (n_p == 0) or (n_gt == 0):
            ap.append(0)
            r.append(0)
            p.append(0)
        else:
            # Accumulate FPs and TPs
            fpc = np.cumsum(1 - tp[mask])
            tpc = np.cumsum(tp[mask])

            # Recall
            recall_curve = tpc / (n_gt + 1e-16)
            r.append(tpc[-1] / (n_gt + 1e-16))

            # Precision
            precision_curve = tpc / (tpc + fpc)
            p.append(tpc[-1] / (tpc[-1] + fpc[-1]))

            # AP from recall-precision curve
            ap.append(compute_ap(recall_curve, precision_curve))

    return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(
        p)


def compute_ap(recall, precision):
    """
    Computes the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.

    Args:
        recall (list): The recall curve.
        precision (list): The precision curve.

    Returns:
        The average precision as computed in py-faster-rcnn.
    """
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # compute the precision envelope: every entry becomes the maximum of
    # itself and all entries to its right (a reversed running maximum)
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    idx = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])
    return ap
# Metrics requested from motmetrics for every accumulator.
METRICS_LIST = [
    'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend',
    'num_migrate', 'num_false_positives', 'num_misses', 'num_detections',
    'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked',
    'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota',
    'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1'
]

# Column titles used when a summary is rendered as text.
NAME_MAP = {
    'num_frames': 'num_frames',
    'num_matches': 'num_matches',
    'num_switches': 'IDs',
    'num_transfer': 'IDt',
    'num_ascend': 'IDa',
    'num_migrate': 'IDm',
    'num_false_positives': 'FP',
    'num_misses': 'FN',
    'num_detections': 'num_detections',
    'num_objects': 'num_objects',
    'num_predictions': 'num_predictions',
    'num_unique_objects': 'GT',
    'mostly_tracked': 'MT',
    'partially_tracked': 'partially_tracked',
    'mostly_lost': 'ML',
    'num_fragmentations': 'FM',
    'motp': 'MOTP',
    'mota': 'MOTA',
    'precision': 'Prcn',
    'recall': 'Rcll',
    'idfp': 'idfp',
    'idfn': 'idfn',
    'idtp': 'idtp',
    'idp': 'IDP',
    'idr': 'IDR',
    'idf1': 'IDF1'
}


def parse_accs_metrics(seq_acc, index_name, verbose=False):
    """
    Parse the evaluation indicators of multiple MOTAccumulator

    Builds the summary for ``seq_acc`` (rows named by ``index_name`` plus an
    'OVERALL' row) and overwrites the OVERALL 'motp' with a value weighted
    by 'num_detections'. The summary is printed when ``verbose`` is set.
    """
    mh = mm.metrics.create()
    summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST)
    # NOTE(review): the sum below runs over every row of the summary,
    # presumably including the 'OVERALL' row itself - confirm this is the
    # intended weighting.
    weighted_motp = (summary['motp'] * summary['num_detections']).sum()
    summary.loc['OVERALL', 'motp'] = \
        weighted_motp / summary.loc['OVERALL', 'num_detections']
    if verbose:
        print(
            mm.io.render_summary(
                summary, formatters=mh.formatters, namemap=NAME_MAP))

    return summary


def seqs_overall_metrics(summary_df, verbose=False):
    """
    Calculate overall metrics for multiple sequences

    Returns a copy of ``summary_df`` with one extra row, 'overall_calc':
    count columns are summed and ratio columns are recomputed from those
    sums through ``MCMOTMetricOverall``.
    """
    summed_cols = [
        'num_frames', 'num_matches', 'num_switches', 'num_transfer',
        'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses',
        'num_detections', 'num_objects', 'num_predictions',
        'num_unique_objects', 'mostly_tracked', 'partially_tracked',
        'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp'
    ]
    derived_cols = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1']
    calc_df = summary_df.copy()

    overall_dic = {name: calc_df[name].sum() for name in summed_cols}
    for name in derived_cols:
        overall_fn = getattr(MCMOTMetricOverall, name + '_overall')
        overall_dic[name] = overall_fn(calc_df, overall_dic)

    overall_row = pd.DataFrame(overall_dic, index=['overall_calc'])
    calc_df = pd.concat([calc_df, overall_row])

    if verbose:
        mh = mm.metrics.create()
        print(
            mm.io.render_summary(
                calc_df, formatters=mh.formatters, namemap=NAME_MAP))

    return calc_df
class MCMOTMetricOverall(object):
    """
    Overall (multi-sequence) versions of the ratio metrics.

    Every function takes the per-sequence summary frame and a dict of the
    already summed count columns. They are looked up by name on the class,
    so they are declared static; without that they could not be called on
    an instance.
    """

    @staticmethod
    def motp_overall(summary_df, overall_dic):
        """MOTP of all rows, weighted by each row's 'num_detections'."""
        motp = quiet_divide((summary_df['motp'] *
                             summary_df['num_detections']).sum(),
                            overall_dic['num_detections'])
        return motp

    @staticmethod
    def mota_overall(summary_df, overall_dic):
        """1 - (misses + switches + false positives) / objects."""
        mota = 1. - quiet_divide(
            (overall_dic['num_misses'] + overall_dic['num_switches'] +
             overall_dic['num_false_positives']), overall_dic['num_objects'])
        return mota

    @staticmethod
    def precision_overall(summary_df, overall_dic):
        """detections / (false positives + detections)."""
        precision = quiet_divide(overall_dic['num_detections'], (
            overall_dic['num_false_positives'] + overall_dic['num_detections']))
        return precision

    @staticmethod
    def recall_overall(summary_df, overall_dic):
        """detections / objects."""
        recall = quiet_divide(overall_dic['num_detections'],
                              overall_dic['num_objects'])
        return recall

    @staticmethod
    def idp_overall(summary_df, overall_dic):
        """idtp / (idtp + idfp)."""
        idp = quiet_divide(overall_dic['idtp'],
                           (overall_dic['idtp'] + overall_dic['idfp']))
        return idp

    @staticmethod
    def idr_overall(summary_df, overall_dic):
        """idtp / (idtp + idfn)."""
        idr = quiet_divide(overall_dic['idtp'],
                           (overall_dic['idtp'] + overall_dic['idfn']))
        return idr

    @staticmethod
    def idf1_overall(summary_df, overall_dic):
        """2 * idtp / (objects + predictions)."""
        idf1 = quiet_divide(2. * overall_dic['idtp'], (
            overall_dic['num_objects'] + overall_dic['num_predictions']))
        return idf1
def read_mcmot_results_union(filename, is_gt, is_ignore):
    """
    Read an MCMOT result file with all categories merged into one dict.

    Each row is expected to hold at least 8 comma separated values:
    frame id, track id, four box values, score, class id. Track ids are
    shifted per class so that ids of different classes never collide.

    Args:
        filename (str): Path of the result file.
        is_gt (bool): Treat the file as ground truth: rows of class 0 are
            dropped, remaining class ids are reduced by 1 and the score is 1.
        is_ignore (bool): When set, nothing is read.

    Returns:
        dict: frame id -> list of (tlwh, target_id, cls, score). Empty when
            the file is missing, empty or has fewer than 8 columns.
    """
    results_dict = dict()
    if not os.path.isfile(filename):
        return results_dict

    # ndmin=2 keeps a one-row file two-dimensional; without it shape[1]
    # does not exist for such a file.
    all_result = np.loadtxt(filename, delimiter=',', ndmin=2)
    # Column 7 (class id) is read below, so 8 columns are required.
    if all_result.size == 0 or all_result.shape[1] < 8:
        return results_dict
    if is_ignore:
        return results_dict
    if is_gt:
        # only for test use
        all_result = all_result[all_result[:, 7] != 0]
        all_result[:, 7] = all_result[:, 7] - 1

    if all_result.shape[0] == 0:
        return results_dict

    class_unique = np.unique(all_result[:, 7])

    last_max_id = 0
    result_cls_list = []
    for cls in class_unique:
        result_cls_split = all_result[all_result[:, 7] == cls]
        result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id
        # make sure track id different between every category
        last_max_id = result_cls_split[:, 1].max() + 1
        result_cls_list.append(result_cls_split)

    results_con = np.concatenate(result_cls_list)

    for linelist in results_con:
        fid = int(linelist[0])
        if fid < 1:
            continue

        score = 1 if is_gt else float(linelist[6])
        tlwh = tuple(map(float, linelist[2:6]))
        target_id = int(linelist[1])
        cls = int(linelist[7])

        results_dict.setdefault(fid, list()).append(
            (tlwh, target_id, cls, score))

    return results_dict


def read_mcmot_results(filename, is_gt, is_ignore):
    """
    Read an MCMOT result file grouped by category.

    Args:
        filename (str): Path of the result file.
        is_gt (bool): Treat the file as ground truth: class ids are reduced
            by 1 and the score is 1.
        is_ignore (bool): Accepted for signature compatibility; not used.

    Returns:
        dict: class id -> {frame id -> list of (tlwh, target_id, score)}.
            Lines with fewer than 8 fields or a frame id below 1 are skipped.
    """
    results_dict = dict()
    if not os.path.isfile(filename):
        return results_dict

    with open(filename, 'r') as f:
        for line in f:
            linelist = line.strip().split(',')
            # Field 7 (class id) is read below, so 8 fields are required.
            if len(linelist) < 8:
                continue
            fid = int(linelist[0])
            if fid < 1:
                continue
            cid = int(linelist[7])
            if is_gt:
                score = 1
                # only for test use
                cid -= 1
            else:
                score = float(linelist[6])

            tlwh = tuple(map(float, linelist[2:6]))
            target_id = int(linelist[1])
            cls_result_dict = results_dict.setdefault(cid, dict())
            cls_result_dict.setdefault(fid, list()).append(
                (tlwh, target_id, score))
    return results_dict


def read_results(filename,
                 data_type,
                 is_gt=False,
                 is_ignore=False,
                 multi_class=False,
                 union=False):
    """
    Read a tracking result file with the reader matching the arguments.

    Raises:
        ValueError: If ``data_type`` is not 'mcmot' or 'lab', or if
            ``multi_class`` is False.
    """
    if data_type not in ['mcmot', 'lab']:
        raise ValueError('Unknown data type: {}'.format(data_type))
    if not multi_class:
        raise ValueError('multi_class: {}, MCMOT should have cls_id.'.
                         format(multi_class))

    if union:
        # The results are evaluated by union all the categories.
        # Track IDs between different categories cannot be duplicate.
        read_fun = read_mcmot_results_union
    else:
        # The results are evaluated separately by category.
        read_fun = read_mcmot_results

    return read_fun(filename, is_gt, is_ignore)


def unzip_objs(objs):
    """Split (tlwh, id, score) records into boxes [N, 4], ids and scores."""
    if len(objs) > 0:
        tlwhs, ids, scores = zip(*objs)
    else:
        tlwhs, ids, scores = [], [], []
    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
    return tlwhs, ids, scores


def unzip_objs_cls(objs):
    """Split (tlwh, id, cls, score) records into four NumPy arrays."""
    if len(objs) > 0:
        tlwhs, ids, cls, scores = zip(*objs)
    else:
        tlwhs, ids, cls, scores = [], [], [], []
    tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4)
    ids = np.array(ids)
    cls = np.array(cls)
    scores = np.array(scores)
    return tlwhs, ids, cls, scores
class MCMOTEvaluator(object):
    """
    Evaluate the tracking result of one sequence, one accumulator per class.

    Args:
        data_root (str): Data directory; the ground truth is read from
            ``<data_root>/../sequences/<seq_name>.txt``.
        seq_name (str): Name of the sequence.
        data_type (str): Must be 'mcmot'.
        num_classes (int): Number of classes evaluated by ``eval_file``.
    """

    def __init__(self, data_root, seq_name, data_type, num_classes):
        self.data_root = data_root
        self.seq_name = seq_name
        self.data_type = data_type
        self.num_classes = num_classes

        self.load_annotations()
        try:
            import motmetrics as mm
            mm.lap.default_solver = 'lap'
        except Exception as e:
            # keep the original failure attached for easier debugging
            raise RuntimeError(
                'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics'
            ) from e
        self.reset_accumulator()

        self.class_accs = []

    def load_annotations(self):
        """Resolve the ground-truth file path and warn when it is missing."""
        assert self.data_type == 'mcmot'
        self.gt_filename = os.path.join(self.data_root, '../', 'sequences',
                                        '{}.txt'.format(self.seq_name))
        if not os.path.exists(self.gt_filename):
            # The placeholder was never filled in before, so the log showed
            # a literal '{}' instead of the path.
            logger.warning(
                "gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF."
                .format(self.gt_filename))

    def reset_accumulator(self):
        """Start a fresh motmetrics accumulator."""
        self.acc = mm.MOTAccumulator(auto_id=True)

    def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False):
        """
        Feed one frame of tracked and ground-truth objects to the accumulator.

        Args:
            trk_objs (list): Tracked objects of the frame.
            gt_objs (list): Ground-truth objects of the frame.
            rtn_events (bool): Return the accumulator events when available.
            union (bool): Objects carry a class id; pairs of different
                classes get a NaN distance so they are never matched.

        Returns:
            The accumulator's ``mot_events`` or None.
        """
        if union:
            trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3]
            gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3]

            # get distance matrix
            iou_distance = mm.distances.iou_matrix(
                gt_tlwhs, trk_tlwhs, max_iou=0.5)

            # Set the distance between objects of different categories to nan
            # When the number of GT or Trk is 0, iou_distance dimension is (0,0)
            if len(gt_cls) != 0 and len(trk_cls) != 0:
                # broadcasting compares every gt class with every trk class
                same_cls = gt_cls.reshape(-1, 1) == trk_cls.reshape(1, -1)
                iou_distance = np.where(same_cls, iou_distance, np.nan)

        else:
            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
            gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]

            # get distance matrix
            iou_distance = mm.distances.iou_matrix(
                gt_tlwhs, trk_tlwhs, max_iou=0.5)

        self.acc.update(gt_ids, trk_ids, iou_distance)

        if rtn_events and iou_distance.size > 0 and hasattr(self.acc,
                                                            'mot_events'):
            events = self.acc.mot_events  # only supported by https://github.com/longcw/py-motmetrics
        else:
            events = None
        return events

    def eval_file(self, result_filename):
        """
        Evaluate a result file class by class.

        Returns:
            list: ``self.class_accs`` with one accumulator appended per class.
        """
        # evaluation of each category
        gt_frame_dict = read_results(
            self.gt_filename,
            self.data_type,
            is_gt=True,
            multi_class=True,
            union=False)
        result_frame_dict = read_results(
            result_filename,
            self.data_type,
            is_gt=False,
            multi_class=True,
            union=False)

        for cid in range(self.num_classes):
            self.reset_accumulator()
            cls_result_frame_dict = result_frame_dict.get(cid, {})
            cls_gt_frame_dict = gt_frame_dict.get(cid, {})

            # only labeled frames will be evaluated
            for frame_id in sorted(cls_gt_frame_dict):
                trk_objs = cls_result_frame_dict.get(frame_id, [])
                gt_objs = cls_gt_frame_dict.get(frame_id, [])
                self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False)

            self.class_accs.append(self.acc)

        return self.class_accs

    @staticmethod
    def get_summary(accs,
                    names,
                    metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1',
                             'precision', 'recall')):
        """Compute a motmetrics summary (with an overall row) for ``accs``."""
        names = copy.deepcopy(names)
        if metrics is None:
            metrics = mm.metrics.motchallenge_metrics
        metrics = copy.deepcopy(metrics)

        mh = mm.metrics.create()
        summary = mh.compute_many(
            accs, metrics=metrics, names=names, generate_overall=True)

        return summary

    @staticmethod
    def save_summary(summary, filename):
        """Write ``summary`` to the Excel file ``filename``."""
        import pandas as pd
        # ExcelWriter.save() no longer exists in pandas 2.0; the context
        # manager writes and closes the file on every pandas version.
        with pd.ExcelWriter(filename) as writer:
            summary.to_excel(writer)
class MCMOTMetric(Metric):
    """
    Multi-class MOT metric collected sequence by sequence.

    ``seqs_overall`` maps a summary row position (one per class, followed
    by the sequence's OVERALL row) to the list of that row taken from every
    sequence passed to ``update``.
    """

    def __init__(self, num_classes, save_summary=False):
        self.num_classes = num_classes
        self.save_summary = save_summary
        self.MCMOTEvaluator = MCMOTEvaluator
        self.result_root = None
        self.reset()

        self.seqs_overall = defaultdict(list)

    def reset(self):
        """Forget the accumulators and sequence names collected so far."""
        self.accs = []
        self.seqs = []

    def update(self, data_root, seq, data_type, result_root, result_filename):
        """Evaluate one sequence and store its per-class summary rows."""
        evaluator = self.MCMOTEvaluator(data_root, seq, data_type,
                                        self.num_classes)
        seq_acc = evaluator.eval_file(result_filename)
        self.accs.append(seq_acc)
        self.seqs.append(seq)
        self.result_root = result_root

        cls_index_name = []
        for cls_id in range(self.num_classes):
            cls_index_name.append('{}_{}'.format(seq, cls_id))
        summary = parse_accs_metrics(seq_acc, cls_index_name)
        summary.rename(
            index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True)
        # keep every row as a one-row frame so rows can be concatenated later
        for position in range(len(summary)):
            one_row = summary.iloc[position:position + 1]
            self.seqs_overall[position].append(one_row)

    def accumulate(self):
        """Build one overall summary row per class across all sequences."""
        self.cls_summary_list = []
        for cls_id in range(self.num_classes):
            per_seq_rows = pd.concat(self.seqs_overall[cls_id])
            with_overall = seqs_overall_metrics(per_seq_rows)
            overall_row = with_overall.iloc[-1:].copy()
            new_name = 'overall_calc_{}'.format(cls_id)
            overall_row.rename(index={'overall_calc': new_name}, inplace=True)
            self.cls_summary_list.append(overall_row)

    def log(self):
        """Print the per-sequence and the per-class overall summaries."""
        # position num_classes holds the OVERALL row of every sequence
        overall_rows = pd.concat(self.seqs_overall[self.num_classes])
        seqs_overall_metrics(overall_rows, verbose=True)
        class_rows = pd.concat(self.cls_summary_list)
        seqs_overall_metrics(class_rows, verbose=True)

    def get_results(self):
        """Always return 1."""
        return 1
# Keypoint sigmas passed to cocoapi_eval for the 'keypoints' style.
COCO_SIGMAS = np.array([
    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87,
    .89, .89
]) / 10.0
# Keypoint sigmas passed to cocoapi_eval for the 'keypoints_crowd' style.
CROWD_SIGMAS = np.array(
    [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79,
     .79]) / 10.0


class Metric(paddle.metric.Metric):
    """Base class of the ppdet metrics; every hook defaults to a no-op."""

    def name(self):
        return self.__class__.__name__

    def reset(self):
        pass

    def accumulate(self):
        pass

    # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate`
    # :metch:`reset`, in ppdet, we also need following 2 methods:

    # abstract method for logging metric results
    def log(self):
        pass

    # abstract method for getting metric results
    def get_results(self):
        pass


class COCOMetric(Metric):
    """
    Collect predictions, dump them to json and evaluate them with cocoapi.

    Args:
        anno_file (str): COCO annotation file.
        **kwargs: Optional settings read in ``__init__``: 'clsid2catid',
            'classwise', 'output_eval', 'bias', 'save_prediction_only'
            and 'IouType'.
    """

    def __init__(self, anno_file, **kwargs):
        self.anno_file = anno_file
        self.clsid2catid = kwargs.get('clsid2catid', None)
        if self.clsid2catid is None:
            self.clsid2catid, _ = get_categories('COCO', anno_file)
        self.classwise = kwargs.get('classwise', False)
        self.output_eval = kwargs.get('output_eval', None)
        # TODO: bias should be unified
        self.bias = kwargs.get('bias', 0)
        self.save_prediction_only = kwargs.get('save_prediction_only', False)
        self.iou_type = kwargs.get('IouType', 'bbox')

        if not self.save_prediction_only:
            assert os.path.isfile(anno_file), \
                    "anno_file {} not a file".format(anno_file)

        if self.output_eval is not None:
            # parents=True so a nested output directory can be created too
            Path(self.output_eval).mkdir(parents=True, exist_ok=True)

        self.reset()

    def reset(self):
        """Drop all collected predictions and evaluation results."""
        # only bbox and mask evaluation support currently
        self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []}
        self.eval_results = {}

    def update(self, inputs, outputs):
        """Convert one batch of outputs to result records and store them."""
        outs = {}
        # outputs Tensor -> numpy.ndarray
        for k, v in outputs.items():
            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v

        # multi-scale inputs: all inputs have same im_id
        if isinstance(inputs, typing.Sequence):
            im_id = inputs[0]['im_id']
        else:
            im_id = inputs['im_id']
        outs['im_id'] = im_id.numpy() if isinstance(im_id,
                                                    paddle.Tensor) else im_id

        infer_results = get_infer_results(
            outs, self.clsid2catid, bias=self.bias)
        for key in ('bbox', 'mask', 'segm', 'keypoint'):
            if key in infer_results:
                self.results[key] += infer_results[key]

    def _dump_results(self, key):
        """Write ``self.results[key]`` to '<key>.json'; return the path."""
        output = '{}.json'.format(key)
        if self.output_eval:
            output = os.path.join(self.output_eval, output)
        with open(output, 'w') as f:
            json.dump(self.results[key], f)
        # report the real path, which differs from '<key>.json' whenever
        # output_eval is set
        logger.info('The {} result is saved to {}.'.format(key, output))
        return output

    def accumulate(self):
        """Dump every non-empty result list and evaluate it with cocoapi."""
        # (result list, cocoapi style, key in eval_results). 'segm' results
        # are stored under 'mask', exactly as the separate branches did.
        box_like = (('bbox', 'bbox', 'bbox'), ('mask', 'segm', 'mask'),
                    ('segm', 'segm', 'mask'))
        for key, style, eval_key in box_like:
            if len(self.results[key]) == 0:
                continue
            output = self._dump_results(key)
            if self.save_prediction_only:
                logger.info('The {} result is saved to {} and do not '
                            'evaluate the mAP.'.format(key, output))
            else:
                stats = cocoapi_eval(
                    output,
                    style,
                    anno_file=self.anno_file,
                    classwise=self.classwise)
                self.eval_results[eval_key] = stats
                sys.stdout.flush()

        if len(self.results['keypoint']) > 0:
            output = self._dump_results('keypoint')
            if self.save_prediction_only:
                logger.info('The keypoint result is saved to {} and do not '
                            'evaluate the mAP.'.format(output))
            else:
                style = 'keypoints'
                use_area = True
                sigmas = COCO_SIGMAS
                if self.iou_type == 'keypoints_crowd':
                    style = 'keypoints_crowd'
                    use_area = False
                    sigmas = CROWD_SIGMAS
                keypoint_stats = cocoapi_eval(
                    output,
                    style,
                    anno_file=self.anno_file,
                    classwise=self.classwise,
                    sigmas=sigmas,
                    use_area=use_area)
                self.eval_results['keypoint'] = keypoint_stats
                sys.stdout.flush()

    def log(self):
        pass

    def get_results(self):
        """Return the dict of evaluation results filled by ``accumulate``."""
        return self.eval_results
save_prediction_only=False): + assert os.path.isfile(label_list), \ + "label_list {} not a file".format(label_list) + self.clsid2catid, self.catid2name = get_categories('VOC', label_list) + + self.overlap_thresh = overlap_thresh + self.map_type = map_type + self.evaluate_difficult = evaluate_difficult + self.output_eval = output_eval + self.save_prediction_only = save_prediction_only + self.detection_map = DetectionMAP( + class_num=class_num, + overlap_thresh=overlap_thresh, + map_type=map_type, + is_bbox_normalized=is_bbox_normalized, + evaluate_difficult=evaluate_difficult, + catid2name=self.catid2name, + classwise=classwise) + + self.reset() + + def reset(self): + self.results = {'bbox': [], 'score': [], 'label': []} + self.detection_map.reset() + + def update(self, inputs, outputs): + bbox_np = outputs['bbox'].numpy() if isinstance( + outputs['bbox'], paddle.Tensor) else outputs['bbox'] + bboxes = bbox_np[:, 2:] + scores = bbox_np[:, 1] + labels = bbox_np[:, 0] + bbox_lengths = outputs['bbox_num'].numpy() if isinstance( + outputs['bbox_num'], paddle.Tensor) else outputs['bbox_num'] + + self.results['bbox'].append(bboxes.tolist()) + self.results['score'].append(scores.tolist()) + self.results['label'].append(labels.tolist()) + + if bboxes.shape == (1, 1) or bboxes is None: + return + if self.save_prediction_only: + return + + gt_boxes = inputs['gt_bbox'] + gt_labels = inputs['gt_class'] + difficults = inputs['difficult'] if not self.evaluate_difficult \ + else None + + if 'scale_factor' in inputs: + scale_factor = inputs['scale_factor'].numpy() if isinstance( + inputs['scale_factor'], + paddle.Tensor) else inputs['scale_factor'] + else: + scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') + + bbox_idx = 0 + for i in range(len(gt_boxes)): + gt_box = gt_boxes[i].numpy() if isinstance( + gt_boxes[i], paddle.Tensor) else gt_boxes[i] + h, w = scale_factor[i] + gt_box = gt_box / np.array([w, h, w, h]) + gt_label = gt_labels[i].numpy() if isinstance( + 
gt_labels[i], paddle.Tensor) else gt_labels[i] + if difficults is not None: + difficult = difficults[i].numpy() if isinstance( + difficults[i], paddle.Tensor) else difficults[i] + else: + difficult = None + bbox_num = bbox_lengths[i] + bbox = bboxes[bbox_idx:bbox_idx + bbox_num] + score = scores[bbox_idx:bbox_idx + bbox_num] + label = labels[bbox_idx:bbox_idx + bbox_num] + gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, + difficult) + self.detection_map.update(bbox, score, label, gt_box, gt_label, + difficult) + bbox_idx += bbox_num + + def accumulate(self): + output = "bbox.json" + if self.output_eval: + output = os.path.join(self.output_eval, output) + with open(output, 'w') as f: + json.dump(self.results, f) + logger.info('The bbox result is saved to bbox.json.') + if self.save_prediction_only: + return + + logger.info("Accumulating evaluatation results...") + self.detection_map.accumulate() + + def log(self): + map_stat = 100. * self.detection_map.get_map() + logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, + self.map_type, map_stat)) + + def get_results(self): + return {'bbox': [self.detection_map.get_map()]} + + +class WiderFaceMetric(Metric): + def __init__(self, image_dir, anno_file, multi_scale=True): + self.image_dir = image_dir + self.anno_file = anno_file + self.multi_scale = multi_scale + self.clsid2catid, self.catid2name = get_categories('widerface') + + def update(self, model): + + face_eval_run( + model, + self.image_dir, + self.anno_file, + pred_dir='output/pred', + eval_mode='widerface', + multi_scale=self.multi_scale) + + +class RBoxMetric(Metric): + def __init__(self, anno_file, **kwargs): + self.anno_file = anno_file + self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file) + self.catid2clsid = {v: k for k, v in self.clsid2catid.items()} + self.classwise = kwargs.get('classwise', False) + self.output_eval = kwargs.get('output_eval', None) + self.save_prediction_only = 
kwargs.get('save_prediction_only', False) + self.overlap_thresh = kwargs.get('overlap_thresh', 0.5) + self.map_type = kwargs.get('map_type', '11point') + self.evaluate_difficult = kwargs.get('evaluate_difficult', False) + self.imid2path = kwargs.get('imid2path', None) + class_num = len(self.catid2name) + self.detection_map = DetectionMAP( + class_num=class_num, + overlap_thresh=self.overlap_thresh, + map_type=self.map_type, + is_bbox_normalized=False, + evaluate_difficult=self.evaluate_difficult, + catid2name=self.catid2name, + classwise=self.classwise) + + self.reset() + + def reset(self): + self.results = [] + self.detection_map.reset() + + def update(self, inputs, outputs): + outs = {} + # outputs Tensor -> numpy.ndarray + for k, v in outputs.items(): + outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v + + im_id = inputs['im_id'] + im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id + outs['im_id'] = im_id + + infer_results = get_infer_results(outs, self.clsid2catid) + infer_results = infer_results['bbox'] if 'bbox' in infer_results else [] + self.results += infer_results + if self.save_prediction_only: + return + + gt_boxes = inputs['gt_poly'] + gt_labels = inputs['gt_class'] + + if 'scale_factor' in inputs: + scale_factor = inputs['scale_factor'].numpy() if isinstance( + inputs['scale_factor'], + paddle.Tensor) else inputs['scale_factor'] + else: + scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') + + for i in range(len(gt_boxes)): + gt_box = gt_boxes[i].numpy() if isinstance( + gt_boxes[i], paddle.Tensor) else gt_boxes[i] + h, w = scale_factor[i] + gt_box = gt_box / np.array([w, h, w, h, w, h, w, h]) + gt_label = gt_labels[i].numpy() if isinstance( + gt_labels[i], paddle.Tensor) else gt_labels[i] + gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label) + bbox = [ + res['bbox'] for res in infer_results + if int(res['image_id']) == int(im_id[i]) + ] + score = [ + res['score'] for res in infer_results + if 
int(res['image_id']) == int(im_id[i]) + ] + label = [ + self.catid2clsid[int(res['category_id'])] + for res in infer_results + if int(res['image_id']) == int(im_id[i]) + ] + self.detection_map.update(bbox, score, label, gt_box, gt_label) + + def save_results(self, results, output_dir, imid2path): + if imid2path: + data_dicts = defaultdict(list) + for result in results: + image_id = result['image_id'] + data_dicts[image_id].append(result) + + for image_id, image_path in imid2path.items(): + basename = os.path.splitext(os.path.split(image_path)[-1])[0] + output = os.path.join(output_dir, "{}.txt".format(basename)) + dets = data_dicts.get(image_id, []) + with open(output, 'w') as f: + for det in dets: + catid, bbox, score = det['category_id'], det[ + 'bbox'], det['score'] + bbox_pred = '{} {} '.format(self.catid2name[catid], + score) + ' '.join( + [str(e) for e in bbox]) + f.write(bbox_pred + '\n') + + logger.info('The bbox result is saved to {}.'.format(output_dir)) + else: + output = os.path.join(output_dir, "bbox.json") + with open(output, 'w') as f: + json.dump(results, f) + + logger.info('The bbox result is saved to {}.'.format(output)) + + def accumulate(self): + if self.output_eval: + self.save_results(self.results, self.output_eval, self.imid2path) + + if not self.save_prediction_only: + logger.info("Accumulating evaluatation results...") + self.detection_map.accumulate() + + def log(self): + map_stat = 100. 
* self.detection_map.get_map() + logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, + self.map_type, map_stat)) + + def get_results(self): + return {'bbox': [self.detection_map.get_map()]} + + +class SNIPERCOCOMetric(COCOMetric): + def __init__(self, anno_file, **kwargs): + super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs) + self.dataset = kwargs["dataset"] + self.chip_results = [] + + def reset(self): + # only bbox and mask evaluation support currently + self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} + self.eval_results = {} + self.chip_results = [] + + def update(self, inputs, outputs): + outs = {} + # outputs Tensor -> numpy.ndarray + for k, v in outputs.items(): + outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v + + im_id = inputs['im_id'] + outs['im_id'] = im_id.numpy() if isinstance(im_id, + paddle.Tensor) else im_id + + self.chip_results.append(outs) + + def accumulate(self): + results = self.dataset.anno_cropper.aggregate_chips_detections( + self.chip_results) + for outs in results: + infer_results = get_infer_results( + outs, self.clsid2catid, bias=self.bias) + self.results['bbox'] += infer_results[ + 'bbox'] if 'bbox' in infer_results else [] + + super(SNIPERCOCOMetric, self).accumulate() diff --git a/rtdetr_paddle/ppdet/metrics/mot_metrics.py b/rtdetr_paddle/ppdet/metrics/mot_metrics.py new file mode 100644 index 0000000..b5ed8a2 --- /dev/null +++ b/rtdetr_paddle/ppdet/metrics/mot_metrics.py @@ -0,0 +1,1246 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import copy +import sys +import math +from collections import defaultdict +import numpy as np + +from ppdet.modeling.bbox_utils import bbox_iou_np_expand +from .map_utils import ap_per_class +from .metrics import Metric +from .munkres import Munkres + +try: + import motmetrics as mm + mm.lap.default_solver = 'lap' +except: + print( + 'Warning: Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' + ) + pass + +from ppdet.utils.logger import setup_logger +logger = setup_logger(__name__) + +__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric'] + + +def read_mot_results(filename, is_gt=False, is_ignore=False): + valid_label = [1] + ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16' + if is_gt: + logger.info( + "In MOT16/17 dataset the valid_label of ground truth is '{}', " + "in other dataset it should be '0' for single classs MOT.".format( + valid_label[0])) + results_dict = dict() + if os.path.isfile(filename): + with open(filename, 'r') as f: + for line in f.readlines(): + linelist = line.split(',') + if len(linelist) < 7: + continue + fid = int(linelist[0]) + if fid < 1: + continue + results_dict.setdefault(fid, list()) + + if is_gt: + label = int(float(linelist[7])) + mark = int(float(linelist[6])) + if mark == 0 or label not in valid_label: + continue + score = 1 + elif is_ignore: + if 'MOT16-' in 
filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename: + label = int(float(linelist[7])) + vis_ratio = float(linelist[8]) + if label not in ignore_labels and vis_ratio >= 0: + continue + else: + continue + score = 1 + else: + score = float(linelist[6]) + + tlwh = tuple(map(float, linelist[2:6])) + target_id = int(linelist[1]) + + results_dict[fid].append((tlwh, target_id, score)) + return results_dict + + +""" +MOT dataset label list, see in https://motchallenge.net +labels={'ped', ... % 1 + 'person_on_vhcl', ... % 2 + 'car', ... % 3 + 'bicycle', ... % 4 + 'mbike', ... % 5 + 'non_mot_vhcl', ... % 6 + 'static_person', ... % 7 + 'distractor', ... % 8 + 'occluder', ... % 9 + 'occluder_on_grnd', ... % 10 + 'occluder_full', ... % 11 + 'reflection', ... % 12 + 'crowd' ... % 13 +}; +""" + + +def unzip_objs(objs): + if len(objs) > 0: + tlwhs, ids, scores = zip(*objs) + else: + tlwhs, ids, scores = [], [], [] + tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) + return tlwhs, ids, scores + + +class MOTEvaluator(object): + def __init__(self, data_root, seq_name, data_type): + self.data_root = data_root + self.seq_name = seq_name + self.data_type = data_type + + self.load_annotations() + try: + import motmetrics as mm + mm.lap.default_solver = 'lap' + except Exception as e: + raise RuntimeError( + 'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' + ) + self.reset_accumulator() + + def load_annotations(self): + assert self.data_type == 'mot' + gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', + 'gt.txt') + if not os.path.exists(gt_filename): + logger.warning( + "gt_filename '{}' of MOTEvaluator is not exist, so the MOTA will be -INF." 
+ ) + self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True) + self.gt_ignore_frame_dict = read_mot_results( + gt_filename, is_ignore=True) + + def reset_accumulator(self): + self.acc = mm.MOTAccumulator(auto_id=True) + + def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): + # results + trk_tlwhs = np.copy(trk_tlwhs) + trk_ids = np.copy(trk_ids) + + # gts + gt_objs = self.gt_frame_dict.get(frame_id, []) + gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] + + # ignore boxes + ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) + ignore_tlwhs = unzip_objs(ignore_objs)[0] + + # remove ignored results + keep = np.ones(len(trk_tlwhs), dtype=bool) + iou_distance = mm.distances.iou_matrix( + ignore_tlwhs, trk_tlwhs, max_iou=0.5) + if len(iou_distance) > 0: + match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + match_ious = iou_distance[match_is, match_js] + + match_js = np.asarray(match_js, dtype=int) + match_js = match_js[np.logical_not(np.isnan(match_ious))] + keep[match_js] = False + trk_tlwhs = trk_tlwhs[keep] + trk_ids = trk_ids[keep] + + # get distance matrix + iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) + + # acc + self.acc.update(gt_ids, trk_ids, iou_distance) + + if rtn_events and iou_distance.size > 0 and hasattr(self.acc, + 'last_mot_events'): + events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics + else: + events = None + return events + + def eval_file(self, filename): + self.reset_accumulator() + + result_frame_dict = read_mot_results(filename, is_gt=False) + frames = sorted(list(set(result_frame_dict.keys()))) + for frame_id in frames: + trk_objs = result_frame_dict.get(frame_id, []) + trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] + self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) + + return self.acc + + @staticmethod + def get_summary(accs, + names, + 
metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', + 'precision', 'recall')): + names = copy.deepcopy(names) + if metrics is None: + metrics = mm.metrics.motchallenge_metrics + metrics = copy.deepcopy(metrics) + + mh = mm.metrics.create() + summary = mh.compute_many( + accs, metrics=metrics, names=names, generate_overall=True) + return summary + + @staticmethod + def save_summary(summary, filename): + import pandas as pd + writer = pd.ExcelWriter(filename) + summary.to_excel(writer) + writer.save() + + +class MOTMetric(Metric): + def __init__(self, save_summary=False): + self.save_summary = save_summary + self.MOTEvaluator = MOTEvaluator + self.result_root = None + self.reset() + + def reset(self): + self.accs = [] + self.seqs = [] + + def update(self, data_root, seq, data_type, result_root, result_filename): + evaluator = self.MOTEvaluator(data_root, seq, data_type) + self.accs.append(evaluator.eval_file(result_filename)) + self.seqs.append(seq) + self.result_root = result_root + + def accumulate(self): + metrics = mm.metrics.motchallenge_metrics + mh = mm.metrics.create() + summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics) + self.strsummary = mm.io.render_summary( + summary, + formatters=mh.formatters, + namemap=mm.io.motchallenge_metric_names) + if self.save_summary: + self.MOTEvaluator.save_summary( + summary, os.path.join(self.result_root, 'summary.xlsx')) + + def log(self): + print(self.strsummary) + + def get_results(self): + return self.strsummary + + +class JDEDetMetric(Metric): + # Note this detection AP metric is different from COCOMetric or VOCMetric, + # and the bboxes coordinates are not scaled to the original image + def __init__(self, overlap_thresh=0.5): + self.overlap_thresh = overlap_thresh + self.reset() + + def reset(self): + self.AP_accum = np.zeros(1) + self.AP_accum_count = np.zeros(1) + + def update(self, inputs, outputs): + bboxes = outputs['bbox'][:, 2:].numpy() + scores = outputs['bbox'][:, 1].numpy() + labels = 
outputs['bbox'][:, 0].numpy() + bbox_lengths = outputs['bbox_num'].numpy() + if bboxes.shape[0] == 1 and bboxes.sum() == 0.0: + return + + gt_boxes = inputs['gt_bbox'].numpy()[0] + gt_labels = inputs['gt_class'].numpy()[0] + if gt_labels.shape[0] == 0: + return + + correct = [] + detected = [] + for i in range(bboxes.shape[0]): + obj_pred = 0 + pred_bbox = bboxes[i].reshape(1, 4) + # Compute iou with target boxes + iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0] + # Extract index of largest overlap + best_i = np.argmax(iou) + # If overlap exceeds threshold and classification is correct mark as correct + if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[ + best_i] and best_i not in detected: + correct.append(1) + detected.append(best_i) + else: + correct.append(0) + + # Compute Average Precision (AP) per class + target_cls = list(gt_labels.T[0]) + AP, AP_class, R, P = ap_per_class( + tp=correct, + conf=scores, + pred_cls=np.zeros_like(scores), + target_cls=target_cls) + self.AP_accum_count += np.bincount(AP_class, minlength=1) + self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP) + + def accumulate(self): + logger.info("Accumulating evaluatation results...") + self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16) + + def log(self): + map_stat = 100. * self.map_stat + logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh, + map_stat)) + + def get_results(self): + return self.map_stat + + +""" +Following code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py +""" + + +class tData: + """ + Utility class to load data. + """ + def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\ + obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\ + X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1): + """ + Constructor, initializes the object given the parameters. 
+ """ + self.frame = frame + self.track_id = track_id + self.obj_type = obj_type + self.truncation = truncation + self.occlusion = occlusion + self.obs_angle = obs_angle + self.x1 = x1 + self.y1 = y1 + self.x2 = x2 + self.y2 = y2 + self.w = w + self.h = h + self.l = l + self.X = X + self.Y = Y + self.Z = Z + self.yaw = yaw + self.score = score + self.ignored = False + self.valid = False + self.tracker = -1 + + def __str__(self): + attrs = vars(self) + return '\n'.join("%s: %s" % item for item in attrs.items()) + + +class KITTIEvaluation(object): + """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall) + MOTA - Multi-object tracking accuracy in [0,100] + MOTP - Multi-object tracking precision in [0,100] (3D) / [td,100] (2D) + MOTAL - Multi-object tracking accuracy in [0,100] with log10(id-switches) + + id-switches - number of id switches + fragments - number of fragmentations + + MT, PT, ML - number of mostly tracked, partially tracked and mostly lost trajectories + + recall - recall = percentage of detected targets + precision - precision = percentage of correctly detected targets + FAR - number of false alarms per frame + falsepositives - number of false positives (FP) + missed - number of missed targets (FN) + """ + def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\ + min_height = 25, max_occlusion = 2, cls="car",\ + n_frames=[], seqs=[], n_sequences=0): + # get number of sequences and + # get number of frames per sequence from test mapping + # (created while extracting the benchmark) + self.gt_path = os.path.join(gt_path, "../labels") + self.n_frames = n_frames + self.sequence_name = seqs + self.n_sequences = n_sequences + + self.cls = cls # class to evaluate, i.e. 
pedestrian or car + + self.result_path = result_path + + # statistics and numbers for evaluation + self.n_gt = 0 # number of ground truth detections minus ignored false negatives and true positives + self.n_igt = 0 # number of ignored ground truth detections + self.n_gts = [ + ] # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE + self.n_igts = [ + ] # number of ground ignored truth detections PER SEQUENCE + self.n_gt_trajectories = 0 + self.n_gt_seq = [] + self.n_tr = 0 # number of tracker detections minus ignored tracker detections + self.n_trs = [ + ] # number of tracker detections minus ignored tracker detections PER SEQUENCE + self.n_itr = 0 # number of ignored tracker detections + self.n_itrs = [] # number of ignored tracker detections PER SEQUENCE + self.n_igttr = 0 # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored + self.n_tr_trajectories = 0 + self.n_tr_seq = [] + self.MOTA = 0 + self.MOTP = 0 + self.MOTAL = 0 + self.MODA = 0 + self.MODP = 0 + self.MODP_t = [] + self.recall = 0 + self.precision = 0 + self.F1 = 0 + self.FAR = 0 + self.total_cost = 0 + self.itp = 0 # number of ignored true positives + self.itps = [] # number of ignored true positives PER SEQUENCE + self.tp = 0 # number of true positives including ignored true positives! 
+ self.tps = [ + ] # number of true positives including ignored true positives PER SEQUENCE + self.fn = 0 # number of false negatives WITHOUT ignored false negatives + self.fns = [ + ] # number of false negatives WITHOUT ignored false negatives PER SEQUENCE + self.ifn = 0 # number of ignored false negatives + self.ifns = [] # number of ignored false negatives PER SEQUENCE + self.fp = 0 # number of false positives + # a bit tricky, the number of ignored false negatives and ignored true positives + # is subtracted, but if both tracker detection and ground truth detection + # are ignored this number is added again to avoid double counting + self.fps = [] # above PER SEQUENCE + self.mme = 0 + self.fragments = 0 + self.id_switches = 0 + self.MT = 0 + self.PT = 0 + self.ML = 0 + + self.min_overlap = min_overlap # minimum bounding box overlap for 3rd party metrics + self.max_truncation = max_truncation # maximum truncation of an object for evaluation + self.max_occlusion = max_occlusion # maximum occlusion of an object for evaluation + self.min_height = min_height # minimum height of an object for evaluation + self.n_sample_points = 500 + + # this should be enough to hold all groundtruth trajectories + # is expanded if necessary and reduced in any case + self.gt_trajectories = [[] for x in range(self.n_sequences)] + self.ign_trajectories = [[] for x in range(self.n_sequences)] + + def loadGroundtruth(self): + try: + self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True) + except IOError: + return False + return True + + def loadTracker(self): + try: + if not self._loadData( + self.result_path, cls=self.cls, loading_groundtruth=False): + return False + except IOError: + return False + return True + + def _loadData(self, + root_dir, + cls, + min_score=-1000, + loading_groundtruth=False): + """ + Generic loader for ground truth and tracking data. + Use loadGroundtruth() or loadTracker() to load this data. + Loads detections in KITTI format from textfiles. 
+ """ + # construct objectDetections object to hold detection data + t_data = tData() + data = [] + eval_2d = True + eval_3d = True + + seq_data = [] + n_trajectories = 0 + n_trajectories_seq = [] + for seq, s_name in enumerate(self.sequence_name): + i = 0 + filename = os.path.join(root_dir, "%s.txt" % s_name) + f = open(filename, "r") + + f_data = [ + [] for x in range(self.n_frames[seq]) + ] # current set has only 1059 entries, sufficient length is checked anyway + ids = [] + n_in_seq = 0 + id_frame_cache = [] + for line in f: + # KITTI tracking benchmark data format: + # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry) + line = line.strip() + fields = line.split(" ") + # classes that should be loaded (ignored neighboring classes) + if "car" in cls.lower(): + classes = ["car", "van"] + elif "pedestrian" in cls.lower(): + classes = ["pedestrian", "person_sitting"] + else: + classes = [cls.lower()] + classes += ["dontcare"] + if not any([s for s in classes if s in fields[2].lower()]): + continue + # get fields from table + t_data.frame = int(float(fields[0])) # frame + t_data.track_id = int(float(fields[1])) # id + t_data.obj_type = fields[ + 2].lower() # object type [car, pedestrian, cyclist, ...] 
+ t_data.truncation = int( + float(fields[3])) # truncation [-1,0,1,2] + t_data.occlusion = int( + float(fields[4])) # occlusion [-1,0,1,2] + t_data.obs_angle = float(fields[5]) # observation angle [rad] + t_data.x1 = float(fields[6]) # left [px] + t_data.y1 = float(fields[7]) # top [px] + t_data.x2 = float(fields[8]) # right [px] + t_data.y2 = float(fields[9]) # bottom [px] + t_data.h = float(fields[10]) # height [m] + t_data.w = float(fields[11]) # width [m] + t_data.l = float(fields[12]) # length [m] + t_data.X = float(fields[13]) # X [m] + t_data.Y = float(fields[14]) # Y [m] + t_data.Z = float(fields[15]) # Z [m] + t_data.yaw = float(fields[16]) # yaw angle [rad] + if not loading_groundtruth: + if len(fields) == 17: + t_data.score = -1 + elif len(fields) == 18: + t_data.score = float(fields[17]) # detection score + else: + logger.info("file is not in KITTI format") + return + + # do not consider objects marked as invalid + if t_data.track_id is -1 and t_data.obj_type != "dontcare": + continue + + idx = t_data.frame + # check if length for frame data is sufficient + if idx >= len(f_data): + print("extend f_data", idx, len(f_data)) + f_data += [[] for x in range(max(500, idx - len(f_data)))] + try: + id_frame = (t_data.frame, t_data.track_id) + if id_frame in id_frame_cache and not loading_groundtruth: + logger.info( + "track ids are not unique for sequence %d: frame %d" + % (seq, t_data.frame)) + logger.info( + "track id %d occurred at least twice for this frame" + % t_data.track_id) + logger.info("Exiting...") + #continue # this allows to evaluate non-unique result files + return False + id_frame_cache.append(id_frame) + f_data[t_data.frame].append(copy.copy(t_data)) + except: + print(len(f_data), idx) + raise + + if t_data.track_id not in ids and t_data.obj_type != "dontcare": + ids.append(t_data.track_id) + n_trajectories += 1 + n_in_seq += 1 + + # check if uploaded data provides information for 2D and 3D evaluation + if not loading_groundtruth and eval_2d 
is True and ( + t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or + t_data.y2 == -1): + eval_2d = False + if not loading_groundtruth and eval_3d is True and ( + t_data.X == -1000 or t_data.Y == -1000 or + t_data.Z == -1000): + eval_3d = False + + # only add existing frames + n_trajectories_seq.append(n_in_seq) + seq_data.append(f_data) + f.close() + + if not loading_groundtruth: + self.tracker = seq_data + self.n_tr_trajectories = n_trajectories + self.eval_2d = eval_2d + self.eval_3d = eval_3d + self.n_tr_seq = n_trajectories_seq + if self.n_tr_trajectories == 0: + return False + else: + # split ground truth and DontCare areas + self.dcareas = [] + self.groundtruth = [] + for seq_idx in range(len(seq_data)): + seq_gt = seq_data[seq_idx] + s_g, s_dc = [], [] + for f in range(len(seq_gt)): + all_gt = seq_gt[f] + g, dc = [], [] + for gg in all_gt: + if gg.obj_type == "dontcare": + dc.append(gg) + else: + g.append(gg) + s_g.append(g) + s_dc.append(dc) + self.dcareas.append(s_dc) + self.groundtruth.append(s_g) + self.n_gt_seq = n_trajectories_seq + self.n_gt_trajectories = n_trajectories + return True + + def boxoverlap(self, a, b, criterion="union"): + """ + boxoverlap computes intersection over union for bbox a and b in KITTI format. + If the criterion is 'union', overlap = (a inter b) / a union b). + If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area. + """ + x1 = max(a.x1, b.x1) + y1 = max(a.y1, b.y1) + x2 = min(a.x2, b.x2) + y2 = min(a.y2, b.y2) + + w = x2 - x1 + h = y2 - y1 + + if w <= 0. or h <= 0.: + return 0. 
def compute3rdPartyMetrics(self):
    """
    Associate ground truth with tracker output and compute tracking metrics.

    Computes the metrics defined in
        - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance:
          The CLEAR MOT Metrics (MOTA, MOTAL, MOTP)
        - Nevatia 2008: Global Data Association for Multi-Object Tracking
          Using Network Flows (MT/PT/ML)

    For every frame of every sequence the ground truth boxes are matched to
    the tracker boxes with the Hungarian method, using ``1 - boxoverlap`` as
    cost. Per-frame TP/FP/FN counts (corrected for ignored objects) are
    accumulated on ``self``; afterwards the trajectory statistics
    (MT/PT/ML, id switches, fragments) and the aggregate scores
    (recall, precision, F1, FAR, MOTA, MODA, MOTP, MOTAL, MODP) are stored
    on ``self`` as well.

    Returns:
        bool: always ``True``.

    Raises:
        NameError: if one of the per-frame sanity checks on the TP/FP/FN
            bookkeeping fails.
    """
    # construct Munkres object for Hungarian Method association
    hm = Munkres()
    max_cost = 1e9

    # go through all frames and associate ground truth and tracker results
    # groundtruth and tracker contain lists for every single frame
    # containing lists of KITTI format detections
    fr, ids = 0, 0
    for seq_idx in range(len(self.groundtruth)):
        seq_gt = self.groundtruth[seq_idx]
        seq_dc = self.dcareas[seq_idx]  # don't care areas
        seq_tracker = self.tracker[seq_idx]
        seq_trajectories = defaultdict(list)
        seq_ignored = defaultdict(list)

        # statistics over the current sequence, check the corresponding
        # variable comments in __init__ to get their meaning
        seqtp = 0
        seqitp = 0
        seqfn = 0
        seqifn = 0
        seqfp = 0
        seqigt = 0
        seqitr = 0

        last_ids = [[], []]
        n_gts = 0
        n_trs = 0

        for f in range(len(seq_gt)):
            g = seq_gt[f]
            dc = seq_dc[f]

            t = seq_tracker[f]
            # counting total number of ground truth and tracker objects
            self.n_gt += len(g)
            self.n_tr += len(t)

            n_gts += len(g)
            n_trs += len(t)

            # use hungarian method to associate, using boxoverlap 0..1 as cost
            # build cost matrix
            cost_matrix = []
            this_ids = [[], []]
            for gg in g:
                # save current ids
                this_ids[0].append(gg.track_id)
                this_ids[1].append(-1)
                gg.tracker = -1
                gg.id_switch = 0
                gg.fragmentation = 0
                cost_row = []
                for tt in t:
                    # overlap == 1 is cost == 0
                    c = 1 - self.boxoverlap(gg, tt)
                    # gating for boxoverlap
                    if c <= self.min_overlap:
                        cost_row.append(c)
                    else:
                        cost_row.append(max_cost)  # = 1e9
                cost_matrix.append(cost_row)
                # all ground truth trajectories are initially not associated
                # extend groundtruth trajectories lists (merge lists)
                seq_trajectories[gg.track_id].append(-1)
                seq_ignored[gg.track_id].append(False)

            # value comparison, not identity: ``is 0`` only works by accident
            # of CPython's small-int cache
            if len(g) == 0:
                cost_matrix = [[]]
            # associate
            association_matrix = hm.compute(cost_matrix)

            # tmp variables for sanity checks and MODP computation
            tmptp = 0
            tmpfp = 0
            tmpfn = 0
            tmpc = 0  # this will sum up the overlaps for all true positives
            # this will save the overlaps for all true positives;
            # the reason is that some true positives might be ignored
            # later such that the corresponding overlaps can
            # be subtracted from tmpc for MODP computation
            tmpcs = [0] * len(g)

            # mapping for tracker ids and ground truth ids
            for row, col in association_matrix:
                # apply gating on boxoverlap
                c = cost_matrix[row][col]
                if c < max_cost:
                    g[row].tracker = t[col].track_id
                    this_ids[1][row] = t[col].track_id
                    t[col].valid = True
                    g[row].distance = c
                    self.total_cost += 1 - c
                    tmpc += 1 - c
                    tmpcs[row] = 1 - c
                    seq_trajectories[g[row].track_id][-1] = t[col].track_id

                    # true positives are only valid associations
                    self.tp += 1
                    tmptp += 1
                else:
                    g[row].tracker = -1
                    self.fn += 1
                    tmpfn += 1

            # associate tracker and DontCare areas
            # ignore tracker in neighboring classes
            nignoredtracker = 0  # number of ignored tracker detections
            # will associate the track_id with -1 if it is not ignored and 1
            # if it is ignored; this is used to avoid double counting ignored
            # cases, see the next loop
            ignoredtrackers = dict()

            for tt in t:
                ignoredtrackers[tt.track_id] = -1
                # ignore detection if it belongs to a neighboring class or is
                # smaller or equal to the minimum height

                tt_height = abs(tt.y1 - tt.y2)
                if ((self.cls == "car" and tt.obj_type == "van") or
                    (self.cls == "pedestrian" and
                     tt.obj_type == "person_sitting") or
                        tt_height <= self.min_height) and not tt.valid:
                    nignoredtracker += 1
                    tt.ignored = True
                    ignoredtrackers[tt.track_id] = 1
                    continue
                for d in dc:
                    overlap = self.boxoverlap(tt, d, "a")
                    if overlap > 0.5 and not tt.valid:
                        tt.ignored = True
                        nignoredtracker += 1
                        ignoredtrackers[tt.track_id] = 1
                        break

            # check for ignored FN/TP (truncation or neighboring object class)
            ignoredfn = 0  # the number of ignored false negatives
            nignoredtp = 0  # the number of ignored true positives
            # the number of ignored pairs, i.e. a true positive which is
            # ignored but where the associated tracker detection has already
            # been ignored
            nignoredpairs = 0

            gi = 0
            for gg in g:
                if gg.tracker < 0:
                    if gg.occlusion > self.max_occlusion or gg.truncation > self.max_truncation \
                            or (self.cls == "car" and gg.obj_type == "van") \
                            or (self.cls == "pedestrian" and gg.obj_type == "person_sitting"):
                        seq_ignored[gg.track_id][-1] = True
                        gg.ignored = True
                        ignoredfn += 1

                elif gg.tracker >= 0:
                    if gg.occlusion > self.max_occlusion or gg.truncation > self.max_truncation \
                            or (self.cls == "car" and gg.obj_type == "van") \
                            or (self.cls == "pedestrian" and gg.obj_type == "person_sitting"):

                        seq_ignored[gg.track_id][-1] = True
                        gg.ignored = True
                        nignoredtp += 1

                        # if the associated tracker detection is already ignored,
                        # we want to avoid double counting ignored detections
                        if ignoredtrackers[gg.tracker] > 0:
                            nignoredpairs += 1

                        # for computing MODP, the overlaps from ignored detections
                        # are subtracted
                        tmpc -= tmpcs[gi]
                gi += 1

            # the below might be confusing, check the comments in __init__
            # to see what the individual statistics represent

            # correct TP by number of ignored TP due to truncation
            # ignored TP are shown as tracked in visualization
            tmptp -= nignoredtp

            # count the number of ignored true positives
            self.itp += nignoredtp

            # adjust the number of ground truth objects considered
            self.n_gt -= (ignoredfn + nignoredtp)

            # count the number of ignored ground truth objects
            self.n_igt += ignoredfn + nignoredtp

            # count the number of ignored tracker objects
            self.n_itr += nignoredtracker

            # count the number of ignored pairs, i.e. associated tracker and
            # ground truth objects that are both ignored
            self.n_igttr += nignoredpairs

            # false negatives = associated gt bboxes exceeding association
            # threshold + non-associated gt bboxes
            tmpfn += len(g) - len(association_matrix) - ignoredfn
            self.fn += len(g) - len(association_matrix) - ignoredfn
            self.ifn += ignoredfn

            # false positives = tracker bboxes - associated tracker bboxes
            # mismatches (mme_t)
            tmpfp += len(t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs
            self.fp += len(t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs

            # update sequence data
            seqtp += tmptp
            seqitp += nignoredtp
            seqfp += tmpfp
            seqfn += tmpfn
            seqifn += ignoredfn
            seqigt += ignoredfn + nignoredtp
            seqitr += nignoredtracker

            # sanity checks
            # - the number of true positives minus ignored true positives
            #   should be greater or equal to 0
            # - the number of false negatives should be greater or equal to 0
            # - the number of false positives needs to be greater or equal to 0
            #   otherwise ignored detections might be counted double
            # - the number of counted true positives (plus ignored ones)
            #   and the number of counted false negatives (plus ignored ones)
            #   should match the total number of ground truth objects
            # - the number of counted true positives (plus ignored ones)
            #   and the number of counted false positives
            #   plus the number of ignored tracker detections should
            #   match the total number of tracker detections; note that
            #   nignoredpairs is subtracted here to avoid double counting
            #   of ignored detections in nignoredtp and nignoredtracker
            if tmptp < 0:
                print(tmptp, nignoredtp)
                raise NameError("Something went wrong! TP is negative")
            if tmpfn < 0:
                print(tmpfn,
                      len(g),
                      len(association_matrix), ignoredfn, nignoredpairs)
                raise NameError("Something went wrong! FN is negative")
            if tmpfp < 0:
                print(tmpfp,
                      len(t), tmptp, nignoredtracker, nignoredtp,
                      nignoredpairs)
                raise NameError("Something went wrong! FP is negative")
            # ``!=`` instead of ``is not``: identity comparison of ints is
            # only reliable for small cached values and would raise
            # spuriously on frames with many objects
            if tmptp + tmpfn != len(g) - ignoredfn - nignoredtp:
                print("seqidx", seq_idx)
                print("frame ", f)
                print("TP ", tmptp)
                print("FN ", tmpfn)
                print("FP ", tmpfp)
                print("nGT ", len(g))
                print("nAss ", len(association_matrix))
                print("ign GT", ignoredfn)
                print("ign TP", nignoredtp)
                raise NameError(
                    "Something went wrong! nGroundtruth is not TP+FN")
            if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs != len(t):
                print(seq_idx, f, len(t), tmptp, tmpfp)
                print(len(association_matrix), association_matrix)
                raise NameError(
                    "Something went wrong! nTracker is not TP+FP")

            # check for id switches or fragmentations
            for i, tt in enumerate(this_ids[0]):
                if tt in last_ids[0]:
                    idx = last_ids[0].index(tt)
                    tid = this_ids[1][i]
                    lid = last_ids[1][idx]
                    if tid != lid and lid != -1 and tid != -1:
                        if g[i].truncation < self.max_truncation:
                            g[i].id_switch = 1
                            ids += 1
                    if tid != lid and lid != -1:
                        if g[i].truncation < self.max_truncation:
                            g[i].fragmentation = 1
                            fr += 1

            # save current index
            last_ids = this_ids
            # compute MOTP_t
            MODP_t = 1
            if tmptp != 0:
                MODP_t = tmpc / float(tmptp)
            self.MODP_t.append(MODP_t)

        # remove empty lists for current gt trajectories
        self.gt_trajectories[seq_idx] = seq_trajectories
        self.ign_trajectories[seq_idx] = seq_ignored

        # gather statistics for "per sequence" statistics.
        self.n_gts.append(n_gts)
        self.n_trs.append(n_trs)
        self.tps.append(seqtp)
        self.itps.append(seqitp)
        self.fps.append(seqfp)
        self.fns.append(seqfn)
        self.ifns.append(seqifn)
        self.n_igts.append(seqigt)
        self.n_itrs.append(seqitr)

    # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories
    n_ignored_tr_total = 0
    for seq_idx, (seq_trajectories, seq_ignored) in enumerate(
            zip(self.gt_trajectories, self.ign_trajectories)):
        if len(seq_trajectories) == 0:
            continue
        tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5
        n_ignored_tr = 0
        for g, ign_g in zip(seq_trajectories.values(),
                            seq_ignored.values()):
            # all frames of this gt trajectory are ignored
            if all(ign_g):
                n_ignored_tr += 1
                n_ignored_tr_total += 1
                continue
            # all frames of this gt trajectory are not assigned to any detections
            if all([this == -1 for this in g]):
                tmpML += 1
                self.ML += 1
                continue
            # compute tracked frames in trajectory
            last_id = g[0]
            # first detection (necessary to be in gt_trajectories) is always tracked
            tracked = 1 if g[0] >= 0 else 0
            lgt = 0 if ign_g[0] else 1
            for f in range(1, len(g)):
                if ign_g[f]:
                    last_id = -1
                    continue
                lgt += 1
                if last_id != g[f] and last_id != -1 and g[f] != -1 and g[
                        f - 1] != -1:
                    tmpId_switches += 1
                    self.id_switches += 1
                if f < len(g) - 1 and g[f - 1] != g[
                        f] and last_id != -1 and g[f] != -1 and g[f + 1] != -1:
                    tmpFragments += 1
                    self.fragments += 1
                if g[f] != -1:
                    tracked += 1
                    last_id = g[f]
            # handle last frame; tracked state is handled in for loop (g[f]!=-1)
            # (``f`` is only read when len(g) > 1, i.e. after the loop ran)
            if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[
                    f] != -1 and not ign_g[f]:
                tmpFragments += 1
                self.fragments += 1

            # compute MT/PT/ML
            tracking_ratio = tracked / float(len(g) - sum(ign_g))
            if tracking_ratio > 0.8:
                tmpMT += 1
                self.MT += 1
            elif tracking_ratio < 0.2:
                tmpML += 1
                self.ML += 1
            else:  # 0.2 <= tracking_ratio <= 0.8
                tmpPT += 1
                self.PT += 1

    if (self.n_gt_trajectories - n_ignored_tr_total) == 0:
        self.MT = 0.
        self.PT = 0.
        self.ML = 0.
    else:
        self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total)
        self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total)
        self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total)

    # precision/recall etc.
    if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0:
        self.recall = 0.
        self.precision = 0.
    else:
        self.recall = self.tp / float(self.tp + self.fn)
        self.precision = self.tp / float(self.fp + self.tp)
    if (self.recall + self.precision) == 0:
        self.F1 = 0.
    else:
        self.F1 = 2. * (self.precision * self.recall) / (
            self.precision + self.recall)
    if sum(self.n_frames) == 0:
        self.FAR = "n/a"
    else:
        self.FAR = self.fp / float(sum(self.n_frames))

    # compute CLEARMOT
    if self.n_gt == 0:
        self.MOTA = -float("inf")
        self.MODA = -float("inf")
    else:
        self.MOTA = 1 - (self.fn + self.fp + self.id_switches
                         ) / float(self.n_gt)
        self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt)
    if self.tp == 0:
        self.MOTP = float("inf")
    else:
        self.MOTP = self.total_cost / float(self.tp)
    if self.n_gt != 0:
        if self.id_switches == 0:
            self.MOTAL = 1 - (self.fn + self.fp + self.id_switches
                              ) / float(self.n_gt)
        else:
            self.MOTAL = 1 - (self.fn + self.fp +
                              math.log10(self.id_switches)
                              ) / float(self.n_gt)
    else:
        self.MOTAL = -float("inf")
    if sum(self.n_frames) == 0:
        self.MODP = "n/a"
    else:
        self.MODP = sum(self.MODP_t) / float(sum(self.n_frames))
    return True
(MODA)", + self.MODA) + "\n" + summary += self.printEntry("Multiple Object Detection Precision (MODP)", + self.MODP) + "\n" + summary += "\n" + summary += self.printEntry("Recall", self.recall) + "\n" + summary += self.printEntry("Precision", self.precision) + "\n" + summary += self.printEntry("F1", self.F1) + "\n" + summary += self.printEntry("False Alarm Rate", self.FAR) + "\n" + summary += "\n" + summary += self.printEntry("Mostly Tracked", self.MT) + "\n" + summary += self.printEntry("Partly Tracked", self.PT) + "\n" + summary += self.printEntry("Mostly Lost", self.ML) + "\n" + summary += "\n" + summary += self.printEntry("True Positives", self.tp) + "\n" + #summary += self.printEntry("True Positives per Sequence", self.tps) + "\n" + summary += self.printEntry("Ignored True Positives", self.itp) + "\n" + #summary += self.printEntry("Ignored True Positives per Sequence", self.itps) + "\n" + + summary += self.printEntry("False Positives", self.fp) + "\n" + #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" + summary += self.printEntry("False Negatives", self.fn) + "\n" + #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" + summary += self.printEntry("ID-switches", self.id_switches) + "\n" + self.fp = self.fp / self.n_gt + self.fn = self.fn / self.n_gt + self.id_switches = self.id_switches / self.n_gt + summary += self.printEntry("False Positives Ratio", self.fp) + "\n" + #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" + summary += self.printEntry("False Negatives Ratio", self.fn) + "\n" + #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" + summary += self.printEntry("Ignored False Negatives Ratio", + self.ifn) + "\n" + + #summary += self.printEntry("Ignored False Negatives per Sequence", self.ifns) + "\n" + summary += self.printEntry("Missed Targets", self.fn) + "\n" + summary += self.printEntry("ID-switches", self.id_switches) + "\n" + summary += 
def printEntry(self, key, val, width=(70, 10)):
    """
    Format one ``key``/``val`` pair as a fixed-width table row.

    The key is left-justified to ``width[0]`` characters. Exact ``int``
    values are rendered with ``%d`` and exact ``float`` values with ``%f``,
    both right-aligned in ``width[1]`` characters; anything else (strings,
    bools, lists, ...) is converted with ``%s`` and right-justified.
    """
    label = key.ljust(width[0])
    kind = type(val)
    if kind is int:
        pattern = "%%%dd" % width[1]
        return label + pattern % val
    if kind is float:
        pattern = "%%%df" % (width[1])
        return label + pattern % val
    return label + ("%s" % val).rjust(width[1])
class KITTIMOTMetric(Metric):
    """
    Metric wrapper that runs the KITTI tracking evaluation.

    ``update`` registers one sequence (and determines its frame count from
    the label and result files); ``accumulate`` builds the evaluator over all
    registered sequences and stores the textual summary.
    """

    def __init__(self, save_summary=True):
        self.save_summary = save_summary
        self.MOTEvaluator = KITTIEvaluation
        self.result_root = None
        self.reset()

    def reset(self):
        """Forget all registered sequences and the last summary."""
        self.seqs = []
        self.n_sequences = 0
        self.n_frames = []
        self.strsummary = ''

    @staticmethod
    def _max_frame_index(path, current_max=0):
        """Return the largest first-column integer in ``path``, or
        ``current_max`` if that is larger."""
        # ``with`` guarantees the handle is closed even if a line fails to parse
        with open(path, "r") as handle:
            for line in handle:
                frame = int(line.strip().split(" ")[0])
                if frame > current_max:
                    current_max = frame
        return current_max

    def update(self, data_root, seq, data_type, result_root, result_filename):
        """
        Register sequence ``seq`` for evaluation.

        The number of frames is taken as ``max frame index + 1`` over the
        ground truth label file and the tracker result file.
        """
        assert data_type == 'kitti', "data_type should 'kitti'"
        self.result_root = result_root
        self.gt_path = data_root
        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
        max_frame = self._max_frame_index(gt_path)
        max_frame = self._max_frame_index(result_filename, max_frame)
        self.n_frames.append(max_frame + 1)
        self.seqs.append(seq)
        self.n_sequences += 1

    def accumulate(self):
        """
        Load tracker results and ground truth, run the evaluation and keep
        the summary string in ``self.strsummary``.

        Returns ``None`` normally (also when the tracker results cannot be
        loaded) and ``False`` when the number of ground truth and tracker
        sequences differ.

        Raises:
            ValueError: if the ground truth cannot be loaded.
        """
        logger.info("Processing Result for KITTI Tracking Benchmark")
        e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path,
                              n_frames=self.n_frames, seqs=self.seqs,
                              n_sequences=self.n_sequences)
        try:
            if not e.loadTracker():
                return
            logger.info("Loading Results - Success")
            # the evaluated class is taken from the evaluator; the previous
            # code referenced an undefined name here, so this line always
            # raised and the except branch always fired
            # NOTE(review): assumes the evaluator exposes ``cls`` - confirm
            logger.info("Evaluate Object Class: %s" % e.cls.upper())
        except Exception:
            logger.info("Caught exception while loading result data.")
        if not e.loadGroundtruth():
            raise ValueError("Ground truth not found.")
        logger.info("Loading Groundtruth - Success")
        # sanity checks (value comparison; ``is not`` on ints is unreliable)
        if len(e.groundtruth) != len(e.tracker):
            logger.info(
                "The uploaded data does not provide results for every sequence.")
            return False
        logger.info("Loaded %d Sequences." % len(e.groundtruth))
        logger.info("Start Evaluation...")

        if e.compute3rdPartyMetrics():
            self.strsummary = e.saveToStats(self.save_summary)
        else:
            logger.info(
                "There seem to be no true positives or false positives at all in the submitted data."
            )

    def log(self):
        """Print the last computed summary."""
        print(self.strsummary)

    def get_results(self):
        """Return the last computed summary string."""
        return self.strsummary
def pad_matrix(self, matrix, pad_value=0):
    """
    Pad a possibly non-square matrix to make it square.

    :Parameters:
        matrix : list of lists
            matrix to pad

        pad_value : int
            value to use to pad the matrix

    :rtype: list of lists
    :return: a new, possibly padded, matrix; the rows of the input are
             copied, never modified
    """
    max_columns = max([len(row) for row in matrix] + [0])
    size = max(max_columns, len(matrix))

    new_matrix = []
    for row in matrix:
        new_row = row[:]
        missing = size - len(row)
        if missing > 0:
            # Row too short. Pad it with the requested value (the previous
            # code always padded with 0 and ignored ``pad_value``).
            new_row += [pad_value] * missing
        new_matrix.append(new_row)

    # Too few rows: append full rows of padding until the matrix is square.
    while len(new_matrix) < size:
        new_matrix.append([pad_value] * size)

    return new_matrix
def __copy_matrix(self, matrix):
    """Return an exact (deep) copy of the supplied matrix."""
    # Imported locally: the module's top-level imports do not include
    # ``copy``, so the bare ``copy.deepcopy`` call raised NameError.
    import copy
    return copy.deepcopy(matrix)
def __step3(self):
    """
    Cover every column that holds a starred zero.

    Returns 7 (done) when the number of starred zeros reaches ``n``,
    i.e. the stars already form a complete assignment; otherwise
    returns 4 to continue with step 4.
    """
    size = self.n
    starred_columns = [
        j for i in range(size) for j in range(size)
        if self.marked[i][j] == 1
    ]
    for j in starred_columns:
        self.col_covered[j] = True

    return 7 if len(starred_columns) >= size else 4
def __find_smallest(self):
    """
    Return the smallest value among cells whose row and column are both
    uncovered; 2e9 is returned when no such cell is smaller than that.
    """
    size = self.n
    best = 2e9  # sys.maxint
    for i in range(size):
        for j in range(size):
            if self.row_covered[i] or self.col_covered[j]:
                continue
            candidate = self.C[i][j]
            if best > candidate:
                best = candidate
    return best
def make_cost_matrix(profit_matrix, inversion_function):
    """
    Build a cost matrix from a profit matrix.

    ``inversion_function`` is applied to every entry; it must take one
    numeric argument and return the value to use as the corresponding cost.

    For example:

    .. python::

        cost_matrix = make_cost_matrix(matrix, lambda x: sys.maxsize - x)

    :Parameters:
        profit_matrix : list of lists
            The matrix to convert from a profit to a cost matrix

        inversion_function : function
            The function to use to invert each entry in the profit matrix

    :rtype: list of lists
    :return: The converted matrix (a new list of new row lists)
    """
    return [[inversion_function(entry) for entry in profit_row]
            for profit_row in profit_matrix]
class AverageMeter(object):
    """
    Running weighted average of a scalar.

    Attributes (all reset to 0 by ``reset``):
        val:   the most recent value passed to ``update``
        sum:   sum of ``val * n`` over all updates
        count: sum of ``n`` over all updates
        avg:   ``sum / count`` (0 while ``count`` is 0)
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """
        Add ``val`` with weight ``n`` to the running average.

        Args:
            val: the new value.
            n: weight of the value (e.g. number of samples it summarizes).
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. the first batch carries n == 0) used to
        # raise ZeroDivisionError; keep the reset value instead.
        self.avg = self.sum / self.count if self.count else 0
class Pose3DEval(object):
    """
    Accumulates mPJPE and PA-mPJPE over evaluation batches.

    Args:
        output_eval: directory used to build the path of
            ``pose3d_results.json`` (``self.res_file``).
        save_prediction_only: when True, ``accumulate`` and ``log`` skip
            the metric computation/printing.
    """

    def __init__(self, output_eval, save_prediction_only=False):
        super(Pose3DEval, self).__init__()
        self.output_eval = output_eval
        self.res_file = os.path.join(output_eval, "pose3d_results.json")
        self.save_prediction_only = save_prediction_only
        self.reset()

    def reset(self):
        """Start fresh meters and clear stored results."""
        self.PAmPJPE = AverageMeter()
        self.mPJPE = AverageMeter()
        self.eval_results = {}

    def get_human36m_joints(self, input):
        """Select the 14-joint subset from a 24-joint tensor along axis 1."""
        # Only the 14-joint mapping is used; the 17-joint index tensor that
        # used to be built here on every call was never read.
        J24_TO_J14 = paddle.to_tensor(
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18])
        return paddle.index_select(input, J24_TO_J14, axis=1)

    def update(self, inputs, outputs):
        """
        Update both meters with one batch.

        Reads ``inputs['joints_3d']``, ``inputs['has_3d_joints']`` and
        ``outputs['pose3d']``, gathers them across ranks, reduces 24-joint
        tensors to 14 joints, and records both errors scaled by 1000 and
        weighted by the sum of ``has_3d_joints``.
        """
        gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv()
                                                           .local_rank))
        has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv()
                                                                .local_rank))
        pred_3d_joints = all_gather(outputs['pose3d'])
        if gt_3d_joints.shape[1] == 24:
            gt_3d_joints = self.get_human36m_joints(gt_3d_joints)
        if pred_3d_joints.shape[1] == 24:
            pred_3d_joints = self.get_human36m_joints(pred_3d_joints)
        mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints,
                                                  has_3d_joints).mean()
        PAmPJPE_val = reconstruction_error(
            pred_3d_joints.numpy(),
            gt_3d_joints[:, :, :3].numpy(),
            reduction=None).mean()
        count = int(np.sum(has_3d_joints.numpy()))
        self.PAmPJPE.update(PAmPJPE_val * 1000., count)
        self.mPJPE.update(mPJPE_val * 1000., count)

    def accumulate(self):
        """Store the negated averages as ``eval_results['pose3d']``."""
        if self.save_prediction_only:
            logger.info(f'The pose3d result is saved to {self.res_file} '
                        'and do not evaluate the model.')
            return
        # stored negated; ``log`` prints the absolute values
        self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg]

    def log(self):
        """Print the accumulated metrics as a small table."""
        if self.save_prediction_only:
            return
        stats_names = ['mPJPE', 'PAmPJPE']
        num_values = len(stats_names)
        print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |')
        print('|---' * (num_values + 1) + '|')

        print(' '.join([
            '| {:.3f}'.format(abs(value))
            for value in self.eval_results['pose3d']
        ]) + ' |')

    def get_results(self):
        """Return the dict of accumulated results."""
        return self.eval_results
def face_eval_run(model,
                  image_dir,
                  gt_file,
                  pred_dir='output/pred',
                  eval_mode='widerface',
                  multi_scale=False):
    """Run face detection over every image listed in a ground-truth file.

    Args:
        model: detector passed through to ``detect_face`` and the
            multi-scale helpers.
        image_dir (str): directory the image names in ``gt_file`` are
            relative to.
        gt_file (str): annotation file; each record is an image name line,
            a line whose first token is the number of boxes, then that many
            box lines.
        pred_dir (str): directory predictions are written to.
        eval_mode (str): ``'widerface'`` writes one file per image,
            ``'fddb'`` appends ``.jpg`` to each name and writes one combined
            file at the end.
        multi_scale (bool): when True, merge detections from several scales
            and a horizontal flip with ``bbox_vote``.

    Raises:
        FileNotFoundError: an image listed in ``gt_file`` does not exist.
        ValueError: an image exists but cannot be decoded.
    """
    # Load the ground-truth index: only the image names are needed here.
    with open(gt_file, 'r') as f:
        gt_lines = f.readlines()
    imid2path = []
    pos_gt = 0
    while pos_gt < len(gt_lines):
        name_gt = gt_lines[pos_gt].strip('\n\t').split()[0]
        imid2path.append(name_gt)
        pos_gt += 1
        n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0])
        # Skip the count line plus the n_gt box lines.
        pos_gt += 1 + n_gt
    logger.info('The ground truth file load {} images'.format(len(imid2path)))

    dets_dist = OrderedDict()
    for iter_id, im_path in enumerate(imid2path):
        image_path = os.path.join(image_dir, im_path)
        if eval_mode == 'fddb':
            image_path += '.jpg'
        # Explicit checks instead of `assert`, which disappears under -O and
        # would let cv2 fail later with an unrelated error.
        if not os.path.exists(image_path):
            raise FileNotFoundError(
                'Image listed in ground truth not found: {}'.format(
                    image_path))
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError('Failed to decode image: {}'.format(image_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if multi_scale:
            shrink, max_shrink = get_shrink(image.shape[0], image.shape[1])
            det0 = detect_face(model, image, shrink)
            det1 = flip_test(model, image, shrink)
            [det2, det3] = multi_scale_test(model, image, max_shrink)
            det4 = multi_scale_test_pyramid(model, image, max_shrink)
            # np.vstack is the supported spelling of the deprecated
            # np.row_stack alias.
            det = np.vstack((det0, det1, det2, det3, det4))
            dets = bbox_vote(det)
        else:
            dets = detect_face(model, image, 1)
        if eval_mode == 'widerface':
            save_widerface_bboxes(image_path, dets, pred_dir)
        else:
            dets_dist[im_path] = dets
        if iter_id % 100 == 0:
            logger.info('Test iter {}'.format(iter_id))
    if eval_mode == 'fddb':
        save_fddb_bboxes(dets_dist, pred_dir)
    logger.info("Finish evaluation.")
def flip_test(model, image, shrink):
    """Detect faces on the horizontally mirrored image and un-mirror the boxes.

    Args:
        model: detector forwarded to ``detect_face``.
        image (np.ndarray): input image; ``image.shape[1]`` is used as width.
        shrink (float): resize factor forwarded to ``detect_face``.

    Returns:
        np.ndarray: array shaped like the ``detect_face`` output with columns
        ``xmin, ymin, xmax, ymax, score`` expressed in the coordinates of the
        original (non-flipped) image.
    """
    mirrored = detect_face(model, cv2.flip(image, 1), shrink)
    restored = np.zeros(mirrored.shape)
    width = image.shape[1]
    # A horizontal flip swaps the roles of xmin and xmax.
    restored[:, 0] = width - mirrored[:, 2]
    restored[:, 2] = width - mirrored[:, 0]
    # Vertical coordinates and the score are unaffected by the flip.
    for col in (1, 3, 4):
        restored[:, col] = mirrored[:, col]
    return restored
def multi_scale_test_pyramid(model, image, max_shrink):
    """Detect faces over a fixed pyramid of image scales.

    The image is always evaluated at scale 0.25 and additionally at each of
    0.75, 1.25, 1.5 and 1.75 that does not exceed ``max_shrink``.

    Args:
        model: detector forwarded to ``detect_face``.
        image (np.ndarray): input image.
        max_shrink (float): largest scale that may be evaluated.

    Returns:
        np.ndarray: the kept detections of all evaluated scales stacked
        row-wise, in evaluation order.
    """

    def _box_sides(det):
        # Inclusive box width and height.
        return det[:, 2] - det[:, 0] + 1, det[:, 3] - det[:, 1] + 1

    # Use image pyramids to detect faces
    det_b = detect_face(model, image, 0.25)
    box_w, box_h = _box_sides(det_b)
    kept = [det_b[np.where(np.maximum(box_w, box_h) > 30)[0], :]]

    for scale in (0.75, 1.25, 1.5, 1.75):
        if scale <= max_shrink:
            det_temp = detect_face(model, image, scale)
            box_w, box_h = _box_sides(det_temp)
            if scale > 1:
                # Enlarged images are only used to detect small faces.
                index = np.where(np.minimum(box_w, box_h) < 100)[0]
            else:
                # Shrunk images are only used to detect big faces.
                index = np.where(np.maximum(box_w, box_h) > 30)[0]
            kept.append(det_temp[index, :])
    # Stack once at the end; np.vstack replaces the deprecated np.row_stack.
    return np.vstack(kept)
def get_shrink(height, width):
    """Compute the resize factors used for multi-scale face testing.

    Args:
        height (int): image height.
        width (int): image width.

    Returns:
        tuple: ``(shrink, max_shrink)`` where ``max_shrink`` is the largest
        scale allowed by the memory bounds below (never below 0.1) and
        ``shrink`` is ``max_shrink`` capped at 1.
    """
    # avoid out of memory
    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5

    def get_round(x, loc):
        """Truncate (not round) ``x`` to ``loc`` decimals; always return a number."""
        whole, dot, frac = str(x).partition('.')
        # Values with fewer than three decimals are returned untouched.
        # Exponent notation ('8.3e-05') and dot-less text ('1e-05', 'inf')
        # must not be cut textually: that would change the magnitude.
        if not dot or 'e' in frac or len(frac) < 3:
            return x
        return float(whole + '.' + frac[0:loc])

    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
    # Apply an extra safety margin that grows with the scale.
    if max_shrink >= 1.5 and max_shrink < 2:
        max_shrink = max_shrink - 0.1
    elif max_shrink >= 2 and max_shrink < 3:
        max_shrink = max_shrink - 0.2
    elif max_shrink >= 3 and max_shrink < 4:
        max_shrink = max_shrink - 0.3
    elif max_shrink >= 4 and max_shrink < 5:
        max_shrink = max_shrink - 0.4
    elif max_shrink >= 5:
        max_shrink = max_shrink - 0.5
    elif max_shrink <= 0.1:
        max_shrink = 0.1

    shrink = max_shrink if max_shrink < 1 else 1
    return shrink, max_shrink
def save_fddb_bboxes(bboxes_scores,
                     output_dir,
                     output_fname='pred_fddb_res.txt'):
    """Write all detections to a single FDDB-style text file.

    For every image the file contains the image path, the number of
    detections, then one ``xmin ymin width height score`` line per detection.

    Args:
        bboxes_scores (dict): maps image path to an array whose rows are
            ``xmin, ymin, xmax, ymax, score``.
        output_dir (str): directory to write into; created if missing.
        output_fname (str): name of the result file inside ``output_dir``.

    Returns:
        str: path of the written file.
    """
    os.makedirs(output_dir, exist_ok=True)
    predict_file = os.path.join(output_dir, output_fname)
    # `with` guarantees the file is flushed and closed before its path is
    # returned to the caller.
    with open(predict_file, 'w') as f:
        # dict.iteritems() does not exist on Python 3; items() is the
        # equivalent.
        for image_path, dets in bboxes_scores.items():
            f.write('{:s}\n'.format(image_path))
            f.write('{:d}\n'.format(dets.shape[0]))
            for box_score in dets:
                xmin, ymin, xmax, ymax, score = box_score
                width, height = xmax - xmin, ymax - ymin
                f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
                        .format(xmin, ymin, width, height, score))
    logger.info("The predicted result is saved as {}".format(predict_file))
    return predict_file
b/rtdetr_paddle/ppdet/modeling/__init__.py new file mode 100644 index 0000000..9c29c8c --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +warnings.filterwarnings( + action='ignore', category=DeprecationWarning, module='ops') + + +from .ops import * +from .backbones import * +from .heads import * +from .losses import * +from .architectures import * +from .post_process import * +from .layers import * +from .transformers import * diff --git a/rtdetr_paddle/ppdet/modeling/architectures/__init__.py b/rtdetr_paddle/ppdet/modeling/architectures/__init__.py new file mode 100644 index 0000000..318b760 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/architectures/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@register
class DETR(BaseArch):
    """DETR-style detector: backbone -> optional neck -> transformer -> head.

    Constructor arguments are stored as attributes unchanged; ``post_process``
    is listed in ``__inject__`` and ``with_mask`` / ``exclude_post_process``
    in ``__shared__``.
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 post_process='DETRPostProcess',
                 with_mask=False,
                 exclude_post_process=False):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.post_process = post_process
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build backbone, neck, transformer and head from ``cfg``.

        Returns:
            dict: the created sub-modules keyed by constructor argument name.
        """
        backbone = create(cfg['backbone'])

        # The neck, when configured, consumes the backbone feature shapes.
        feat_shape = backbone.out_shape
        neck = create(
            cfg['neck'], input_shape=feat_shape) if cfg['neck'] else None

        # The transformer reads the neck output when a neck exists,
        # otherwise the backbone output.
        if neck is not None:
            feat_shape = neck.out_shape
        transformer = create(cfg['transformer'], input_shape=feat_shape)

        # The head always receives the backbone shapes.
        detr_head = create(
            cfg['detr_head'],
            hidden_dim=transformer.hidden_dim,
            nhead=transformer.nhead,
            input_shape=backbone.out_shape)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck
        }

    def _forward(self):
        """Run the network; return losses in training, predictions otherwise."""
        feats = self.backbone(self.inputs)
        if self.neck is not None:
            feats = self.neck(feats)

        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(feats, pad_mask, self.inputs)

        if self.training:
            detr_losses = self.detr_head(out_transformer, feats, self.inputs)
            # Entries whose key contains 'log' are excluded from the total.
            total = paddle.add_n(
                [v for k, v in detr_losses.items() if 'log' not in k])
            detr_losses.update({'loss': total})
            return detr_losses

        preds = self.detr_head(out_transformer, feats)
        if self.exclude_post_process:
            bbox, bbox_num, mask = preds
        else:
            bbox, bbox_num, mask = self.post_process(
                preds, self.inputs['im_shape'], self.inputs['scale_factor'],
                paddle.shape(self.inputs['image'])[2:])

        output = {'bbox': bbox, 'bbox_num': bbox_num}
        if self.with_mask:
            output['mask'] = mask
        return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()
@register
class BaseArch(nn.Layer):
    """Base class of detection architectures.

    Subclasses implement ``get_loss`` and ``get_pred``; ``forward`` dispatches
    to one of them depending on ``self.training``.
    """

    def __init__(self, data_format='NCHW', use_extra_data=False):
        super(BaseArch, self).__init__()
        self.data_format = data_format
        self.inputs = {}
        self.fuse_norm = False
        self.use_extra_data = use_extra_data

    def load_meanstd(self, cfg_transform):
        """Derive ``self.scale`` / ``self.bias`` from a transform config list.

        The first item containing ``'NormalizeImage'`` supplies mean, std and
        the ``is_scale`` flag; ImageNet statistics without 1/255 scaling are
        used when no such item exists.
        """
        scale = 1.
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        for item in cfg_transform:
            if 'NormalizeImage' not in item:
                continue
            norm_cfg = item['NormalizeImage']
            mean = np.array(norm_cfg['mean'], dtype=np.float32)
            std = np.array(norm_cfg['std'], dtype=np.float32)
            if norm_cfg.get('is_scale', True):
                scale = 1. / 255.
            break
        # The channel axis is last for NHWC and second otherwise.
        if self.data_format == 'NHWC':
            target_shape = (1, 1, 1, 3)
        else:
            target_shape = (1, 3, 1, 1)
        self.scale = paddle.to_tensor(scale / std).reshape(target_shape)
        self.bias = paddle.to_tensor(-mean / std).reshape(target_shape)

    def forward(self, inputs):
        """Bind ``inputs`` and return losses (training) or predictions."""
        if self.data_format == 'NHWC':
            image = inputs['image']
            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])

        if not self.fuse_norm:
            self.inputs = inputs
        else:
            # Normalisation is applied here instead of in the data pipeline.
            image = inputs['image']
            self.inputs['image'] = image * self.scale + self.bias
            self.inputs['im_shape'] = inputs['im_shape']
            self.inputs['scale_factor'] = inputs['scale_factor']

        self.model_arch()

        if self.training:
            return self.get_loss()

        # A sequence of input dicts means multi-scale testing.
        if isinstance(inputs, typing.Sequence):
            inputs_list = list(inputs)
        else:
            inputs_list = [inputs]

        outs = []
        for inp in inputs_list:
            if not self.fuse_norm:
                self.inputs = inp
            else:
                self.inputs['image'] = inp['image'] * self.scale + self.bias
                self.inputs['im_shape'] = inp['im_shape']
                self.inputs['scale_factor'] = inp['scale_factor']
            outs.append(self.get_pred())

        if len(outs) > 1:
            return self.merge_multi_scale_predictions(outs)
        return outs[0]

    def merge_multi_scale_predictions(self, outs):
        """Merge per-scale predictions with class-wise NMS.

        Raises:
            Exception: the concrete class is not CascadeRCNN, FasterRCNN or
                MaskRCNN.
        """
        # default values for architectures not included in following list
        num_classes = 80
        nms_threshold = 0.5
        keep_top_k = 100

        if self.__class__.__name__ not in ('CascadeRCNN', 'FasterRCNN',
                                           'MaskRCNN'):
            raise Exception(
                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
            )
        num_classes = self.bbox_head.num_classes
        keep_top_k = self.bbox_post_process.nms.keep_top_k
        nms_threshold = self.bbox_post_process.nms.nms_threshold

        final_boxes = []
        all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy()
        for c in range(num_classes):
            # Column 0 holds the class id.
            idxs = all_scale_outs[:, 0] == c
            if np.count_nonzero(idxs) == 0:
                continue
            r = nms(all_scale_outs[idxs, 1:], nms_threshold)
            final_boxes.append(
                np.concatenate([np.full((r.shape[0], 1), c), r], 1))
        out = np.concatenate(final_boxes)
        # Keep the keep_top_k rows with the largest value in column 1.
        ranked = sorted(out, key=lambda e: e[1])
        out = np.concatenate(ranked[-keep_top_k:]).reshape((-1, 6))
        out = {
            'bbox': paddle.to_tensor(out),
            'bbox_num': paddle.to_tensor(np.array([out.shape[0], ]))
        }

        return out

    def build_inputs(self, data, input_def):
        """Pair each name in ``input_def`` with the same-index entry of ``data``."""
        return {k: data[i] for i, k in enumerate(input_def)}

    def model_arch(self, ):
        pass

    def get_loss(self, ):
        raise NotImplementedError("Should implement get_loss method!")

    def get_pred(self, ):
        raise NotImplementedError("Should implement get_pred method!")
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .resnet import * +from .darknet import * +from .mobilenet_v1 import * +from .mobilenet_v3 import * +from .shufflenet_v2 import * +from .swin_transformer import * +from .lcnet import * +from .cspresnet import * +from .csp_darknet import * +from .convnext import * +from .vision_transformer import * +from .mobileone import * +from .trans_encoder import * +from .focalnet import * +from .vit_mae import * +from .hgnet_v2 import * diff --git a/rtdetr_paddle/ppdet/modeling/backbones/convnext.py b/rtdetr_paddle/ppdet/modeling/backbones/convnext.py new file mode 100644 index 0000000..476e12b --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/backbones/convnext.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Modified from https://github.com/facebookresearch/ConvNeXt +Copyright (c) Meta Platforms, Inc. and affiliates. +All rights reserved. 
+This source code is licensed under the license found in the +LICENSE file in the root directory of this source tree. +''' + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant + +import numpy as np + +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec +from .transformer_utils import DropPath, trunc_normal_, zeros_ + +__all__ = ['ConvNeXt'] + + +class Block(nn.Layer): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in Pypaddle + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2D( + dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + + if layer_scale_init_value > 0: + self.gamma = self.create_parameter( + shape=(dim, ), + attr=ParamAttr(initializer=Constant(layer_scale_init_value))) + else: + self.gamma = None + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
class LayerNorm(nn.Layer):
    r""" Layer normalisation over the channel axis for two layouts.

    ``channels_last`` (default) expects inputs shaped
    (batch_size, height, width, channels); ``channels_first`` expects
    (batch_size, channels, height, width).

    Args:
        normalized_shape (int): number of channels.
        eps (float): value added to the variance before the square root.
        data_format (str): ``"channels_last"`` or ``"channels_first"``.

    Raises:
        NotImplementedError: ``data_format`` is neither of the two layouts.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()

        param_shape = (normalized_shape, )
        self.weight = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
        self.bias = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))

        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.eps)
        if self.data_format == "channels_first":
            # Manual normalisation over axis 1 (the channel axis).
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / paddle.sqrt(var + self.eps)
            return (self.weight[:, None, None] * normed +
                    self.bias[:, None, None])
Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + arch_settings = { + 'tiny': { + 'depths': [3, 3, 9, 3], + 'dims': [96, 192, 384, 768] + }, + 'small': { + 'depths': [3, 3, 27, 3], + 'dims': [96, 192, 384, 768] + }, + 'base': { + 'depths': [3, 3, 27, 3], + 'dims': [128, 256, 512, 1024] + }, + 'large': { + 'depths': [3, 3, 27, 3], + 'dims': [192, 384, 768, 1536] + }, + 'xlarge': { + 'depths': [3, 3, 27, 3], + 'dims': [256, 512, 1024, 2048] + }, + } + + def __init__( + self, + arch='tiny', + in_chans=3, + drop_path_rate=0., + layer_scale_init_value=1e-6, + return_idx=[1, 2, 3], + norm_output=True, + pretrained=None, ): + super().__init__() + depths = self.arch_settings[arch]['depths'] + dims = self.arch_settings[arch]['dims'] + self.downsample_layers = nn.LayerList( + ) # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2D( + in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm( + dims[0], eps=1e-6, data_format="channels_first")) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm( + dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2D( + dims[i], dims[i + 1], kernel_size=2, stride=2), ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.LayerList( + ) # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential(* [ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + self.return_idx = return_idx + self.dims = [dims[i] for i in return_idx] # [::-1] + + self.norm_output = norm_output + if norm_output: + self.norms = nn.LayerList([ + LayerNorm( + c, 
eps=1e-6, data_format="channels_first") + for c in self.dims + ]) + + self.apply(self._init_weights) + + if pretrained is not None: + if 'http' in pretrained: #URL + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: #model in local path + path = pretrained + self.set_state_dict(paddle.load(path)) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2D, nn.Linear)): + trunc_normal_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + output = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + output.append(x) + + outputs = [output[i] for i in self.return_idx] + if self.norm_output: + outputs = [self.norms[i](out) for i, out in enumerate(outputs)] + + return outputs + + def forward(self, x): + x = self.forward_features(x['image']) + return x + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self.dims] diff --git a/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py b/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py new file mode 100644 index 0000000..4c225d1 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py @@ -0,0 +1,404 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class BaseConv(nn.Layer):
    """Conv2D followed by BatchNorm2D and an ``x * sigmoid(x)`` activation.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        ksize (int): square kernel size; padding is ``(ksize - 1) // 2``.
        stride (int): convolution stride.
        groups (int): convolution groups.
        bias (bool): forwarded as the convolution's ``bias_attr``.
        act (str): accepted for interface compatibility; ``forward`` always
            applies ``x * sigmoid(x)`` regardless of this value.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride,
                 groups=1,
                 bias=False,
                 act="silu"):
        super(BaseConv, self).__init__()
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias_attr=bias)
        # Both batch-norm parameters use an L2 decay coefficient of 0.
        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        self.bn = nn.BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        self._init_weights()

    def _init_weights(self):
        conv_init_(self.conv)

    def forward(self, x):
        # use 'x * F.sigmoid(x)' replace 'silu'
        normed = self.bn(self.conv(x))
        return normed * F.sigmoid(normed)
outputs [bs, 4C, W/2, H/2] + top_left = inputs[:, :, 0::2, 0::2] + top_right = inputs[:, :, 0::2, 1::2] + bottom_left = inputs[:, :, 1::2, 0::2] + bottom_right = inputs[:, :, 1::2, 1::2] + outputs = paddle.concat( + [top_left, bottom_left, top_right, bottom_right], 1) + return self.conv(outputs) + + +class BottleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(BottleNeck, self).__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = Conv( + hidden_channels, + out_channels, + ksize=3, + stride=1, + bias=bias, + act=act) + self.add_shortcut = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.add_shortcut: + y = y + x + return y + + +class SPPLayer(nn.Layer): + """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + bias=False, + act="silu"): + super(SPPLayer, self).__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.maxpoolings = nn.LayerList([ + nn.MaxPool2D( + kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv( + conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) + + def forward(self, x): + x = self.conv1(x) + x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) + x = self.conv2(x) + return x + + +class SPPFLayer(nn.Layer): + """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, + equivalent to SPP(k=(5, 9, 13)) + """ + + def __init__(self, + in_channels, + out_channels, + ksize=5, + 
bias=False, + act='silu'): + super(SPPFLayer, self).__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.maxpooling = nn.MaxPool2D( + kernel_size=ksize, stride=1, padding=ksize // 2) + conv2_channels = hidden_channels * 4 + self.conv2 = BaseConv( + conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) + + def forward(self, x): + x = self.conv1(x) + y1 = self.maxpooling(x) + y2 = self.maxpooling(y1) + y3 = self.maxpooling(y2) + concats = paddle.concat([x, y1, y2, y3], axis=1) + out = self.conv2(concats) + return out + + +class CSPLayer(nn.Layer): + """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" + + def __init__(self, + in_channels, + out_channels, + num_blocks=1, + shortcut=True, + expansion=0.5, + depthwise=False, + bias=False, + act="silu"): + super(CSPLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(* [ + BottleNeck( + hidden_channels, + hidden_channels, + shortcut=shortcut, + expansion=1.0, + depthwise=depthwise, + bias=bias, + act=act) for _ in range(num_blocks) + ]) + self.conv3 = BaseConv( + hidden_channels * 2, + out_channels, + ksize=1, + stride=1, + bias=bias, + act=act) + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + x = paddle.concat([x_1, x_2], axis=1) + x = self.conv3(x) + return x + + +@register +@serializable +class CSPDarkNet(nn.Layer): + """ + CSPDarkNet backbone. + Args: + arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, + and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. + depth_mult (float): Depth multiplier, multiply number of channels in + each layer, default as 1.0. 
+ width_mult (float): Width multiplier, multiply number of blocks in + CSPLayer, default as 1.0. + depthwise (bool): Whether to use depth-wise conv layer. + act (str): Activation function type, default as 'silu'. + return_idx (list): Index of stages whose feature maps are returned. + """ + + __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] + + # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) + # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. + arch_settings = { + 'X': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 1024, 3, True, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, True, True]], + } + + def __init__(self, + arch='X', + depth_mult=1.0, + width_mult=1.0, + depthwise=False, + act='silu', + trt=False, + return_idx=[2, 3, 4]): + super(CSPDarkNet, self).__init__() + self.arch = arch + self.return_idx = return_idx + Conv = DWConv if depthwise else BaseConv + arch_setting = self.arch_settings[arch] + base_channels = int(arch_setting[0][0] * width_mult) + + # Note: differences between the latest YOLOv5 and the original YOLOX + # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX) + # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) + # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer + # 4. 
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX + if arch in ['P5', 'P6']: + # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) + self.stem = Conv( + 3, base_channels, ksize=6, stride=2, bias=False, act=act) + spp_kernal_sizes = 5 + elif arch in ['X']: + # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) + self.stem = Focus( + 3, base_channels, ksize=3, stride=1, bias=False, act=act) + spp_kernal_sizes = (5, 9, 13) + else: + raise AttributeError("Unsupported arch type: {}".format(arch)) + + _out_channels = [base_channels] + layers_num = 1 + self.csp_dark_blocks = [] + + for i, (in_channels, out_channels, num_blocks, shortcut, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * width_mult) + out_channels = int(out_channels * width_mult) + _out_channels.append(out_channels) + num_blocks = max(round(num_blocks * depth_mult), 1) + stage = [] + + conv_layer = self.add_sublayer( + 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), + Conv( + in_channels, out_channels, 3, 2, bias=False, act=act)) + stage.append(conv_layer) + layers_num += 1 + + if use_spp and arch in ['X']: + # in YOLOX use SPPLayer + spp_layer = self.add_sublayer( + 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), + SPPLayer( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + bias=False, + act=act)) + stage.append(spp_layer) + layers_num += 1 + + csp_layer = self.add_sublayer( + 'layers{}.stage{}.csp_layer'.format(layers_num, i + 1), + CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + shortcut=shortcut, + depthwise=depthwise, + bias=False, + act=act)) + stage.append(csp_layer) + layers_num += 1 + + if use_spp and arch in ['P5', 'P6']: + # in latest YOLOv5 use SPPFLayer instead of SPPLayer + sppf_layer = self.add_sublayer( + 'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1), + SPPFLayer( + out_channels, + out_channels, + ksize=5, + bias=False, + 
act=act)) + stage.append(sppf_layer) + layers_num += 1 + + self.csp_dark_blocks.append(nn.Sequential(*stage)) + + self._out_channels = [_out_channels[i] for i in self.return_idx] + self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] + + def forward(self, inputs): + x = inputs['image'] + outputs = [] + x = self.stem(x) + for i, layer in enumerate(self.csp_dark_blocks): + x = layer(x) + if i + 1 in self.return_idx: + outputs.append(x) + return outputs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=c, stride=s) + for c, s in zip(self._out_channels, self.strides) + ] diff --git a/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py b/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py new file mode 100644 index 0000000..5268ec8 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py @@ -0,0 +1,321 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Constant + +from ppdet.modeling.ops import get_act_fn +from ppdet.core.workspace import register, serializable +from ..shape_spec import ShapeSpec + +__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + act=None): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn = nn.BatchNorm2D( + ch_out, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class RepVggBlock(nn.Layer): + def __init__(self, ch_in, ch_out, act='relu', alpha=False): + super(RepVggBlock, self).__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.conv2 = ConvBNLayer( + ch_in, ch_out, 1, stride=1, padding=0, act=None) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + if alpha: + self.alpha = self.create_parameter( + shape=[1], + attr=ParamAttr(initializer=Constant(value=1.)), + dtype="float32") + else: + self.alpha = None + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + if self.alpha: + y = self.conv1(x) + self.alpha * self.conv2(x) + else: + y = self.conv1(x) + self.conv2(x) + y = self.act(y) + return y + + def 
convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2D( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.set_value(kernel) + self.conv.bias.set_value(bias) + self.__delattr__('conv1') + self.__delattr__('conv2') + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + self.alpha * bias1x1 + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + act='relu', + shortcut=True, + use_alpha=False): + super(BasicBlock, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return paddle.add(x, y) + else: + return y + + +class EffectiveSELayer(nn.Layer): + """ Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + """ + + def __init__(self, channels, act='hardsigmoid'): + 
super(EffectiveSELayer, self).__init__() + self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) + self.act = get_act_fn(act) if act is None or isinstance(act, ( + str, dict)) else act + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.act(x_se) + + +class CSPResStage(nn.Layer): + def __init__(self, + block_fn, + ch_in, + ch_out, + n, + stride, + act='relu', + attn='eca', + use_alpha=False): + super(CSPResStage, self).__init__() + + ch_mid = (ch_in + ch_out) // 2 + if stride == 2: + self.conv_down = ConvBNLayer( + ch_in, ch_mid, 3, stride=2, padding=1, act=act) + else: + self.conv_down = None + self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) + self.blocks = nn.Sequential(*[ + block_fn( + ch_mid // 2, + ch_mid // 2, + act=act, + shortcut=True, + use_alpha=use_alpha) for i in range(n) + ]) + if attn: + self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') + else: + self.attn = None + + self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = paddle.concat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@register +@serializable +class CSPResNet(nn.Layer): + __shared__ = ['width_mult', 'depth_mult', 'trt'] + + def __init__(self, + layers=[3, 6, 6, 3], + channels=[64, 128, 256, 512, 1024], + act='swish', + return_idx=[1, 2, 3], + depth_wise=False, + use_large_stem=False, + width_mult=1.0, + depth_mult=1.0, + trt=False, + use_checkpoint=False, + use_alpha=False, + **args): + super(CSPResNet, self).__init__() + self.use_checkpoint = use_checkpoint + channels = [max(round(c * width_mult), 1) for c in channels] + layers = [max(round(l * depth_mult), 1) for l in layers] + act = get_act_fn( + act, trt=trt) if act is None or isinstance(act, + (str, 
dict)) else act + + if use_large_stem: + self.stem = nn.Sequential( + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0] // 2, + 3, + stride=1, + padding=1, + act=act)), ('conv3', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))) + else: + self.stem = nn.Sequential( + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))) + + n = len(channels) - 1 + self.stages = nn.Sequential(*[(str(i), CSPResStage( + BasicBlock, + channels[i], + channels[i + 1], + layers[i], + 2, + act=act, + use_alpha=use_alpha)) for i in range(n)]) + + self._out_channels = channels[1:] + self._out_strides = [4 * 2**i for i in range(n)] + self.return_idx = return_idx + if use_checkpoint: + paddle.seed(0) + + def forward(self, inputs): + x = inputs['image'] + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + if self.use_checkpoint and self.training: + x = paddle.distributed.fleet.utils.recompute( + stage, x, **{"preserve_rng_state": True}) + else: + x = stage(x) + if idx in self.return_idx: + outs.append(x) + + return outs + + @property + def out_shape(self): + return [ + ShapeSpec( + channels=self._out_channels[i], stride=self._out_strides[i]) + for i in self.return_idx + ] diff --git a/rtdetr_paddle/ppdet/modeling/backbones/darknet.py b/rtdetr_paddle/ppdet/modeling/backbones/darknet.py new file mode 100755 index 0000000..c68c650 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/backbones/darknet.py @@ -0,0 +1,345 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register, serializable +from ppdet.modeling.ops import batch_norm, mish +from ..shape_spec import ShapeSpec + +__all__ = ['DarkNet', 'ConvBNLayer'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + norm_type='bn', + norm_decay=0., + act="leaky", + freeze_norm=False, + data_format='NCHW', + name=''): + """ + conv + bn + activation layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + stride (int): stride, default 1 + groups (int): number of groups of conv layer, default 1 + padding (int): padding size, default 0 + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ act (str): activation function type, default 'leaky', which means leaky_relu + freeze_norm (bool): whether to freeze norm, default False + data_format (str): data format, NCHW or NHWC + """ + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + data_format=data_format, + bias_attr=False) + self.batch_norm = batch_norm( + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.act = act + + def forward(self, inputs): + out = self.conv(inputs) + out = self.batch_norm(out) + if self.act == 'leaky': + out = F.leaky_relu(out, 0.1) + else: + out = getattr(F, self.act)(out) + return out + + +class DownSample(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=2, + padding=1, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + data_format='NCHW'): + """ + downsample layer + + Args: + ch_in (int): input channel + ch_out (int): output channel + filter_size (int): filter size, default 3 + stride (int): stride, default 2 + padding (int): padding size, default 1 + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ freeze_norm (bool): whether to freeze norm, default False + data_format (str): data format, NCHW or NHWC + """ + + super(DownSample, self).__init__() + + self.conv_bn_layer = ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.ch_out = ch_out + + def forward(self, inputs): + out = self.conv_bn_layer(inputs) + return out + + +class BasicBlock(nn.Layer): + def __init__(self, + ch_in, + ch_out, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + data_format='NCHW'): + """ + BasicBlock layer of DarkNet + + Args: + ch_in (int): input channel + ch_out (int): output channel + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + freeze_norm (bool): whether to freeze norm, default False + data_format (str): data format, NCHW or NHWC + """ + + super(BasicBlock, self).__init__() + + assert ch_in == ch_out and (ch_in % 2) == 0, \ + f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}" + # example: + # --------------{conv1} --> {conv2} + # channel route: 10-->5 --> 5-->10 + self.conv1 = ConvBNLayer( + ch_in=ch_in, + ch_out=int(ch_out / 2), + filter_size=1, + stride=1, + padding=0, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.conv2 = ConvBNLayer( + ch_in=int(ch_out / 2), + ch_out=ch_out, + filter_size=3, + stride=1, + padding=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + + def forward(self, inputs): + conv1 = self.conv1(inputs) + conv2 = self.conv2(conv1) + out = paddle.add(x=inputs, y=conv2) + return out + + +class Blocks(nn.Layer): + def __init__(self, + ch_in, + ch_out, + count, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + name=None, + 
data_format='NCHW'): + """ + Blocks layer, which consist of some BaickBlock layers + + Args: + ch_in (int): input channel + ch_out (int): output channel + count (int): number of BasicBlock layer + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. + freeze_norm (bool): whether to freeze norm, default False + name (str): layer name + data_format (str): data format, NCHW or NHWC + """ + super(Blocks, self).__init__() + + self.basicblock0 = BasicBlock( + ch_in, + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + self.res_out_list = [] + for i in range(1, count): + block_name = '{}.{}'.format(name, i) + res_out = self.add_sublayer( + block_name, + BasicBlock( + ch_out, + ch_out, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format)) + self.res_out_list.append(res_out) + self.ch_out = ch_out + + def forward(self, inputs): + y = self.basicblock0(inputs) + for basic_block_i in self.res_out_list: + y = basic_block_i(y) + return y + + +DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} + + +@register +@serializable +class DarkNet(nn.Layer): + __shared__ = ['norm_type', 'data_format'] + + def __init__(self, + depth=53, + freeze_at=-1, + return_idx=[2, 3, 4], + num_stages=5, + norm_type='bn', + norm_decay=0., + freeze_norm=False, + data_format='NCHW'): + """ + Darknet, see https://pjreddie.com/darknet/yolo/ + + Args: + depth (int): depth of network + freeze_at (int): freeze the backbone at which stage + filter_size (int): filter size, default 3 + return_idx (list): index of stages whose feature maps are returned + norm_type (str): batch norm type, default bn + norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
+ data_format (str): data format, NCHW or NHWC + """ + super(DarkNet, self).__init__() + self.depth = depth + self.freeze_at = freeze_at + self.return_idx = return_idx + self.num_stages = num_stages + self.stages = DarkNet_cfg[self.depth][0:num_stages] + + self.conv0 = ConvBNLayer( + ch_in=3, + ch_out=32, + filter_size=3, + stride=1, + padding=1, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + + self.downsample0 = DownSample( + ch_in=32, + ch_out=32 * 2, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format) + + self._out_channels = [] + self.darknet_conv_block_list = [] + self.downsample_list = [] + ch_in = [64, 128, 256, 512, 1024] + for i, stage in enumerate(self.stages): + name = 'stage.{}'.format(i) + conv_block = self.add_sublayer( + name, + Blocks( + int(ch_in[i]), + int(ch_in[i]), + stage, + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format, + name=name)) + self.darknet_conv_block_list.append(conv_block) + if i in return_idx: + self._out_channels.append(int(ch_in[i])) + for i in range(num_stages - 1): + down_name = 'stage.{}.downsample'.format(i) + downsample = self.add_sublayer( + down_name, + DownSample( + ch_in=int(ch_in[i]), + ch_out=int(ch_in[i + 1]), + norm_type=norm_type, + norm_decay=norm_decay, + freeze_norm=freeze_norm, + data_format=data_format)) + self.downsample_list.append(downsample) + + def forward(self, inputs): + x = inputs['image'] + + out = self.conv0(x) + out = self.downsample0(out) + blocks = [] + for i, conv_block_i in enumerate(self.darknet_conv_block_list): + out = conv_block_i(out) + if i == self.freeze_at: + out.stop_gradient = True + if i in self.return_idx: + blocks.append(out) + if i < self.num_stages - 1: + out = self.downsample_list[i](out) + return blocks + + @property + def out_shape(self): + return [ShapeSpec(channels=c) for c in self._out_channels] diff --git 
a/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py b/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py new file mode 100644 index 0000000..54c2877 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py @@ -0,0 +1,720 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py +""" +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.modeling.shape_spec import ShapeSpec +from ppdet.core.workspace import register, serializable +from .transformer_utils import DropPath, Identity +from .transformer_utils import add_parameter, to_2tuple +from .transformer_utils import ones_, zeros_, trunc_normal_ +from .swin_transformer import Mlp + +__all__ = ['FocalNet'] + +MODEL_cfg = { + 'focalnet_T_224_1k_srf': dict( + embed_dim=96, + depths=[2, 2, 6, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.2, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams', + ), + 'focalnet_S_224_1k_srf': dict( + embed_dim=96, + depths=[2, 2, 18, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.3, + use_conv_embed=False, + 
use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams', + ), + 'focalnet_B_224_1k_srf': dict( + embed_dim=128, + depths=[2, 2, 18, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams', + ), + 'focalnet_T_224_1k_lrf': dict( + embed_dim=96, + depths=[2, 2, 6, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.2, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams', + ), + 'focalnet_S_224_1k_lrf': dict( + embed_dim=96, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.3, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams', + ), + 'focalnet_B_224_1k_lrf': dict( + embed_dim=128, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=False, + use_postln=False, + use_postln_in_modulation=False, + use_layerscale=False, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams', + ), + 'focalnet_L_384_22k_fl3': dict( + embed_dim=192, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[5, 5, 5, 5], + drop_path_rate=0.5, + use_conv_embed=True, + 
use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams', + ), + 'focalnet_L_384_22k_fl4': dict( + embed_dim=192, + depths=[2, 2, 18, 2], + focal_levels=[4, 4, 4, 4], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=True, # + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams', + ), + 'focalnet_XL_384_22k_fl3': dict( + embed_dim=256, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[5, 5, 5, 5], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams', + ), + 'focalnet_XL_384_22k_fl4': dict( + embed_dim=256, + depths=[2, 2, 18, 2], + focal_levels=[4, 4, 4, 4], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=False, + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams', + ), + 'focalnet_H_224_22k_fl3': dict( + embed_dim=352, + depths=[2, 2, 18, 2], + focal_levels=[3, 3, 3, 3], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + use_conv_embed=True, + use_postln=True, + use_postln_in_modulation=True, # + use_layerscale=True, + normalize_modulator=False, + pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams', + ), + 'focalnet_H_224_22k_fl4': dict( + embed_dim=352, + depths=[2, 2, 18, 2], + focal_levels=[4, 4, 4, 4], + focal_windows=[3, 3, 3, 3], + drop_path_rate=0.5, + 
class FocalModulation(nn.Layer):
    """Focal modulation operator applied to channels-last feature maps.

    Args:
        dim (int): Number of input channels.
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        focal_factor (int): Step to increase the focal window. Default: 2
        use_postln_in_modulation (bool): Whether use post-modulation layernorm
        normalize_modulator (bool): Whether use normalize in modulator
    """

    def __init__(self,
                 dim,
                 proj_drop=0.,
                 focal_level=2,
                 focal_window=7,
                 focal_factor=2,
                 use_postln_in_modulation=False,
                 normalize_modulator=False):
        super().__init__()
        self.dim = dim

        # specific args for focalv3
        self.focal_level = focal_level
        self.focal_window = focal_window
        self.focal_factor = focal_factor
        self.use_postln_in_modulation = use_postln_in_modulation
        self.normalize_modulator = normalize_modulator

        # A single linear layer produces the query (dim), the context (dim)
        # and one gate per focal level plus one gate for the global context.
        fused_width = 2 * dim + (focal_level + 1)
        self.f = nn.Linear(dim, fused_width, bias_attr=True)
        self.h = nn.Conv2D(
            dim,
            dim,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            bias_attr=True)

        self.act = nn.GELU()
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.focal_layers = nn.LayerList()

        if use_postln_in_modulation:
            self.ln = nn.LayerNorm(dim)

        # Depthwise convolutions whose kernel grows by `focal_factor` per level.
        for level in range(focal_level):
            width = focal_factor * level + focal_window
            depthwise = nn.Conv2D(
                dim,
                dim,
                kernel_size=width,
                stride=1,
                groups=dim,
                padding=width // 2,
                bias_attr=False)
            self.focal_layers.append(nn.Sequential(depthwise, nn.GELU()))

    def forward(self, x):
        """ Forward function.

        Args:
            x: input features with shape of (B, H, W, C)

        Returns:
            Modulated features, transposed back to channels-last before the
            output projection and dropout.
        """
        _, _, _, channels = x.shape
        fused = self.f(x).transpose([0, 3, 1, 2])
        q, ctx, gates = paddle.split(
            fused, (channels, channels, self.focal_level + 1), 1)

        # Accumulate the gated context of every focal level; each level
        # consumes the output of the previous one.
        modulator = 0
        for level, focal_layer in enumerate(self.focal_layers):
            ctx = focal_layer(ctx)
            modulator = modulator + ctx * gates[:, level:level + 1]

        # The last gate weights the spatially averaged (global) context.
        pooled = ctx.mean(2, keepdim=True).mean(3, keepdim=True)
        modulator = modulator + self.act(pooled) * gates[:, self.focal_level:]
        if self.normalize_modulator:
            modulator = modulator / (self.focal_level + 1)

        out = (q * self.h(modulator)).transpose([0, 2, 3, 1])
        if self.use_postln_in_modulation:
            out = self.ln(out)
        return self.proj_drop(self.proj(out))
class BasicLayer(nn.Layer):
    """ A basic focal modulation layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
            Only stored on the instance here; this class does not read it.
    """

    def __init__(self,
                 dim,
                 depth,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 focal_level=2,
                 focal_window=9,
                 use_conv_embed=False,
                 use_layerscale=False,
                 layerscale_value=1e-4,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_checkpoint=False):
        super().__init__()
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # A numpy array supplies one stochastic-depth rate per block; any
        # other value is shared by all blocks.
        per_block_rates = isinstance(drop_path, np.ndarray)

        # build blocks
        stage_blocks = []
        for idx in range(depth):
            rate = drop_path[idx] if per_block_rates else drop_path
            stage_blocks.append(
                FocalModulationBlock(
                    dim=dim,
                    mlp_ratio=mlp_ratio,
                    drop=drop,
                    drop_path=rate,
                    act_layer=nn.GELU,
                    norm_layer=norm_layer,
                    focal_level=focal_level,
                    focal_window=focal_window,
                    use_postln=use_postln,
                    use_postln_in_modulation=use_postln_in_modulation,
                    normalize_modulator=normalize_modulator,
                    use_layerscale=use_layerscale,
                    layerscale_value=layerscale_value))
        self.blocks = nn.LayerList(stage_blocks)

        # patch merging layer: doubles the channel count
        if downsample is None:
            self.downsample = None
        else:
            self.downsample = downsample(
                patch_size=2,
                in_chans=dim,
                embed_dim=2 * dim,
                use_conv_embed=use_conv_embed,
                norm_layer=norm_layer,
                is_stem=False)

    def forward(self, x, H, W):
        """
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial size of the feature map that `x` flattens.

        Returns:
            Tuple `(x, H, W, x_down, Wh, Ww)`: the stage output with its
            spatial size, followed by the downsampled tokens with their
            spatial size. Without a downsample layer the second triple
            repeats the first.
        """
        for block in self.blocks:
            # Blocks read the spatial size from attributes, not arguments.
            block.H, block.W = H, W
            x = block(x)

        if self.downsample is None:
            return x, H, W, x, H, W

        batch, channels = x.shape[0], x.shape[-1]
        feature_map = x.transpose([0, 2, 1]).reshape([batch, channels, H, W])
        x_down = self.downsample(feature_map)
        x_down = x_down.flatten(2).transpose([0, 2, 1])
        Wh, Ww = (H + 1) // 2, (W + 1) // 2
        return x, H, W, x_down, Wh, Ww
@register
@serializable
class FocalNet(nn.Layer):
    """ FocalNet backbone

    Note:
        `embed_dim`, `depths`, `drop_path_rate`, `focal_levels`,
        `focal_windows`, `use_conv_embed`, `use_layerscale`, `use_postln`,
        `use_postln_in_modulation` and `normalize_modulator` are always read
        from `MODEL_cfg[arch]`; the same-named constructor arguments are
        overridden. `pretrained` falls back to the `MODEL_cfg[arch]` entry
        only when it is None.

    Args:
        arch (str): Architecture of FocalNet
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each FocalNet Transformer stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Dropout rate.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        focal_levels (Sequence[int]): Number of focal levels at four stages
        focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        pretrained (str | None): URL (any string containing 'http') or local
            path of weights to load.
    """

    def __init__(
            self,
            arch='focalnet_T_224_1k_srf',
            out_indices=(0, 1, 2, 3),
            frozen_stages=-1,
            patch_size=4,
            in_chans=3,
            embed_dim=96,
            depths=[2, 2, 6, 2],
            mlp_ratio=4.,
            drop_rate=0.,
            drop_path_rate=0.2,  # 0.5 better for large+ models
            norm_layer=nn.LayerNorm,
            patch_norm=True,
            focal_levels=[2, 2, 2, 2],
            focal_windows=[3, 3, 3, 3],
            use_conv_embed=False,
            use_layerscale=False,
            layerscale_value=1e-4,
            use_postln=False,
            use_postln_in_modulation=False,
            normalize_modulator=False,
            use_checkpoint=False,
            pretrained=None):
        super(FocalNet, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The architecture table is authoritative for these hyper-parameters.
        arch_cfg = MODEL_cfg[arch]
        embed_dim = arch_cfg['embed_dim']
        depths = arch_cfg['depths']
        drop_path_rate = arch_cfg['drop_path_rate']
        focal_levels = arch_cfg['focal_levels']
        focal_windows = arch_cfg['focal_windows']
        use_conv_embed = arch_cfg['use_conv_embed']
        use_layerscale = arch_cfg['use_layerscale']
        use_postln = arch_cfg['use_postln']
        use_postln_in_modulation = arch_cfg['use_postln_in_modulation']
        normalize_modulator = arch_cfg['normalize_modulator']
        if pretrained is None:
            pretrained = arch_cfg['pretrained']

        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.num_layers = len(depths)
        self.patch_norm = patch_norm

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
            use_conv_embed=use_conv_embed,
            is_stem=True)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: one rate per block, linearly spaced
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        # channel width doubles at every stage
        stage_widths = [
            int(embed_dim * 2**idx) for idx in range(self.num_layers)
        ]

        # build layers
        self.layers = nn.LayerList()
        first_block = 0
        last_stage = self.num_layers - 1
        for idx in range(self.num_layers):
            next_first_block = first_block + depths[idx]
            stage = BasicLayer(
                dim=stage_widths[idx],
                depth=depths[idx],
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[first_block:next_first_block],
                norm_layer=norm_layer,
                downsample=PatchEmbed if idx < last_stage else None,
                focal_level=focal_levels[idx],
                focal_window=focal_windows[idx],
                use_conv_embed=use_conv_embed,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value,
                use_postln=use_postln,
                use_postln_in_modulation=use_postln_in_modulation,
                normalize_modulator=normalize_modulator,
                use_checkpoint=use_checkpoint)
            self.layers.append(stage)
            first_block = next_first_block

        self.num_features = stage_widths

        # add a norm layer for each output
        for idx in out_indices:
            self.add_sublayer(f'norm{idx}', norm_layer(stage_widths[idx]))

        self.apply(self._init_weights)
        self._freeze_stages()
        if pretrained:
            if 'http' in pretrained:  # URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  # model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _freeze_stages(self):
        """Put the frozen parts in eval mode and stop their gradients.

        `frozen_stages >= 0` freezes the patch embedding; `frozen_stages >= 2`
        also freezes `layers[0 .. frozen_stages - 2]`.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.stop_gradient = True

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for idx in range(0, self.frozen_stages - 1):
                stage = self.layers[idx]
                stage.eval()
                for param in stage.parameters():
                    param.stop_gradient = True

    def _init_weights(self, m):
        """Initialise Linear and LayerNorm sublayers (used with `apply`)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Run the backbone on `x['image']`.

        Returns:
            One normalised feature map per stage listed in `out_indices`,
            each transposed to channels-first.
        """
        x = self.patch_embed(x['image'])
        _, _, Wh, Ww = x.shape
        x = self.pos_drop(x.flatten(2).transpose([0, 2, 1]))

        outs = []
        for idx in range(self.num_layers):
            x_out, H, W, x, Wh, Ww = self.layers[idx](x, Wh, Ww)
            if idx not in self.out_indices:
                continue
            x_out = getattr(self, f'norm{idx}')(x_out)
            feature = x_out.reshape([-1, H, W, self.num_features[idx]])
            outs.append(feature.transpose((0, 3, 1, 2)))

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels, stride) of every stage in `out_indices`."""
        out_strides = [4, 8, 16, 32]
        return [
            ShapeSpec(
                channels=self.num_features[idx], stride=out_strides[idx])
            for idx in self.out_indices
        ]
class LearnableAffineBlock(nn.Layer):
    """Learnable scalar affine transform: `scale * x + bias`.

    Args:
        scale_value (float): Initial value of the scale parameter.
        bias_value (float): Initial value of the bias parameter.
        lr_mult (float): Learning-rate multiplier of the owning stage.
        lab_lr (float): Extra learning-rate factor for this block; both
            parameters train with `lr_mult * lab_lr`.
    """

    def __init__(self,
                 scale_value=1.0,
                 bias_value=0.0,
                 lr_mult=1.0,
                 lab_lr=0.01):
        super().__init__()
        learning_rate = lr_mult * lab_lr

        self.scale = self.create_parameter(
            shape=[1, ],
            default_initializer=Constant(value=scale_value),
            attr=ParamAttr(learning_rate=learning_rate))
        self.add_parameter("scale", self.scale)

        self.bias = self.create_parameter(
            shape=[1, ],
            default_initializer=Constant(value=bias_value),
            attr=ParamAttr(learning_rate=learning_rate))
        self.add_parameter("bias", self.bias)

    def forward(self, x):
        """Scale `x` and shift it by the learned bias."""
        scaled = self.scale * x
        return scaled + self.bias
class HG_Block(nn.Layer):
    """Stack of conv layers whose outputs are concatenated and re-projected.

    The block input and the output of every stacked layer are concatenated
    along the channel axis, then reduced by two 1x1 convolutions
    (`total -> out_channels // 2 -> out_channels`).

    Args:
        in_channels (int): Channels of the block input.
        mid_channels (int): Output channels of every stacked layer.
        out_channels (int): Channels of the block output.
        kernel_size (int): Kernel size of the stacked layers.
        layer_num (int): Number of stacked layers.
        identity (bool): Add the block input to the output (residual).
        light_block (bool): Use `LightConvBNAct` instead of `ConvBNAct` for
            the stacked layers.
        use_lab (bool): Forwarded to every conv layer.
        lr_mult (float): Learning-rate multiplier forwarded to every layer.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 layer_num=6,
                 identity=False,
                 light_block=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.identity = identity

        # Pick the layer class directly instead of eval()-ing its name.
        block_cls = LightConvBNAct if light_block else ConvBNAct

        self.layers = nn.LayerList()
        for i in range(layer_num):
            self.layers.append(
                block_cls(
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult))

        # feature aggregation: the input plus every intermediate output
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)

    def forward(self, x):
        """Return the aggregated features, plus the input when `identity`."""
        identity = x
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = paddle.concat(output, axis=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x += identity
        return x
@register
@serializable
class PPHGNetV2(nn.Layer):
    """
    PPHGNetV2

    Args:
        arch: str. Key into `arch_configs` ('L' or 'X').
        use_lab: boolean. Whether to use LearnableAffineBlock in network.
        lr_mult_list: list. Control the learning rate of different stages;
            index 0 is the stem, index i + 1 is stage i.
        return_idx: list. Indices of the stages whose outputs are returned.
        freeze_stem_only: boolean. When False, stages 0..freeze_at are frozen
            in addition to the stem.
        freeze_at: int. The stem is frozen when this is >= 0.
        freeze_norm: boolean. Replace every BatchNorm2D via `reset_bn`.
    Returns:
        model: nn.Layer. Specific PPHGNetV2 model depends on args.
    """

    arch_configs = {
        'L': {
            'stem_channels': [3, 32, 48],
            'stage_config': {
                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
                "stage1": [48, 48, 128, 1, False, False, 3, 6],
                "stage2": [128, 96, 512, 1, True, False, 3, 6],
                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
            }
        },
        'X': {
            'stem_channels': [3, 32, 64],
            'stage_config': {
                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
                "stage1": [64, 64, 128, 1, False, False, 3, 6],
                "stage2": [128, 128, 512, 2, True, False, 3, 6],
                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
            }
        }
    }

    def __init__(self,
                 arch,
                 use_lab=False,
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
                 return_idx=[1, 2, 3],
                 freeze_stem_only=True,
                 freeze_at=0,
                 freeze_norm=True):
        super().__init__()
        self.use_lab = use_lab
        self.return_idx = return_idx

        config = self.arch_configs[arch]
        stem_in, stem_mid, stem_out = config['stem_channels'][:3]
        stage_specs = list(config['stage_config'].values())

        self._out_strides = [4, 8, 16, 32]
        self._out_channels = [spec[2] for spec in stage_specs]

        # stem
        self.stem = StemBlock(
            in_channels=stem_in,
            mid_channels=stem_mid,
            out_channels=stem_out,
            use_lab=use_lab,
            lr_mult=lr_mult_list[0])

        # stages
        self.stages = nn.LayerList()
        for idx, spec in enumerate(stage_specs):
            (in_channels, mid_channels, out_channels, block_num, downsample,
             light_block, kernel_size, layer_num) = spec
            self.stages.append(
                HG_Stage(
                    in_channels,
                    mid_channels,
                    out_channels,
                    block_num,
                    layer_num=layer_num,
                    downsample=downsample,
                    light_block=light_block,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult_list[idx + 1]))

        if freeze_at >= 0:
            self._freeze_parameters(self.stem)
            if not freeze_stem_only:
                frozen_count = min(freeze_at + 1, len(self.stages))
                for idx in range(frozen_count):
                    self._freeze_parameters(self.stages[idx])

        if freeze_norm:
            reset_bn(self, reset_func=_freeze_norm)

        self._init_weights()

    def _freeze_parameters(self, m):
        """Stop gradients for every parameter of `m`."""
        for param in m.parameters():
            param.stop_gradient = True

    def _init_weights(self):
        """Initialise Conv2D, BatchNorm2D and Linear sublayers."""
        for layer in self.sublayers():
            if isinstance(layer, nn.Conv2D):
                kaiming_normal_(layer.weight)
            elif isinstance(layer, (nn.BatchNorm2D)):
                ones_(layer.weight)
                zeros_(layer.bias)
            elif isinstance(layer, nn.Linear):
                zeros_(layer.bias)

    @property
    def out_shape(self):
        """ShapeSpec (channels, stride) of every stage in `return_idx`."""
        return [
            ShapeSpec(
                channels=self._out_channels[idx],
                stride=self._out_strides[idx]) for idx in self.return_idx
        ]

    def forward(self, inputs):
        """Run stem and stages on `inputs['image']`.

        Returns:
            Outputs of the stages listed in `return_idx`, in stage order.
        """
        x = self.stem(inputs['image'])
        outs = []
        for idx, stage in enumerate(self.stages):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs
def make_divisible(v, divisor=8, min_value=None):
    """Round `v` to a multiple of `divisor`, never below `min_value`.

    Args:
        v: Value to round (e.g. a scaled channel count).
        divisor: The result is a multiple of this unless `min_value` wins.
        min_value: Lower bound of the result; defaults to `divisor`.

    Returns:
        The rounded value, bumped up by one `divisor` when rounding would
        otherwise drop it below 90% of `v`.
    """
    lower_bound = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    result = rounded if rounded > lower_bound else lower_bound
    # Rounding down must not remove more than 10% of the requested value.
    if result < 0.9 * v:
        result += divisor
    return result
@register
@serializable
class LCNet(nn.Layer):
    """LCNet backbone built from the stage table `NET_CONFIG`.

    Args:
        scale (float): Width multiplier; every channel count in `NET_CONFIG`
            is multiplied by it and rounded with `make_divisible`.
        feature_maps (list[int]): Which stage outputs to return. Id `n`
            selects the output of `blocks{n + 1}`: 2 is `blocks3` and 5 is
            `blocks6`, so the default returns `blocks4`, `blocks5`, `blocks6`.
        act (str): Activation name forwarded to every conv layer.
    """

    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
        super().__init__()
        self.scale = scale
        self.feature_maps = feature_maps

        def build_stage(key):
            # One DepthwiseSeparable per row (k, in_c, out_c, s, use_se).
            return nn.Sequential(* [
                DepthwiseSeparable(
                    num_channels=make_divisible(in_c * scale),
                    num_filters=make_divisible(out_c * scale),
                    dw_size=k,
                    stride=s,
                    use_se=se,
                    act=act) for k, in_c, out_c, s, se in NET_CONFIG[key]
            ])

        def stage_width(key):
            # Scaled output channels of the last row of the stage.
            return make_divisible(NET_CONFIG[key][-1][2] * scale)

        self.conv1 = ConvBNLayer(
            num_channels=3,
            filter_size=3,
            num_filters=make_divisible(16 * scale),
            stride=2,
            act=act)

        self.blocks2 = build_stage("blocks2")
        self.blocks3 = build_stage("blocks3")
        self.blocks4 = build_stage("blocks4")
        self.blocks5 = build_stage("blocks5")
        self.blocks6 = build_stage("blocks6")

        # Only blocks3..blocks6 can be returned as feature maps.
        candidate_widths = [
            stage_width(key)
            for key in ("blocks3", "blocks4", "blocks5", "blocks6")
        ]
        self._out_channels = [
            width for idx, width in enumerate(candidate_widths)
            if idx + 2 in feature_maps
        ]

    def forward(self, inputs):
        """Return the stage outputs selected by `feature_maps`."""
        x = self.blocks2(self.conv1(inputs['image']))

        stage_outputs = []
        for stage in (self.blocks3, self.blocks4, self.blocks5, self.blocks6):
            x = stage(x)
            stage_outputs.append(x)

        return [
            feat for idx, feat in enumerate(stage_outputs)
            if idx + 2 in self.feature_maps
        ]

    @property
    def out_shape(self):
        """ShapeSpec (channels only) of every returned feature map."""
        return [ShapeSpec(channels=width) for width in self._out_channels]
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec

__all__ = ['MobileNet']

# Layout of the depthwise-separable trunk, in registration order:
#   (sublayer name, base channels, base pointwise output channels, stride)
# "base channels" is used for the block input, the depthwise output and the
# depthwise group count; every value is multiplied by `scale` when the layer
# is built. The "# 1/N" markers are kept from the original code.
_DEPTHWISE_BLOCKS = (
    ("conv2_1", 32, 64, 1),
    ("conv2_2", 64, 128, 2),
    # 1/4
    ("conv3_1", 128, 128, 1),
    ("conv3_2", 128, 256, 2),
    # 1/8
    ("conv4_1", 256, 256, 1),
    ("conv4_2", 256, 512, 2),
    # 1/16
    ("conv5_1", 512, 512, 1),
    ("conv5_2", 512, 512, 1),
    ("conv5_3", 512, 512, 1),
    ("conv5_4", 512, 512, 1),
    ("conv5_5", 512, 512, 1),
    ("conv5_6", 512, 1024, 2),
    # 1/32
    ("conv6", 1024, 1024, 1),
)

# Base (unscaled) channel count produced by the last trunk block ("conv6").
_TRUNK_OUT_CHANNELS = 1024


class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D followed by BatchNorm2D and an optional activation.

    Args:
        in_channels: input channels of the convolution.
        out_channels: output channels of the convolution / batch norm.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
        num_groups: convolution groups.
        act: 'relu' or 'relu6'; any other value applies no activation.
        conv_lr: learning-rate multiplier of the convolution weight.
        conv_decay: L2 decay coefficient of the convolution weight.
        norm_decay: L2 decay coefficient of the batch-norm scale and bias.
        norm_type: 'bn' or 'sync_bn' (both build ``nn.BatchNorm2D`` here).
        name: accepted for API compatibility; not used by this layer.

    Raises:
        ValueError: if ``norm_type`` is neither 'bn' nor 'sync_bn'.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 num_groups=1,
                 act='relu',
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ConvBNLayer, self).__init__()
        # Fail at construction time: previously an unsupported norm_type left
        # `_batch_norm` undefined and only surfaced as an AttributeError in
        # forward().
        if norm_type not in ('sync_bn', 'bn'):
            raise ValueError(
                "norm_type should be 'bn' or 'sync_bn', but got {}".format(
                    norm_type))

        self.act = act
        self._conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=ParamAttr(
                learning_rate=conv_lr,
                initializer=KaimingNormal(),
                regularizer=L2Decay(conv_decay)),
            bias_attr=False)

        self._batch_norm = nn.BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(norm_decay)),
            bias_attr=ParamAttr(regularizer=L2Decay(norm_decay)))

    def forward(self, x):
        x = self._batch_norm(self._conv(x))
        if self.act == "relu":
            return F.relu(x)
        if self.act == "relu6":
            return F.relu6(x)
        return x


class DepthwiseSeparable(nn.Layer):
    """3x3 grouped ConvBNLayer followed by a 1x1 pointwise ConvBNLayer.

    ``out_channels1``, ``out_channels2`` and ``num_groups`` are base values;
    each is multiplied by ``scale`` and truncated with ``int``.
    ``in_channels`` is used as given.

    Args:
        in_channels: input channels (already scaled by the caller).
        out_channels1: base output channels of the 3x3 grouped convolution.
        out_channels2: base output channels of the 1x1 convolution.
        num_groups: base group count of the 3x3 convolution.
        stride: stride of the 3x3 convolution.
        scale: channel multiplier.
        conv_lr, conv_decay, norm_decay, norm_type: forwarded to ConvBNLayer.
        name: prefix for the two child layer names; ``None`` is treated as "".
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups,
                 stride,
                 scale,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(DepthwiseSeparable, self).__init__()
        # The declared default is None, which used to raise TypeError in
        # `name + "_dw"`.
        prefix = "" if name is None else name
        mid_channels = int(out_channels1 * scale)

        self._depthwise_conv = ConvBNLayer(
            in_channels,
            mid_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups * scale),
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_dw")

        self._pointwise_conv = ConvBNLayer(
            mid_channels,
            int(out_channels2 * scale),
            kernel_size=1,
            stride=1,
            padding=0,
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_sep")

    def forward(self, x):
        return self._pointwise_conv(self._depthwise_conv(x))


class ExtraBlock(nn.Layer):
    """1x1 ConvBNLayer followed by a strided 3x3 ConvBNLayer, both ReLU6.

    Args:
        in_channels: input channels of the 1x1 convolution.
        out_channels1: output channels of the 1x1 convolution.
        out_channels2: output channels of the 3x3 convolution.
        num_groups: group count used by both convolutions.
        stride: stride of the 3x3 convolution.
        conv_lr, conv_decay, norm_decay, norm_type: forwarded to ConvBNLayer.
        name: prefix for the two child layer names; ``None`` is treated as "".
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups=1,
                 stride=2,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ExtraBlock, self).__init__()
        # Same None-default problem as DepthwiseSeparable.
        prefix = "" if name is None else name

        self.pointwise_conv = ConvBNLayer(
            in_channels,
            int(out_channels1),
            kernel_size=1,
            stride=1,
            padding=0,
            num_groups=int(num_groups),
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_extra1")

        self.normal_conv = ConvBNLayer(
            int(out_channels1),
            int(out_channels2),
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups),
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_extra2")

    def forward(self, x):
        return self.normal_conv(self.pointwise_conv(x))


@register
@serializable
class MobileNet(nn.Layer):
    """MobileNet (v1) backbone built from depthwise-separable blocks.

    The trunk is ``conv1`` followed by the 13 blocks of ``_DEPTHWISE_BLOCKS``;
    when ``with_extra_blocks`` is set, one ``ExtraBlock`` per entry of
    ``extra_block_filters`` is appended after the trunk.

    Args:
        norm_type: 'bn' or 'sync_bn'.
        norm_decay: L2 decay coefficient for batch-norm parameters.
        conv_decay: L2 decay coefficient for convolution weights.
        scale: channel multiplier applied to every trunk layer.
        conv_learning_rate: learning-rate multiplier for convolution weights.
        feature_maps: 1-based indices of the blocks whose outputs are
            returned; trunk blocks are numbered 1..13 and extra blocks
            continue from 14. A single int is accepted as well.
        with_extra_blocks: whether to append the extra blocks.
        extra_block_filters: ``[out_channels1, out_channels2]`` per extra
            block.
    """
    __shared__ = ['norm_type']

    # NOTE: the list defaults are never mutated here; they are kept as in the
    # original signature so the registered config defaults stay unchanged.
    def __init__(self,
                 norm_type='bn',
                 norm_decay=0.,
                 conv_decay=0.,
                 scale=1,
                 conv_learning_rate=1.0,
                 feature_maps=[4, 6, 13],
                 with_extra_blocks=False,
                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
                                      [64, 128]]):
        super(MobileNet, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        self._out_channels = []

        self.conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=int(32 * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            conv_lr=conv_learning_rate,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name="conv1")

        # Trunk: same sublayer names and registration order as the original
        # hand-unrolled code, so parameter names are unchanged.
        self.dwsl = []
        for layer_name, channels, out_channels2, stride in _DEPTHWISE_BLOCKS:
            block = self.add_sublayer(
                layer_name,
                sublayer=DepthwiseSeparable(
                    in_channels=int(channels * scale),
                    out_channels1=channels,
                    out_channels2=out_channels2,
                    num_groups=channels,
                    stride=stride,
                    scale=scale,
                    conv_lr=conv_learning_rate,
                    conv_decay=conv_decay,
                    norm_decay=norm_decay,
                    norm_type=norm_type,
                    name=layer_name))
            self.dwsl.append(block)
            self._update_out_channels(
                int(out_channels2 * scale), len(self.dwsl), feature_maps)

        if self.with_extra_blocks:
            self.extra_blocks = []
            # The first extra block consumes the output of "conv6", whose
            # width is scaled; the previous hard-coded 1024 only matched for
            # scale == 1.
            in_c = int(_TRUNK_OUT_CHANNELS * scale)
            for i, block_filter in enumerate(self.extra_block_filters):
                layer_name = "conv7_" + str(i + 1)
                conv_extra = self.add_sublayer(
                    layer_name,
                    sublayer=ExtraBlock(
                        in_c,
                        block_filter[0],
                        block_filter[1],
                        conv_lr=conv_learning_rate,
                        conv_decay=conv_decay,
                        norm_decay=norm_decay,
                        norm_type=norm_type,
                        name=layer_name))
                self.extra_blocks.append(conv_extra)
                self._update_out_channels(
                    block_filter[1],
                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
                in_c = block_filter[1]

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when block ``feature_idx`` is a selected output."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: outputs of the blocks selected by ``feature_maps``, in
            network order.
        """
        outs = []
        y = self.conv1(inputs['image'])
        for i, block in enumerate(self.dwsl):
            y = block(y)
            if i + 1 in self.feature_maps:
                outs.append(y)

        if not self.with_extra_blocks:
            return outs

        # `y` already holds the output of the last trunk block. The original
        # code restarted from `outs[-1]`, which raised IndexError when no
        # trunk index was selected and fed the wrong tensor when the last
        # trunk block was not in `feature_maps`.
        for i, block in enumerate(self.extra_blocks):
            idx = i + len(self.dwsl)
            y = block(y)
            if idx + 1 in self.feature_maps:
                outs.append(y)
        return outs

    @property
    def out_shape(self):
        return [ShapeSpec(channels=c) for c in self._out_channels]


# ---------------------------------------------------------------------------
# Remainder of the original patch line, carried over unchanged as comments:
# header of the next file in the commit and the first lines of its license
# (the license continues on the following line of the dump).
#
#   diff --git a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
#   new file mode 100644
#   index 0000000..2bd8856
#   --- /dev/null
#   +++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
#   @@ -0,0 +1,478 @@
#
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec

__all__ = ['MobileNetV3']


def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a nearby multiple of ``divisor``.

    The result is at least ``min_value`` (``divisor`` when not given) and is
    bumped up by one ``divisor`` if rounding would fall below 90% of ``v``.
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D followed by BatchNorm2D and an optional activation.

    Args:
        in_c, out_c: input / output channels.
        filter_size, stride, padding, num_groups: convolution settings.
        act: None, 'relu', 'relu6' or 'hard_swish'.
        lr_mult: learning-rate multiplier of the convolution weight (and of
            the batch-norm parameters unless ``freeze_norm``).
        conv_decay: L2 decay coefficient of the convolution weight.
        norm_type: 'bn' or 'sync_bn' (both build ``nn.BatchNorm2D`` here).
        norm_decay: L2 decay coefficient of the batch-norm parameters.
        freeze_norm: make the batch-norm parameters non-trainable and use
            global statistics.
        name: accepted for API compatibility; not used by this layer.

    Raises:
        ValueError: if ``norm_type`` is neither 'bn' nor 'sync_bn'.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 filter_size,
                 stride,
                 padding,
                 num_groups=1,
                 act=None,
                 lr_mult=1.,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=""):
        super(ConvBNLayer, self).__init__()
        # Previously an unsupported norm_type hit a NameError on
        # `norm_params` (freeze_norm=True) or left `self.bn` undefined until
        # forward(); report it clearly instead.
        if norm_type not in ('sync_bn', 'bn'):
            raise ValueError(
                "norm_type should be 'bn' or 'sync_bn', but got {}".format(
                    norm_type))

        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
            bias_attr=False)

        norm_lr = 0. if freeze_norm else lr_mult
        trainable = not freeze_norm
        param_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        bias_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        global_stats = True if freeze_norm else None
        self.bn = nn.BatchNorm2D(
            out_c,
            weight_attr=param_attr,
            bias_attr=bias_attr,
            use_global_stats=global_stats)
        if freeze_norm:
            for param in self.bn.parameters():
                param.stop_gradient = True

    def forward(self, x):
        x = self.bn(self.conv(x))
        if self.act is None:
            return x
        if self.act == "relu":
            return F.relu(x)
        if self.act == "relu6":
            return F.relu6(x)
        if self.act == "hard_swish":
            return F.hardswish(x)
        raise NotImplementedError(
            "The activation function is selected incorrectly.")


class ResidualUnit(nn.Layer):
    """1x1 expand -> grouped conv -> optional SE -> 1x1 linear projection.

    The input is added back when ``stride == 1`` and ``in_c == out_c``.

    Args:
        in_c, mid_c, out_c: input, expanded and output channels.
        filter_size, stride: settings of the grouped (``num_groups=mid_c``)
            convolution.
        use_se: insert an ``SEModule`` after the grouped convolution.
        lr_mult, conv_decay, norm_type, norm_decay, freeze_norm: forwarded to
            the ConvBNLayers.
        act: activation of the expand and grouped convolutions.
        return_list: when True, ``forward`` returns
            ``[expand_output, block_output]`` instead of the block output.
        name: prefix for the child layer names.
    """

    def __init__(self,
                 in_c,
                 mid_c,
                 out_c,
                 filter_size,
                 stride,
                 use_se,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 act=None,
                 return_list=False,
                 name=''):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_c == out_c
        self.use_se = use_se
        self.return_list = return_list

        self.expand_conv = ConvBNLayer(
            in_c=in_c,
            out_c=mid_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=act,
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=name + "_expand")
        self.bottleneck_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=mid_c,
            filter_size=filter_size,
            stride=stride,
            padding=int((filter_size - 1) // 2),
            num_groups=mid_c,
            act=act,
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=name + "_depthwise")
        if self.use_se:
            self.mid_se = SEModule(
                mid_c, lr_mult, conv_decay, name=name + "_se")
        self.linear_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=out_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=None,
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=name + "_linear")

    def forward(self, inputs):
        y = self.expand_conv(inputs)
        x = self.bottleneck_conv(y)
        if self.use_se:
            x = self.mid_se(x)
        x = self.linear_conv(x)
        if self.if_shortcut:
            x = paddle.add(inputs, x)
        if self.return_list:
            return [y, x]
        return x


class SEModule(nn.Layer):
    """Squeeze-and-excitation gate: pool -> 1x1 conv -> ReLU -> 1x1 conv ->
    hard-sigmoid, multiplied onto the input.

    Args:
        channel: input/output channels.
        lr_mult: learning-rate multiplier of both convolutions.
        conv_decay: L2 decay coefficient of both convolutions.
        reduction: the hidden width is ``channel // reduction``.
        name: accepted for API compatibility; not used by this layer.
    """

    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        mid_channels = int(channel // reduction)
        self.conv1 = nn.Conv2D(
            in_channels=channel,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
            bias_attr=ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
        self.conv2 = nn.Conv2D(
            in_channels=mid_channels,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
            bias_attr=ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
        return paddle.multiply(x=inputs, y=outputs)


class ExtraBlockDW(nn.Layer):
    """Extra block: 1x1 conv -> strided 3x3 grouped conv -> 1x1 conv, all
    ConvBNLayers with ReLU6 and 'SAME' padding.

    Args:
        in_c: input channels.
        ch_1: output channels of the first 1x1 convolution (also the group
            count of the 3x3 convolution).
        ch_2: output channels of the 3x3 and the final 1x1 convolution.
        stride: stride of the 3x3 convolution.
        lr_mult, conv_decay, norm_type, norm_decay, freeze_norm: forwarded to
            the ConvBNLayers.
        name: prefix for the child layer names; ``None`` is treated as "".
    """

    def __init__(self,
                 in_c,
                 ch_1,
                 ch_2,
                 stride,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None):
        super(ExtraBlockDW, self).__init__()
        # The declared default is None, which used to raise TypeError in
        # `name + "_extra1"`.
        prefix = "" if name is None else name
        self.pointwise_conv = ConvBNLayer(
            in_c=in_c,
            out_c=ch_1,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra1")
        self.depthwise_conv = ConvBNLayer(
            in_c=ch_1,
            out_c=ch_2,
            filter_size=3,
            stride=stride,
            padding='SAME',
            num_groups=int(ch_1),
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_dw")
        self.normal_conv = ConvBNLayer(
            in_c=ch_2,
            out_c=ch_2,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_sep")

    def forward(self, inputs):
        x = self.pointwise_conv(inputs)
        x = self.depthwise_conv(x)
        x = self.normal_conv(x)
        return x


@register
@serializable
class MobileNetV3(nn.Layer):
    """MobileNetV3 backbone ("large" or "small" configuration).

    Layers are numbered from 1: ``conv1`` is layer 1, the residual units are
    layers 2.. (sublayers "conv2", "conv3", ...), and the optional extra
    blocks continue the numbering. ``feature_maps`` selects outputs by these
    numbers.

    Args:
        scale: channel multiplier (results are rounded by ``make_divisible``).
        model_name: "large" or "small".
        feature_maps: layer numbers whose outputs are returned; a single int
            is accepted as well.
        with_extra_blocks: append a 1x1 ConvBNLayer and one ``ExtraBlockDW``
            per entry of ``extra_block_filters``. In this mode a selected
            residual unit contributes the output of its expand convolution.
        extra_block_filters: ``[ch_1, ch_2]`` per extra block.
        lr_mult_list: learning-rate multipliers; residual unit ``i``
            (0-based) uses entry ``min(i // 3, len(lr_mult_list) - 1)``.
        conv_decay: L2 decay coefficient for convolution weights.
        multiplier: extra channel multiplier applied to the last three
            configuration rows.
        norm_type: 'bn' or 'sync_bn'.
        norm_decay: L2 decay coefficient for batch-norm parameters.
        freeze_norm: freeze batch-norm parameters and statistics.

    Raises:
        ValueError: if ``norm_type == 'sync_bn'`` together with
            ``freeze_norm``.
        NotImplementedError: for an unknown ``model_name``.
    """
    __shared__ = ['norm_type']

    # NOTE: the list defaults are never mutated here; they are kept as in the
    # original signature so the registered config defaults stay unchanged.
    def __init__(
            self,
            scale=1.0,
            model_name="large",
            feature_maps=[6, 12, 15],
            with_extra_blocks=False,
            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
            conv_decay=0.0,
            multiplier=1.0,
            norm_type='bn',
            norm_decay=0.0,
            freeze_norm=False):
        super(MobileNetV3, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        if norm_type == 'sync_bn' and freeze_norm:
            raise ValueError(
                "The norm_type should not be sync_bn when freeze_norm is True")
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        inplanes = 16
        if model_name == "large":
            self.cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, "relu", 1],
                [3, 64, 24, False, "relu", 2],
                [3, 72, 24, False, "relu", 1],
                [5, 72, 40, True, "relu", 2],  # RCNN output
                [5, 120, 40, True, "relu", 1],
                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
                [3, 200, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 480, 112, True, "hard_swish", 1],
                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
                [5, 672, 160, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
                [5, 960, 160, True, "hard_swish", 1],
                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
            ]
        elif model_name == "small":
            self.cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, "relu", 2],
                [3, 72, 24, False, "relu", 2],  # RCNN output
                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
                [5, 240, 40, True, "hard_swish", 1],
                [5, 240, 40, True, "hard_swish", 1],
                [5, 120, 48, True, "hard_swish", 1],
                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
                [5, 576, 96, True, "hard_swish", 1],
                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
            ]
        else:
            raise NotImplementedError(
                "mode[{}_model] is not implemented!".format(model_name))

        if multiplier != 1.0:
            # Only the last stage is rescaled: output channels of row -3 and
            # both expand and output channels of rows -2 and -1.
            self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
            self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
            self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
            self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
            self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)

        self.conv1 = ConvBNLayer(
            in_c=3,
            out_c=make_divisible(inplanes * scale),
            filter_size=3,
            stride=2,
            padding=1,
            num_groups=1,
            act="hard_swish",
            lr_mult=lr_mult_list[0],
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name="conv1")

        self._out_channels = []
        self.block_list = []
        i = 0
        inplanes = make_divisible(inplanes * scale)
        for (k, exp, c, se, nl, s) in self.cfg:
            lr_idx = min(i // 3, len(lr_mult_list) - 1)
            lr_mult = lr_mult_list[lr_idx]
            layer_name = "conv" + str(i + 2)
            mid_c = make_divisible(scale * exp)
            out_c = make_divisible(scale * c)

            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
            return_list = self.with_extra_blocks and i + 2 in self.feature_maps

            block = self.add_sublayer(
                layer_name,
                sublayer=ResidualUnit(
                    in_c=inplanes,
                    mid_c=mid_c,
                    out_c=out_c,
                    filter_size=k,
                    stride=s,
                    use_se=se,
                    act=nl,
                    lr_mult=lr_mult,
                    conv_decay=conv_decay,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    return_list=return_list,
                    name=layer_name))
            self.block_list.append(block)
            inplanes = out_c
            i += 1
            # After the increment, i + 1 is this unit's layer number.
            self._update_out_channels(mid_c if return_list else inplanes,
                                      i + 1, feature_maps)

        if self.with_extra_blocks:
            self.extra_block_list = []
            extra_out_c = make_divisible(scale * self.cfg[-1][1])
            lr_idx = min(i // 3, len(lr_mult_list) - 1)
            lr_mult = lr_mult_list[lr_idx]

            layer_name = "conv" + str(i + 2)
            conv_extra = self.add_sublayer(
                layer_name,
                sublayer=ConvBNLayer(
                    in_c=inplanes,
                    out_c=extra_out_c,
                    filter_size=1,
                    stride=1,
                    padding=0,
                    num_groups=1,
                    act="hard_swish",
                    lr_mult=lr_mult,
                    conv_decay=conv_decay,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    name=layer_name))
            self.extra_block_list.append(conv_extra)
            i += 1
            self._update_out_channels(extra_out_c, i + 1, feature_maps)

            in_c = extra_out_c
            for block_filter in self.extra_block_filters:
                layer_name = "conv" + str(i + 2)
                conv_extra = self.add_sublayer(
                    layer_name,
                    sublayer=ExtraBlockDW(
                        in_c,
                        block_filter[0],
                        block_filter[1],
                        stride=2,
                        lr_mult=lr_mult,
                        conv_decay=conv_decay,
                        norm_type=norm_type,
                        norm_decay=norm_decay,
                        freeze_norm=freeze_norm,
                        name=layer_name))
                self.extra_block_list.append(conv_extra)
                i += 1
                self._update_out_channels(block_filter[1], i + 1, feature_maps)
                in_c = block_filter[1]

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when layer ``feature_idx`` is a selected output."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: the selected feature maps, in network order.
        """
        x = self.conv1(inputs['image'])
        outs = []
        for idx, block in enumerate(self.block_list):
            x = block(x)
            if idx + 2 in self.feature_maps:
                if isinstance(x, list):
                    # return_list unit: expose the expand output, continue
                    # with the block output.
                    outs.append(x[0])
                    x = x[1]
                else:
                    outs.append(x)

        if not self.with_extra_blocks:
            return outs

        for i, block in enumerate(self.extra_block_list):
            idx = i + len(self.block_list)
            x = block(x)
            if idx + 2 in self.feature_maps:
                outs.append(x)
        return outs

    @property
    def out_shape(self):
        return [ShapeSpec(channels=c) for c in self._out_channels]


# ---------------------------------------------------------------------------
# Patch metadata carried over unchanged as comments (next file in the commit):
#
#   diff --git a/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py b/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
#   new file mode 100644
#   index 0000000..e548bad
#   --- /dev/null
#   +++ b/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
#   @@ -0,0 +1,266 @@
# ---------------------------------------------------------------------------
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
The copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""

import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant

from ppdet.modeling.ops import get_act_fn
from ppdet.modeling.layers import ConvNormLayer


class MobileOneBlock(nn.Layer):
    """Re-parameterizable depthwise + pointwise block.

    Training-time structure:
      * stage 1: ``conv_num`` grouped ``kernel_size`` convs + a grouped 1x1
        conv (``rbr_1x1``) + an optional BatchNorm identity, summed and
        activated;
      * stage 2: ``conv_num`` 1x1 convs + an optional BatchNorm identity,
        summed and activated.
    The identity branches exist only when ``ch_in == ch_out`` and
    ``stride == 1``. ``convert_to_deploy`` folds each stage into a single
    convolution (``conv1`` / ``conv2``).

    Args:
        ch_in, ch_out: input / output channels.
        stride: stride of stage 1.
        kernel_size: kernel size of the stage-1 grouped convolutions.
        conv_num: number of parallel conv branches per stage.
        norm_type, norm_decay, norm_groups, bias_on, lr_scale, freeze_norm,
        initializer, skip_quant: forwarded to every ``ConvNormLayer``.
        act: activation name/config resolved by ``get_act_fn``, or a callable
            used as is.
    """

    def __init__(
            self,
            ch_in,
            ch_out,
            stride,
            kernel_size,
            conv_num=1,
            norm_type='bn',
            norm_decay=0.,
            norm_groups=32,
            bias_on=False,
            lr_scale=1.,
            freeze_norm=False,
            initializer=Normal(
                mean=0., std=0.01),
            skip_quant=False,
            act='relu', ):
        super(MobileOneBlock, self).__init__()

        self.ch_in = ch_in
        self.ch_out = ch_out
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        self.k = conv_num

        # Settings shared by every ConvNormLayer of the block.
        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            norm_groups=norm_groups,
            bias_on=bias_on,
            lr_scale=lr_scale,
            freeze_norm=freeze_norm,
            initializer=initializer,
            skip_quant=skip_quant)

        self.depth_conv = nn.LayerList()
        self.point_conv = nn.LayerList()
        for _ in range(self.k):
            self.depth_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_in,
                    kernel_size,
                    stride=stride,
                    groups=ch_in,
                    **norm_kwargs))
            self.point_conv.append(
                ConvNormLayer(
                    ch_in, ch_out, 1, stride=1, groups=1, **norm_kwargs))
        self.rbr_1x1 = ConvNormLayer(
            ch_in, ch_in, 1, stride=self.stride, groups=ch_in, **norm_kwargs)

        has_identity = ch_in == ch_out and self.stride == 1
        self.rbr_identity_st1 = nn.BatchNorm2D(
            num_features=ch_in,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if has_identity else None
        self.rbr_identity_st2 = nn.BatchNorm2D(
            num_features=ch_out,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if has_identity else None
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act

    def forward(self, x):
        # Deploy mode: the two fused convolutions replace all branches.
        if hasattr(self, "conv1") and hasattr(self, "conv2"):
            return self.act(self.conv2(self.act(self.conv1(x))))

        if self.rbr_identity_st1 is None:
            id_out_st1 = 0
        else:
            id_out_st1 = self.rbr_identity_st1(x)

        x1_1 = 0
        for conv in self.depth_conv:
            x1_1 += conv(x)

        x1_2 = self.rbr_1x1(x)
        x1 = self.act(x1_1 + x1_2 + id_out_st1)

        if self.rbr_identity_st2 is None:
            id_out_st2 = 0
        else:
            id_out_st2 = self.rbr_identity_st2(x1)

        x2_1 = 0
        for conv in self.point_conv:
            x2_1 += conv(x1)
        return self.act(x2_1 + id_out_st2)

    def convert_to_deploy(self):
        """Fold all training branches into ``conv1`` / ``conv2``.

        Calling it again after conversion is a no-op; previously the second
        call raised AttributeError because the branches were already deleted.
        """
        if not hasattr(self, 'depth_conv'):
            return

        if not hasattr(self, 'conv1'):
            self.conv1 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_in,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                groups=self.ch_in,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        if not hasattr(self, 'conv2'):
            self.conv2 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=1,
                stride=1,
                padding='SAME',
                groups=1,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))

        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = \
            self.get_equivalent_kernel_bias()
        self.conv1.weight.set_value(conv1_kernel)
        self.conv1.bias.set_value(conv1_bias)
        self.conv2.weight.set_value(conv2_kernel)
        self.conv2.bias.set_value(conv2_bias)
        self.__delattr__('depth_conv')
        self.__delattr__('point_conv')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity_st1'):
            self.__delattr__('rbr_identity_st1')
        if hasattr(self, 'rbr_identity_st2'):
            self.__delattr__('rbr_identity_st2')

    def get_equivalent_kernel_bias(self):
        """Return ``(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)``:
        the per-stage sums of every branch's BN-folded kernel and bias."""
        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st1, kernel_size=self.kernel_size)

        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st2, kernel_size=1)

        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
            st1_kernel1x1) + st1_kernelid

        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid

        conv2_kernel = st2_kernel1x1 + st2_kernelid
        conv2_bias = st2_bias1x1 + st2_biasid

        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Pad a 1x1 kernel by ``(kernel_size - 1) // 2`` on each side of the
        last two axes; ``None`` maps to 0."""
        if kernel1x1 is None:
            return 0
        padding_size = (self.kernel_size - 1) // 2
        return nn.functional.pad(
            kernel1x1,
            [padding_size, padding_size, padding_size, padding_size])

    @staticmethod
    def _fold_norm(kernel, norm):
        """Fold batch-norm ``norm`` into ``kernel``; returns (kernel, bias)."""
        std = (norm._variance + norm._epsilon).sqrt()
        t = (norm.weight / std).reshape((-1, 1, 1, 1))
        return kernel * t, norm.bias - norm._mean * norm.weight / std

    def _fuse_bn_tensor(self, branch, kernel_size=3):
        """Return the BN-folded ``(kernel, bias)`` of ``branch``.

        ``branch`` may be ``None`` (gives ``(0, 0)``), a ``LayerList`` of
        ``ConvNormLayer`` (the folded terms are summed), a single
        ``ConvNormLayer``, or a ``BatchNorm2D`` identity branch. For the
        identity branch an identity kernel of size ``kernel_size`` is built:
        one input channel per filter when ``kernel_size > 1``, ``ch_in``
        input channels when ``kernel_size == 1``.

        Raises:
            ValueError: if an identity kernel with ``kernel_size < 1`` is
                requested.
        """
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.LayerList):
            fused = [
                self._fold_norm(block.conv.weight, block.norm)
                for block in branch
            ]
            return (sum(kernel for kernel, _ in fused),
                    sum(bias for _, bias in fused))

        if isinstance(branch, ConvNormLayer):
            return self._fold_norm(branch.conv.weight, branch.norm)

        assert isinstance(branch, nn.BatchNorm2D)
        if kernel_size < 1:
            raise ValueError("Invalid kernel size received!")
        input_dim = self.ch_in if kernel_size == 1 else 1
        kernel_value = paddle.zeros(
            shape=[self.ch_in, input_dim, kernel_size, kernel_size],
            dtype='float32')
        # A single 1 at the spatial centre; `i % input_dim` is 0 for the
        # one-input-channel layout and `i` for the 1x1 layout.
        center = (kernel_size - 1) // 2
        for i in range(self.ch_in):
            kernel_value[i, i % input_dim, center, center] = 1
        kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
        return self._fold_norm(kernel, branch)


# ---------------------------------------------------------------------------
# Patch metadata carried over unchanged as comments (next file in the commit):
#
#   diff --git a/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py b/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
#   new file mode 100644
#   index 0000000..4afbb9b
#   --- /dev/null
#   +++ b/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
#   @@ -0,0 +1,69 @@
# ---------------------------------------------------------------------------
class NameAdapter(object):
    """Fix the backbones variable names for pretrained weight"""

    def __init__(self, model):
        super(NameAdapter, self).__init__()
        self.model = model

    @property
    def model_type(self):
        """``model._model_type`` or '' when the attribute is missing."""
        return getattr(self.model, '_model_type', '')

    @property
    def variant(self):
        """``model.variant`` or '' when the attribute is missing."""
        return getattr(self.model, 'variant', '')

    def fix_conv_norm_name(self, name):
        """Return the batch-norm name that belongs to conv layer ``name``."""
        # the naming rule is same as pretrained weight
        if self.model_type == 'SEResNeXt':
            return name + "_bn"
        if name == "conv1":
            return "bn_" + name
        return "bn" + name[3:]

    def fix_shortcut_name(self, name):
        """Return the shortcut layer name derived from ``name``."""
        if self.model_type == 'SEResNeXt':
            name = 'conv' + name + '_prj'
        return name

    def fix_bottleneck_name(self, name):
        """Return ``(conv1, conv2, conv3, shortcut)`` names of a bottleneck."""
        if self.model_type == 'SEResNeXt':
            conv_name1 = 'conv' + name + '_x1'
            conv_name2 = 'conv' + name + '_x2'
            conv_name3 = 'conv' + name + '_x3'
            shortcut_name = name
        else:
            conv_name1 = name + "_branch2a"
            conv_name2 = name + "_branch2b"
            conv_name3 = name + "_branch2c"
            shortcut_name = name + "_branch1"
        return conv_name1, conv_name2, conv_name3, shortcut_name

    def fix_basicblock_name(self, name):
        """Return ``(conv1, conv2, shortcut)`` names of a basic block."""
        if self.model_type == 'SEResNeXt':
            conv_name1 = 'conv' + name + '_x1'
            conv_name2 = 'conv' + name + '_x2'
            shortcut_name = name
        else:
            conv_name1 = name + "_branch2a"
            conv_name2 = name + "_branch2b"
            shortcut_name = name + "_branch1"
        return conv_name1, conv_name2, shortcut_name

    def fix_layer_warp_name(self, stage_num, count, i):
        """Return the name of block ``i`` in stage ``stage_num``.

        ``count`` is compared against 10 to pick the "a", "b1", "b2", ...
        naming for stage 4; otherwise blocks are lettered "a", "b", "c", ...
        """
        if self.model_type == 'SEResNeXt':
            return str(stage_num + 2) + '_' + str(i + 1)
        name = 'res' + str(stage_num)
        if count > 10 and stage_num == 4:
            if i == 0:
                return name + "a"
            return name + "b" + str(i)
        return name + chr(ord("a") + i)

    def fix_c1_stage_name(self):
        """Return the name of the stem convolution."""
        return "res_conv1" if self.model_type == 'ResNeXt' else "conv1"


# ---------------------------------------------------------------------------
# Remainder of the original patch line, carried over unchanged as comments:
# header and license of the next file in the commit (its code starts on the
# following line of the dump).
#
#   diff --git a/rtdetr_paddle/ppdet/modeling/backbones/resnet.py b/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
#   new file mode 100755
#   index 0000000..84e362a
#   --- /dev/null
#   +++ b/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
#   @@ -0,0 +1,611 @@
#
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class ConvNormLayer(nn.Layer):
    """Convolution followed by batch normalization and an optional activation.

    With ``dcn_v2=True`` the plain convolution is replaced by a
    ``DeformConv2D`` whose offset and mask inputs come from an additional,
    zero-initialized convolution (``conv_offset``).

    Args:
        ch_in (int): number of input channels.
        ch_out (int): number of output channels.
        filter_size (int): square kernel size; padding is
            ``(filter_size - 1) // 2``.
        stride (int): convolution stride.
        groups (int): number of convolution groups.
        act (str | None): name of a function in ``paddle.nn.functional``
            applied after the norm; a falsy value disables it.
        norm_type (str): ``'bn'`` or ``'sync_bn'`` (asserted); both build an
            ``nn.BatchNorm2D`` here.
        norm_decay (float): L2 decay coefficient for the norm scale and bias.
        freeze_norm (bool): when true the norm parameters get learning rate 0,
            are created non-trainable, get ``stop_gradient = True`` and the
            norm layer is built with ``use_global_stats=True``.
        lr (float): learning-rate multiplier of the conv weight (and of the
            norm parameters when they are not frozen).
        dcn_v2 (bool): use the deformable-convolution path.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size,
                 stride,
                 groups=1,
                 act=None,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 lr=1.0,
                 dcn_v2=False):
        super(ConvNormLayer, self).__init__()
        assert norm_type in ['bn', 'sync_bn']
        self.norm_type = norm_type
        self.act = act
        self.dcn_v2 = dcn_v2

        same_padding = (filter_size - 1) // 2
        if self.dcn_v2:
            kernel_area = filter_size**2
            # conv_offset predicts 3 * k * k channels; forward() splits them
            # into 2 * k * k offset channels and k * k mask channels.
            self.offset_channel = 2 * kernel_area
            self.mask_channel = kernel_area

            self.conv_offset = nn.Conv2D(
                in_channels=ch_in,
                out_channels=3 * kernel_area,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                weight_attr=ParamAttr(initializer=Constant(0.)),
                bias_attr=ParamAttr(initializer=Constant(0.)))
            self.conv = DeformConv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                dilation=1,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)
        else:
            self.conv = nn.Conv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)

        norm_lr = 0. if freeze_norm else lr
        is_trainable = not freeze_norm
        scale_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=is_trainable)
        shift_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=is_trainable)

        # None leaves the choice of statistics to BatchNorm2D itself.
        global_stats = True if freeze_norm else None
        if norm_type in ['sync_bn', 'bn']:
            self.norm = nn.BatchNorm2D(
                ch_out,
                weight_attr=scale_attr,
                bias_attr=shift_attr,
                use_global_stats=global_stats)
            norm_params = self.norm.parameters()

        if freeze_norm:
            for frozen_param in norm_params:
                frozen_param.stop_gradient = True

    def forward(self, inputs):
        """Apply conv (plain or deformable), then norm, then activation."""
        if self.dcn_v2:
            offset_mask = self.conv_offset(inputs)
            offset, mask = paddle.split(
                offset_mask,
                num_or_sections=[self.offset_channel, self.mask_channel],
                axis=1)
            mask = F.sigmoid(mask)
            out = self.conv(inputs, offset, mask=mask)
        else:
            out = self.conv(inputs)

        if self.norm_type in ['bn', 'sync_bn']:
            out = self.norm(out)
        if self.act:
            activation = getattr(F, self.act)
            out = activation(out)
        return out
class SELayer(nn.Layer):
    """Channel re-weighting layer: pool -> linear -> relu -> linear -> sigmoid.

    Args:
        ch (int): number of channels of the input feature map.
        reduction_ratio (int): the hidden width is ``ch // reduction_ratio``.
    """

    def __init__(self, ch, reduction_ratio=16):
        super(SELayer, self).__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        hidden = ch // reduction_ratio

        # Each linear layer is initialized uniformly in +-1/sqrt(fan_in).
        squeeze_bound = 1.0 / math.sqrt(ch)
        self.squeeze = nn.Linear(
            ch,
            hidden,
            weight_attr=paddle.ParamAttr(
                initializer=Uniform(-squeeze_bound, squeeze_bound)),
            bias_attr=True)

        extract_bound = 1.0 / math.sqrt(hidden)
        self.extract = nn.Linear(
            hidden,
            ch,
            weight_attr=paddle.ParamAttr(
                initializer=Uniform(-extract_bound, extract_bound)),
            bias_attr=True)

    def forward(self, inputs):
        """Return ``inputs`` scaled per channel by the learned gate."""
        pooled = paddle.squeeze(self.pool(inputs), axis=[2, 3])
        hidden = F.relu(self.squeeze(pooled))
        gate = F.sigmoid(self.extract(hidden))
        gate = paddle.unsqueeze(gate, axis=[2, 3])
        return gate * inputs
class BasicBlock(nn.Layer):
    """Residual block made of two 3x3 ConvNormLayers.

    Args:
        ch_in (int): input channels.
        ch_out (int): output channels.
        stride (int): stride of the first 3x3 conv (and of the projection).
        shortcut (bool): when true the input is added unchanged; otherwise a
            projection branch (``self.short``) is built.
        variant (str): with ``'d'`` and ``stride == 2`` the projection is an
            average pool followed by a stride-1 1x1 conv.
        groups (int), base_width (int): must be 1 and 64 (asserted).
        lr, norm_type, norm_decay, freeze_norm: forwarded to every
            ConvNormLayer.
        dcn_v2 (bool): forwarded to the second 3x3 conv only.
        std_senet (bool): append an SELayer before the residual addition.
    """

    expansion = 1

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super(BasicBlock, self).__init__()
        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'

        # Settings shared by every ConvNormLayer of this block.
        shared = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                self.short = nn.Sequential()
                self.short.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                self.short.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=ch_out,
                        filter_size=1,
                        stride=1,
                        **shared))
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=ch_out,
                    filter_size=1,
                    stride=stride,
                    **shared)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=3,
            stride=stride,
            act='relu',
            **shared)

        self.branch2b = ConvNormLayer(
            ch_in=ch_out,
            ch_out=ch_out,
            filter_size=3,
            stride=1,
            act=None,
            dcn_v2=dcn_v2,
            **shared)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(ch_out)

    def forward(self, inputs):
        """Return ``relu(branch(inputs) + shortcut(inputs))``."""
        out = self.branch2b(self.branch2a(inputs))
        if self.std_senet:
            out = self.se(out)

        short = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=out, y=short))
class BottleNeck(nn.Layer):
    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 ConvNormLayers.

    The block outputs ``ch_out * expansion`` channels. For variant ``'a'``
    the stride sits on the first 1x1 conv, for every other variant on the
    3x3 conv.

    Args:
        ch_in (int): input channels.
        ch_out (int): base width of the block.
        stride (int): block stride.
        shortcut (bool): when true the input is added unchanged; otherwise a
            projection branch (``self.short``) is built.
        variant (str): see above; ``'d'`` with ``stride == 2`` additionally
            uses avg-pool + stride-1 1x1 conv as projection.
        groups (int): groups of the 3x3 conv.
        base_width (int): inner width is
            ``int(ch_out * (base_width / 64.)) * groups``.
        lr, norm_type, norm_decay, freeze_norm: forwarded to every
            ConvNormLayer.
        dcn_v2 (bool): forwarded to the 3x3 conv only.
        std_senet (bool): append an SELayer before the residual addition.
    """

    expansion = 4

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=4,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super(BottleNeck, self).__init__()
        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)

        # ResNeXt
        width = int(ch_out * (base_width / 64.)) * groups
        expanded = ch_out * self.expansion

        # Settings shared by every ConvNormLayer of this block.
        shared = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=width,
            filter_size=1,
            stride=stride1,
            groups=1,
            act='relu',
            **shared)

        self.branch2b = ConvNormLayer(
            ch_in=width,
            ch_out=width,
            filter_size=3,
            stride=stride2,
            groups=groups,
            act='relu',
            dcn_v2=dcn_v2,
            **shared)

        self.branch2c = ConvNormLayer(
            ch_in=width,
            ch_out=expanded,
            filter_size=1,
            stride=1,
            groups=1,
            **shared)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                self.short = nn.Sequential()
                self.short.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                self.short.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=expanded,
                        filter_size=1,
                        stride=1,
                        **shared))
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=expanded,
                    filter_size=1,
                    stride=stride,
                    **shared)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(expanded)

    def forward(self, inputs):
        """Return ``relu(branch(inputs) + shortcut(inputs))``."""
        out = self.branch2c(self.branch2b(self.branch2a(inputs)))
        if self.std_senet:
            out = self.se(out)

        short = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=out, y=short))
class Blocks(nn.Layer):
    """One backbone stage: ``count`` residual blocks applied in sequence.

    The first block never uses the identity shortcut and has stride 2 unless
    ``stage_num == 2``; every later block has stride 1, uses the identity
    shortcut and takes ``ch_out * block.expansion`` input channels.

    Args:
        block: block class to instantiate (it must expose ``expansion``).
        ch_in (int): input channels of the first block.
        ch_out (int): ``ch_out`` passed to every block.
        count (int): number of blocks.
        name_adapter: object whose ``fix_layer_warp_name`` yields the sublayer
            name of each block.
        stage_num (int): stage number handed to the name adapter.
        variant, groups, base_width, lr, norm_type, norm_decay, freeze_norm,
        dcn_v2, std_senet: forwarded unchanged to every block.
    """

    def __init__(self,
                 block,
                 ch_in,
                 ch_out,
                 count,
                 name_adapter,
                 stage_num,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super(Blocks, self).__init__()

        self.blocks = []
        for index in range(count):
            is_first = index == 0
            sublayer_name = name_adapter.fix_layer_warp_name(stage_num, count,
                                                             index)
            stage_block = block(
                ch_in=ch_in,
                ch_out=ch_out,
                stride=2 if is_first and stage_num != 2 else 1,
                shortcut=not is_first,
                variant=variant,
                groups=groups,
                base_width=base_width,
                lr=lr,
                norm_type=norm_type,
                norm_decay=norm_decay,
                freeze_norm=freeze_norm,
                dcn_v2=dcn_v2,
                std_senet=std_senet)
            self.blocks.append(self.add_sublayer(sublayer_name, stage_block))
            if is_first:
                # Later blocks consume the expanded output of the first one.
                ch_in = ch_out * block.expansion

    def forward(self, inputs):
        """Feed ``inputs`` through every block in order."""
        feature = inputs
        for stage_block in self.blocks:
            feature = stage_block(feature)
        return feature
@register
@serializable
class ResNet(nn.Layer):
    __shared__ = ['norm_type']

    def __init__(self,
                 depth=50,
                 ch_in=64,
                 variant='b',
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
                 groups=1,
                 base_width=64,
                 norm_type='bn',
                 norm_decay=0,
                 freeze_norm=True,
                 freeze_at=0,
                 return_idx=[0, 1, 2, 3],
                 dcn_v2_stages=[-1],
                 num_stages=4,
                 std_senet=False,
                 freeze_stem_only=False):
        """
        Residual Network, see https://arxiv.org/abs/1512.03385

        Args:
            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
            ch_in (int): output channel of first stage, default 64
            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
            lr_mult_list (list): learning rate ratio of the four resnet
                stages (2,3,4,5); a lower ratio is needed for a pretrained
                model obtained by distillation (default [1.0, 1.0, 1.0, 1.0]).
            groups (int): group convolution cardinality
            base_width (int): base width of each group convolution
            norm_type (str): normalization type, 'bn' or 'sync_bn'
                (ConvNormLayer asserts on any other value)
            norm_decay (float): weight decay for normalization layer weights
            freeze_norm (bool): freeze normalization layers
            freeze_at (int): when >= 0 the stem is frozen and, unless
                ``freeze_stem_only`` is set, so are the first
                ``min(freeze_at + 1, num_stages)`` stages; a negative value
                freezes nothing here
            return_idx (int | list): index of the stages whose feature maps
                are returned
            dcn_v2_stages (int | list): index of stages that use deformable
                conv v2
            num_stages (int): total num of stages, between 1 and 4
            std_senet (bool): whether use senet, default False.
            freeze_stem_only (bool): with ``freeze_at >= 0``, freeze only the
                stem (``conv1``) and leave all residual stages trainable.
        """
        super(ResNet, self).__init__()
        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
        assert num_stages >= 1 and num_stages <= 4
        self.depth = depth
        self.variant = variant
        self.groups = groups
        self.base_width = base_width
        self.norm_type = norm_type
        self.norm_decay = norm_decay
        self.freeze_norm = freeze_norm
        self.freeze_at = freeze_at

        # Accept a bare int, and copy sequences so the instance never shares
        # (and can never mutate) the default list objects of the signature.
        if isinstance(return_idx, Integral):
            return_idx = [return_idx]
        return_idx = list(return_idx)
        assert max(return_idx) < num_stages, \
            'the maximum return index must smaller than num_stages, ' \
            'but received maximum return index is {} and num_stages ' \
            'is {}'.format(max(return_idx), num_stages)
        self.return_idx = return_idx
        self.num_stages = num_stages
        assert len(lr_mult_list) == 4, \
            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))

        if isinstance(dcn_v2_stages, Integral):
            dcn_v2_stages = [dcn_v2_stages]
        dcn_v2_stages = list(dcn_v2_stages)
        assert max(dcn_v2_stages) < num_stages
        self.dcn_v2_stages = dcn_v2_stages

        block_nums = ResNet_cfg[depth]
        na = NameAdapter(self)

        conv1_name = na.fix_c1_stage_name()
        if variant in ['c', 'd']:
            # Variants c/d replace the single 7x7 stem conv by three 3x3 convs.
            conv_def = [
                [3, ch_in // 2, 3, 2, "conv1_1"],
                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
            ]
        else:
            conv_def = [[3, ch_in, 7, 2, conv1_name]]
        self.conv1 = nn.Sequential()
        for (c_in, c_out, k, s, _name) in conv_def:
            self.conv1.add_sublayer(
                _name,
                ConvNormLayer(
                    ch_in=c_in,
                    ch_out=c_out,
                    filter_size=k,
                    stride=s,
                    groups=1,
                    act='relu',
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    lr=1.0))

        self.ch_in = ch_in
        ch_out_list = [64, 128, 256, 512]
        block = BottleNeck if depth >= 50 else BasicBlock

        self._out_channels = [block.expansion * v for v in ch_out_list]
        self._out_strides = [4, 8, 16, 32]

        self.res_layers = []
        for i in range(num_stages):
            lr_mult = lr_mult_list[i]
            stage_num = i + 2
            res_name = "res{}".format(stage_num)
            res_layer = self.add_sublayer(
                res_name,
                Blocks(
                    block,
                    self.ch_in,
                    ch_out_list[i],
                    count=block_nums[i],
                    name_adapter=na,
                    stage_num=stage_num,
                    variant=variant,
                    groups=groups,
                    base_width=base_width,
                    lr=lr_mult,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    dcn_v2=(i in self.dcn_v2_stages),
                    std_senet=std_senet))
            self.res_layers.append(res_layer)
            # The next stage consumes this stage's output channels.
            self.ch_in = self._out_channels[i]

        if freeze_at >= 0:
            self._freeze_parameters(self.conv1)
            if not freeze_stem_only:
                for i in range(min(freeze_at + 1, num_stages)):
                    self._freeze_parameters(self.res_layers[i])

    def _freeze_parameters(self, m):
        """Set ``stop_gradient`` on every parameter of layer ``m``."""
        for p in m.parameters():
            p.stop_gradient = True

    @property
    def out_shape(self):
        """One ShapeSpec (channels, stride) per entry of ``return_idx``."""
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: outputs of the stages whose index is in ``return_idx``,
            in stage order.
        """
        x = inputs['image']
        conv1 = self.conv1(x)
        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
        outs = []
        for idx, stage in enumerate(self.res_layers):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs
@register
class Res5Head(nn.Layer):
    """Stand-alone stage-5 residual head made of three blocks.

    Args:
        depth (int): ``>= 50`` selects BottleNeck blocks with 1024 input
            channels, anything smaller BasicBlock with 256 input channels.
    """

    def __init__(self, depth=50):
        super(Res5Head, self).__init__()
        feat_in, feat_out = 1024, 512
        if depth < 50:
            feat_in = 256
        adapter = NameAdapter(self)
        block = BottleNeck if depth >= 50 else BasicBlock
        self.res5 = Blocks(
            block,
            feat_in,
            feat_out,
            count=3,
            name_adapter=adapter,
            stage_num=5)
        # Bottleneck blocks expand their output width four times.
        self.feat_out = feat_out if depth < 50 else feat_out * 4

    @property
    def out_shape(self):
        """Single ShapeSpec with ``feat_out`` channels and stride 16."""
        return [ShapeSpec(channels=self.feat_out, stride=16)]

    def forward(self, roi_feat, stage=0):
        """Apply the res5 blocks to ``roi_feat``; ``stage`` is not used."""
        return self.res5(roi_feat)
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D, then BatchNorm2D, then an optional activation.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        kernel_size (int): convolution kernel size.
        stride (int): convolution stride.
        padding (int): convolution padding.
        groups (int): convolution groups.
        act (str | None): name of a function in ``paddle.nn.functional``;
            ``"hard_swish"`` is accepted as an alias of ``'hardswish'`` and a
            falsy value disables the activation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 act=None):
        super(ConvBNLayer, self).__init__()
        self._conv = Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        # Scale and bias of the norm use an L2 decay coefficient of 0.
        self._batch_norm = BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        self.act = 'hardswish' if act == "hard_swish" else act

    def forward(self, inputs):
        """Return ``act(bn(conv(inputs)))``."""
        out = self._batch_norm(self._conv(inputs))
        if self.act:
            out = getattr(F, self.act)(out)
        return out
class InvertedResidual(nn.Layer):
    """ShuffleNetV2 unit that transforms only one half of the channels.

    ``forward`` splits the input into two halves along axis 1, sends the
    second half through 1x1 -> depthwise 3x3 -> 1x1 ConvBNLayers, concatenates
    it with the untouched first half and calls ``channel_shuffle(out, 2)``.

    Args:
        in_channels (int): input channels (half of them enter the branch).
        out_channels (int): output channels (the branch yields half of them).
        stride (int): stride of the depthwise conv.
        act (str): activation of the two 1x1 convs.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super(InvertedResidual, self).__init__()
        half_out = out_channels // 2
        self._conv_pw = ConvBNLayer(
            in_channels=in_channels // 2,
            out_channels=half_out,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        self._conv_dw = ConvBNLayer(
            in_channels=half_out,
            out_channels=half_out,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=half_out,
            act=None)
        self._conv_linear = ConvBNLayer(
            in_channels=half_out,
            out_channels=half_out,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

    def forward(self, inputs):
        half = inputs.shape[1] // 2
        kept, branch = paddle.split(
            inputs, num_or_sections=[half, half], axis=1)
        branch = self._conv_linear(self._conv_dw(self._conv_pw(branch)))
        merged = paddle.concat([kept, branch], axis=1)
        return channel_shuffle(merged, 2)


class InvertedResidualDS(nn.Layer):
    """ShuffleNetV2 unit in which both branches process the whole input.

    Branch 1 is depthwise 3x3 -> 1x1, branch 2 is 1x1 -> depthwise 3x3 -> 1x1;
    each yields ``out_channels // 2`` channels. The results are concatenated
    and passed to ``channel_shuffle(out, 2)``.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        stride (int): stride of both depthwise convs.
        act (str): activation of the 1x1 convs.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super(InvertedResidualDS, self).__init__()
        half_out = out_channels // 2

        # branch1
        self._conv_dw_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=in_channels,
            act=None)
        self._conv_linear_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=half_out,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        # branch2
        self._conv_pw_2 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=half_out,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        self._conv_dw_2 = ConvBNLayer(
            in_channels=half_out,
            out_channels=half_out,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=half_out,
            act=None)
        self._conv_linear_2 = ConvBNLayer(
            in_channels=half_out,
            out_channels=half_out,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

    def forward(self, inputs):
        first = self._conv_linear_1(self._conv_dw_1(inputs))
        second = self._conv_pw_2(inputs)
        second = self._conv_linear_2(self._conv_dw_2(second))
        merged = paddle.concat([first, second], axis=1)

        return channel_shuffle(merged, 2)
@register
@serializable
class ShuffleNetV2(nn.Layer):
    """ShuffleNetV2 backbone.

    Args:
        scale (float): width multiplier; one of 0.25, 0.33, 0.5, 1.0, 1.5,
            2.0, anything else raises ``NotImplementedError``.
        act (str): activation name handed to every ConvBNLayer.
        feature_maps (int | list): indices of the units whose output is
            returned; the stem counts as index 1, so the first unit is 2.
    """

    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
        super(ShuffleNetV2, self).__init__()
        self.scale = scale
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        stage_repeats = [4, 8, 4]

        # Channel layout per supported scale; index 0 is a placeholder.
        channel_table = (
            (0.25, [-1, 24, 24, 48, 96, 512]),
            (0.33, [-1, 24, 32, 64, 128, 512]),
            (0.5, [-1, 24, 48, 96, 192, 1024]),
            (1.0, [-1, 24, 116, 232, 464, 1024]),
            (1.5, [-1, 24, 176, 352, 704, 1024]),
            (2.0, [-1, 24, 244, 488, 976, 2048]),
        )
        stage_out_channels = None
        for known_scale, channels in channel_table:
            if scale == known_scale:
                stage_out_channels = channels
                break
        if stage_out_channels is None:
            raise NotImplementedError("This scale size:[" + str(scale) +
                                      "] is not implemented!")

        self._out_channels = []
        self._feature_idx = 0
        # 1. conv1
        self._conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=stage_out_channels[1],
            kernel_size=3,
            stride=2,
            padding=1,
            act=act)
        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
        self._feature_idx += 1

        # 2. bottleneck sequences
        self._block_list = []
        for stage_id, num_repeat in enumerate(stage_repeats):
            stage_in = stage_out_channels[stage_id + 1]
            stage_out = stage_out_channels[stage_id + 2]
            for i in range(num_repeat):
                if i == 0:
                    # The first unit of a stage has stride 2 and changes width.
                    unit = InvertedResidualDS(
                        in_channels=stage_in,
                        out_channels=stage_out,
                        stride=2,
                        act=act)
                else:
                    unit = InvertedResidual(
                        in_channels=stage_out,
                        out_channels=stage_out,
                        stride=1,
                        act=act)
                block = self.add_sublayer(
                    name=str(stage_id + 2) + '_' + str(i + 1), sublayer=unit)
                self._block_list.append(block)
                self._feature_idx += 1
                self._update_out_channels(stage_out, self._feature_idx,
                                          self.feature_maps)

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested feature map."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']`` and collect feature maps."""
        feature = self._max_pool(self._conv1(inputs['image']))
        outs = []
        for position, unit in enumerate(self._block_list):
            feature = unit(feature)
            # Units are numbered from 2 (see __init__).
            if position + 2 in self.feature_maps:
                outs.append(feature)

        return outs

    @property
    def out_shape(self):
        """One ShapeSpec per recorded output channel count."""
        return [ShapeSpec(channels=width) for width in self._out_channels]
# Presets per Swin variant. The default pretrained weights are the 22k-to-1k
# fine-tuned ones; per the original note they can be overridden through
# SwinTransformer.pretrained in the config.
MODEL_cfg = {
    'swin_T_224': {
        'pretrain_img_size': 224,
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'num_heads': [3, 6, 12, 24],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_S_224': {
        'pretrain_img_size': 224,
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'num_heads': [3, 6, 12, 24],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_B_224': {
        'pretrain_img_size': 224,
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_L_224': {
        'pretrain_img_size': 224,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_B_384': {
        'pretrain_img_size': 384,
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 12,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
    },
    'swin_L_384': {
        'pretrain_img_size': 384,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 12,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
    },
}
class Mlp(nn.Layer):
    """Two-layer perceptron: fc1 -> activation -> dropout -> fc2 -> dropout.

    Args:
        in_features (int): input width.
        hidden_features (int | None): hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int | None): output width; falls back to
            ``in_features`` when falsy.
        act_layer: activation layer class, instantiated without arguments.
        drop (float): dropout probability; the same Dropout layer is applied
            after the activation and after ``fc2``.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
def window_partition(x, window_size):
    """
    Split a feature map into non-overlapping square windows.

    Windows are emitted batch by batch, row of windows by row of windows.

    Args:
        x: (B, H, W, C); H and W must be multiples of ``window_size``
            (SwinTransformerBlock pads its input before calling this)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    _, H, W, C = x.shape
    x = x.reshape(
        [-1, H // window_size, window_size, W // window_size, window_size, C])
    # Bring the two window-grid axes next to each other before flattening.
    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
        [-1, window_size, window_size, C])
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Inverse of ``window_partition``: stitch windows back into feature maps.

    The batch size is inferred by the reshape (``-1``), so it is not computed
    explicitly.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    C = windows.shape[-1]
    x = windows.reshape(
        [-1, H // window_size, W // window_size, window_size, window_size, C])
    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
    return x
class WindowAttention(nn.Layer):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self,
                 dim,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # Learnable bias table with one row per possible relative offset:
        # (2*Wh-1) * (2*Ww-1) rows, one column per head.
        table_rows = (2 * window_size[0] - 1) * (2 * window_size[1] - 1)
        self.relative_position_bias_table = add_parameter(
            self, paddle.zeros((table_rows, num_heads)))

        # Pair-wise relative position index for the tokens of one window.
        rows = paddle.arange(self.window_size[0])
        cols = paddle.arange(self.window_size[1])
        grid = paddle.stack(paddle.meshgrid([rows, cols]))  # 2, Wh, Ww
        flat_grid = paddle.flatten(grid, 1)  # 2, Wh*Ww
        offsets = flat_grid.unsqueeze(axis=2) - flat_grid.unsqueeze(axis=1)
        offsets = offsets.transpose([1, 2, 0])  # Wh*Ww, Wh*Ww, 2
        offsets[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        offsets[:, :, 1] += self.window_size[1] - 1
        # Scale the row offset so that row/col pairs map to unique indices.
        offsets[:, :, 0] *= 2 * self.window_size[1] - 1
        self.relative_position_index = offsets.sum(-1)  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table)
        self.softmax = nn.Softmax(axis=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        _, N, C = x.shape
        heads = self.num_heads
        qkv = self.qkv(x).reshape([-1, N, 3, heads, C // heads])
        qkv = qkv.transpose([2, 0, 3, 1, 4])
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))

        # Gather the bias of every token pair and move heads to the front.
        tokens = self.window_size[0] * self.window_size[1]
        flat_index = self.relative_position_index.flatten()
        bias = paddle.index_select(self.relative_position_bias_table,
                                   flat_index)
        bias = bias.reshape([tokens, tokens, -1])  # Wh*Ww,Wh*Ww,nH
        bias = bias.transpose([2, 0, 1])  # nH, Wh*Ww, Wh*Ww
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # Add the per-window mask, then fold the window axis back.
            nW = mask.shape[0]
            attn = attn.reshape([-1, nW, heads, N, N]) + mask.unsqueeze(
                1).unsqueeze(0)
            attn = attn.reshape([-1, heads, N, N])
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        out = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
        out = self.proj(out)
        return self.proj_drop(out)
class SwinTransformerBlock(nn.Layer):
    """ Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm

    ``self.H`` and ``self.W`` must be set by the caller before ``forward``;
    they start out as ``None``.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C), where H and W are read
                from ``self.H`` / ``self.W``.
            mask_matrix: Attention mask for cyclic shift.
        """
        _, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        ws = self.window_size
        use_shift = self.shift_size > 0

        shortcut = x
        x = self.norm1(x).reshape([-1, H, W, C])

        # pad feature maps to multiples of window size
        # NOTE(review): pad_l and pad_t are always 0, so only pad_b and pad_r
        # contribute; confirm the axis order of this 8-element list against
        # the paddle F.pad documentation.
        pad_l = pad_t = 0
        pad_r = (ws - W % ws) % ws
        pad_b = (ws - H % ws) % ws
        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
                  data_format='NHWC')
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if use_shift:
            shifted_x = paddle.roll(
                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows: nW*B, window_size*window_size, C
        x_windows = window_partition(shifted_x, ws)
        x_windows = x_windows.reshape([x_windows.shape[0], ws * ws, C])

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)

        # merge windows back into a (B, H', W', C) map
        attn_windows = attn_windows.reshape([x_windows.shape[0], ws, ws, C])
        shifted_x = window_reverse(attn_windows, ws, Hp, Wp)

        # reverse cyclic shift
        if use_shift:
            x = paddle.roll(
                shifted_x,
                shifts=(self.shift_size, self.shift_size),
                axis=(1, 2))
        else:
            x = shifted_x

        # drop the rows / columns that were added as padding
        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :]

        x = x.reshape([-1, H * W, C])

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x
class PatchMerging(nn.Layer):
    r""" Patch Merging Layer.

    Groups every 2x2 neighbourhood of tokens into one token with 4*C
    channels, normalises it and projects it to 2*C channels.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            Tensor reshaped to (-1, H*W // 4, 2*C) with H and W rounded up
            to even values.
        """
        _, num_tokens, channels = x.shape
        assert num_tokens == H * W, "input feature has wrong size"

        grid = x.reshape([-1, H, W, channels])

        # An odd side gets one extra row / column so it can be halved.
        extra_h, extra_w = H % 2, W % 2
        if extra_h == 1 or extra_w == 1:
            # paddle F.pad default data_format is 'NCHW'
            grid = F.pad(grid, [0, 0, 0, extra_h, 0, extra_w, 0, 0],
                         data_format='NHWC')
            H, W = H + extra_h, W + extra_w

        # Channel order matches the original x0..x3:
        # (row 0, col 0), (row 1, col 0), (row 0, col 1), (row 1, col 1).
        quadrants = [
            grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)
        ]
        merged = paddle.concat(quadrants, -1)  # B H/2 W/2 4*C
        merged = merged.reshape([-1, H * W // 4, 4 * channels])

        return self.reduction(self.norm(merged))
class BasicLayer(nn.Layer):
    """ A basic Swin Transformer layer for one stage.
    Args:
        dim (int): Number of input channels.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list | tuple | np.ndarray, optional): Stochastic
            depth rate. A scalar is shared by all blocks; a sequence is
            indexed by block position. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth

        # The docstring has always allowed a tuple of rates, but only
        # np.ndarray was indexed per block; lists and tuples were handed to
        # the block unchanged and failed on its `drop_path > 0.` test.
        per_block_rate = isinstance(drop_path, (list, tuple, np.ndarray))

        # build blocks: even positions use regular windows, odd positions
        # use windows shifted by half the window size
        self.blocks = nn.LayerList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if per_block_rate else drop_path,
                norm_layer=norm_layer) for i in range(depth)
        ])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            tuple: ``(x, H, W, x_down, Wh, Ww)``. ``x`` is the output of the
            last block at resolution ``H x W``. ``x_down`` is the
            downsampled output with ``Wh = (H + 1) // 2`` and
            ``Ww = (W + 1) // 2``; without a downsample layer the last three
            entries repeat ``x, H, W``.
        """

        # calculate attention mask for SW-MSA on the grid rounded up to a
        # multiple of the window size
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        # give each of the 3 x 3 regions its own id
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt

                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.reshape(
            [-1, self.window_size * self.window_size])
        # token pairs from different regions get -100, same-region pairs get 0
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        huns = -100.0 * paddle.ones_like(attn_mask)
        attn_mask = huns * (attn_mask != 0).astype("float32")

        for blk in self.blocks:
            # blocks read the spatial size from attributes, not arguments
            blk.H, blk.W = H, W
            x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    A strided convolution turns the image into a grid of patch embeddings.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed a 4-D image tensor; the result stays 4-D."""
        # TODO # export dynamic shape
        _, _, height, width = x.shape
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

        # Grow width, then height, up to the next multiple of the patch size.
        leftover_w = width % patch_w
        if leftover_w != 0:
            x = F.pad(x, [0, patch_w - leftover_w, 0, 0])
        leftover_h = height % patch_h
        if leftover_h != 0:
            x = F.pad(x, [0, 0, 0, patch_h - leftover_h])

        x = self.proj(x)
        if self.norm is None:
            return x

        # Normalise over channels in token layout, then restore the map.
        _, _, Wh, Ww = x.shape
        tokens = self.norm(x.flatten(2).transpose([0, 2, 1]))
        return tokens.transpose([0, 2, 1]).reshape(
            [-1, self.embed_dim, Wh, Ww])
@register
@serializable
class SwinTransformer(nn.Layer):
    """ Swin Transformer backbone
    Args:
        arch (str): Key into ``MODEL_cfg`` selecting the Swin Transformer
            variant. The selected entry overrides ``pretrain_img_size``,
            ``embed_dim``, ``depths``, ``num_heads`` and ``window_size``, so
            values passed for those arguments are ignored.
            Default: 'swin_T_224'
        pretrain_img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.2
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        out_indices (tuple(int)): Indices of the stages whose output is
            normalised and returned. Default: (0, 1, 2, 3)
        frozen_stages (int): Controls what ``_freeze_stages`` freezes.
            Default: -1
        pretrained (str | None): Weights to load, either a URL (detected by
            the substring 'http') or a local path. When None, the value of
            ``MODEL_cfg[arch]['pretrained']`` is used. Default: None
    """

    def __init__(self,
                 arch='swin_T_224',
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 pretrained=None):
        super(SwinTransformer, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The architecture table wins over the corresponding arguments.
        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
        embed_dim = MODEL_cfg[arch]['embed_dim']
        depths = MODEL_cfg[arch]['depths']
        num_heads = MODEL_cfg[arch]['num_heads']
        window_size = MODEL_cfg[arch]['window_size']
        if pretrained is None:
            pretrained = MODEL_cfg[arch]['pretrained']

        self.num_layers = len(depths)
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1]
            ]

            self.absolute_pos_embed = add_parameter(
                self,
                paddle.zeros((1, embed_dim, patches_resolution[0],
                              patches_resolution[1])))
            trunc_normal_(self.absolute_pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = np.linspace(0, drop_path_rate,
                          sum(depths))  # stochastic depth decay rule

        # build layers: the channel count doubles per stage and every stage
        # except the last one ends with a PatchMerging downsample
        self.layers = nn.LayerList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging
                if (i_layer < self.num_layers - 1) else None)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_sublayer(layer_name, layer)

        self.apply(self._init_weights)
        self._freeze_stages()
        if pretrained:
            if 'http' in pretrained:  # URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  # model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _freeze_stages(self):
        """Stop gradients for the parts selected by ``self.frozen_stages``.

        ``>= 0`` freezes the patch embedding, ``>= 1`` additionally freezes
        the absolute position embedding when ``ape`` is set, and ``>= 2``
        puts ``pos_drop`` in eval mode and freezes
        ``layers[0 : frozen_stages - 1]``.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.stop_gradient = True

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.stop_gradient = True

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.stop_gradient = True

    def _init_weights(self, m):
        """Initialise one sub-layer; used as the callback of ``self.apply``.

        Linear weights use ``trunc_normal_`` and their bias, when present,
        is zeroed. LayerNorm gets zero bias and unit weight.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Forward function.

        Args:
            x: Mapping whose ``'image'`` entry is the input image tensor.
        Returns:
            list: One feature map per index in ``self.out_indices``, each
            reshaped to (-1, H, W, C) and transposed to channels-first.
        """
        x = self.patch_embed(x['image'])
        B, _, Wh, Ww = x.shape
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
        else:
            x = x.flatten(2).transpose([0, 2, 1])
        x = self.pos_drop(x)
        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out / H / W describe this stage's output; x / Wh / Ww are
            # the input of the next stage
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)
                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
                    (0, 3, 1, 2))
                outs.append(out)

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels, stride) for every index in ``out_indices``.

        The strides are taken from the fixed table [4, 8, 16, 32].
        """
        out_strides = [4, 8, 16, 32]
        return [
            ShapeSpec(
                channels=self.num_features[i], stride=out_strides[i])
            for i in self.out_indices
        ]
class BertEmbeddings(nn.Layer):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout.

    Args:
        word_size (int): Number of rows of the word embedding table.
        position_embeddings_size (int): Number of rows of the position table.
        word_type_size (int): Number of rows of the token-type table.
        hidden_size (int): Embedding width.
        dropout_prob (float): Dropout rate applied to the normalised sum.
    """

    def __init__(self, word_size, position_embeddings_size, word_type_size,
                 hidden_size, dropout_prob):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(
            word_size, hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, token_type_ids=None, position_ids=None):
        """Embed the id tensor ``x``.

        Args:
            x: Integer id tensor; axis 1 is the sequence axis.
            token_type_ids: Optional ids with the shape of ``x``; defaults to
                all zeros.
            position_ids: Optional ids with the shape of ``x``; defaults to
                ``0 .. seq_len - 1`` expanded to the shape of ``x``.
        Returns:
            The normalised, dropped-out sum of the three embeddings.
        """
        seq_len = paddle.shape(x)[1]
        if position_ids is None:
            position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
        if token_type_ids is None:
            # paddle.zeros defaults to float32, which is not a valid index
            # dtype for the embedding lookup below; request int64 explicitly.
            token_type_ids = paddle.zeros(paddle.shape(x), dtype='int64')

        word_embs = self.word_embeddings(x)
        position_embs = self.position_embeddings(position_ids)
        token_type_embs = self.token_type_embeddings(token_type_ids)

        embs_cmb = word_embs + position_embs + token_type_embs
        embs_out = self.layernorm(embs_cmb)
        embs_out = self.dropout(embs_out)
        return embs_out
class BertSelfAttention(nn.Layer):
    """Multi-head scaled dot-product self-attention.

    Args:
        hidden_size (int): Input width; must be divisible by
            ``num_attention_heads``.
        num_attention_heads (int): Number of heads.
        attention_probs_dropout_prob (float): Dropout rate applied to the
            attention probabilities.
        output_attentions (bool): If True, ``forward`` also returns the
            attention probabilities.
    Raises:
        ValueError: If ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 output_attentions=False):
        super(BertSelfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            # The original combined "{}" placeholders with the % operator,
            # so the intended message was never produced.
            raise ValueError(
                "The hidden_size must be a multiple of the number of attention "
                "heads, but got {} % {} != 0".format(hidden_size,
                                                     num_attention_heads))

        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(attention_probs_dropout_prob)
        self.output_attentions = output_attentions

    def forward(self, x, attention_mask, head_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input tensor; the last axis has width ``hidden_size``.
            attention_mask: Additive mask added to the scaled scores before
                the softmax.
            head_mask: Optional multiplicative mask applied to the attention
                probabilities after dropout.
        Returns:
            tuple: ``(context,)`` or, when ``output_attentions`` is set,
            ``(context, attention_probs)``.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # split the last axis into (heads, head_size)
        query_dim1, query_dim2 = paddle.shape(query)[:-1]
        new_shape = [
            query_dim1, query_dim2, self.num_attention_heads,
            self.attention_head_size
        ]
        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
        # key is laid out already transposed for the matmul below
        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))

        attention = paddle.matmul(query,
                                  key) / math.sqrt(self.attention_head_size)
        attention = attention + attention_mask
        attention_value = F.softmax(attention, axis=-1)
        attention_value = self.dropout(attention_value)

        if head_mask is not None:
            attention_value = attention_value * head_mask

        # merge the heads back into one axis of width all_head_size
        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
                                                                        3))
        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
        new_context_shape = [
            ctx_dim1,
            ctx_dim2,
            self.all_head_size,
        ]
        context = context.reshape(new_context_shape)

        if self.output_attentions:
            return (context, attention_value)
        else:
            return (context, )
class BertAttention(nn.Layer):
    """Self-attention followed by a linear layer, dropout and a residual LayerNorm.

    Args:
        hidden_size (int): Feature width.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate inside the
            self-attention.
        fc_dropout_prob (float): Dropout rate after the linear layer.
        output_attentions (bool): Forwarded to the self-attention.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 output_attentions=False):
        super(BertAttention, self).__init__()
        self.bert_selfattention = BertSelfAttention(
            hidden_size, num_attention_heads, attention_probs_dropout_prob,
            output_attentions)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)`` or ``(features, attention_probs)``."""
        attention_feats = self.bert_selfattention(x, attention_mask, head_mask)
        features = self.fc(attention_feats[0])
        features = self.dropout(features)
        features = self.layernorm(features + x)
        if len(attention_feats) == 2:
            return (features, attention_feats[1])
        else:
            return (features, )


class BertFeedForward(nn.Layer):
    """Two-layer feed-forward network with a residual LayerNorm.

    Args:
        hidden_size (int): Input and output width.
        intermediate_size (int): Width of the hidden layer.
        num_attention_heads: Unused; kept for signature compatibility.
        attention_probs_dropout_prob: Unused; kept for signature
            compatibility.
        fc_dropout_prob (float): Dropout rate after the second linear layer.
        act_fn (str): Name of the activation, evaluated in this module's
            namespace. It may name a function or an ``nn.Layer`` class.
        output_attentions: Unused; kept for signature compatibility.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertFeedForward, self).__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        # NOTE(review): eval() executes the string, so act_fn must only come
        # from trusted configuration.
        activation = eval(act_fn)
        # A name such as 'ReLU' resolves to a Layer class. It has to be
        # instantiated, otherwise forward() would construct a new layer from
        # the feature tensor instead of applying the activation.
        if isinstance(activation, type) and issubclass(activation, nn.Layer):
            activation = activation()
        self.act_fn = activation
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x):
        """Return ``layernorm(dropout(fc2(act(fc1(x)))) + x)``."""
        features = self.fc1(x)
        features = self.act_fn(features)
        features = self.fc2(features)
        features = self.dropout(features)
        features = self.layernorm(features + x)
        return features


class BertLayer(nn.Layer):
    """One encoder layer: attention block followed by the feed-forward block.

    Args:
        hidden_size (int): Feature width.
        intermediate_size (int): Hidden width of the feed-forward block.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate on attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after the linear layers.
        act_fn (str): Activation name for the feed-forward block.
        output_attentions (bool): If True, ``forward`` also returns the
            attention probabilities.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertLayer, self).__init__()
        # The original call omitted fc_dropout_prob, so output_attentions
        # landed in BertAttention's fc_dropout_prob slot: the configured
        # dropout rate was lost and attentions were never returned.
        self.attention = BertAttention(hidden_size, num_attention_heads,
                                       attention_probs_dropout_prob,
                                       fc_dropout_prob, output_attentions)
        self.feed_forward = BertFeedForward(
            hidden_size, intermediate_size, num_attention_heads,
            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
            output_attentions)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)`` or ``(features, attention_probs)``."""
        attention_feats = self.attention(x, attention_mask, head_mask)
        features = self.feed_forward(attention_feats[0])
        if len(attention_feats) == 2:
            return (features, attention_feats[1])
        else:
            return (features, )
class BertEncoder(nn.Layer):
    """Stack of ``BertLayer`` modules.

    Args:
        num_hidden_layers (int): Number of layers in the stack.
        hidden_size (int): Feature width.
        intermediate_size (int): Hidden width of each feed-forward block.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate on attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after the linear layers.
        act_fn (str): Activation name for the feed-forward blocks.
        output_attentions (bool): If True, collect every layer's attention
            probabilities.
        output_hidden_feats (bool): If True, collect the input and every
            layer's output.
    """

    def __init__(self,
                 num_hidden_layers,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(BertEncoder, self).__init__()
        self.output_attentions = output_attentions
        self.output_hidden_feats = output_hidden_feats
        self.layers = nn.LayerList([
            BertLayer(hidden_size, intermediate_size, num_attention_heads,
                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
                      output_attentions) for _ in range(num_hidden_layers)
        ])

    def forward(self, x, attention_mask, head_mask=None):
        """Run all layers.

        Args:
            x: Input features.
            attention_mask: Additive attention mask passed to every layer.
            head_mask: Optional per-layer sequence of head masks.
        Returns:
            tuple: ``(x,)`` followed by the tuple of hidden features when
            ``output_hidden_feats`` is set, then by the tuple of attention
            probabilities when ``output_attentions`` is set.
        """
        all_features = (x, )
        all_attentions = ()

        for i, layer in enumerate(self.layers):
            mask = head_mask[i] if head_mask is not None else None
            layer_out = layer(x, attention_mask, mask)

            # Update x first. The original appended x before the update, so
            # the input was recorded twice and the last layer's output never.
            x = layer_out[0]
            if self.output_hidden_feats:
                all_features = all_features + (x, )
            if self.output_attentions:
                all_attentions = all_attentions + (layer_out[1], )

        outputs = (x, )
        if self.output_hidden_feats:
            outputs += (all_features, )
        if self.output_attentions:
            outputs += (all_attentions, )
        return outputs


class BertPooler(nn.Layer):
    """Linear layer plus tanh applied to the first element along axis 1."""

    def __init__(self, hidden_size):
        super(BertPooler, self).__init__()
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.act = nn.Tanh()

    def forward(self, x):
        """Return ``tanh(fc(x[:, 0]))``."""
        first_token = x[:, 0]
        pooled_output = self.fc(first_token)
        pooled_output = self.act(pooled_output)
        return pooled_output
class METROEncoder(nn.Layer):
    """Transformer encoder over image features with a residual output head.

    Input features are projected to ``hidden_size``, summed with a learned
    position embedding, run through a ``BertEncoder`` and mapped to
    ``output_feature_dim``. A linear projection of the raw input is added to
    that result.

    Args:
        vocab_size (int): Size of the word table of the (unused in forward)
            ``BertEmbeddings`` sub-layer.
        num_hidden_layers (int): Number of encoder layers.
        features_dims (int): Width of the input features.
        position_embeddings_size (int): Number of learned positions.
        hidden_size (int): Width inside the encoder.
        intermediate_size (int): Hidden width of the feed-forward blocks.
        output_feature_dim (int): Width of the output.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate on attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after linear layers and on the
            embeddings.
        act_fn (str): Activation name for the feed-forward blocks.
        output_attentions (bool): Ask the encoder for attention
            probabilities.
        output_hidden_feats (bool): Ask the encoder for hidden features.
        use_img_layernorm (bool): Apply a LayerNorm to the summed embeddings.
    """

    def __init__(self,
                 vocab_size,
                 num_hidden_layers,
                 features_dims,
                 position_embeddings_size,
                 hidden_size,
                 intermediate_size,
                 output_feature_dim,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False,
                 use_img_layernorm=False):
        super(METROEncoder, self).__init__()
        self.img_dims = features_dims
        self.num_hidden_layers = num_hidden_layers
        self.use_img_layernorm = use_img_layernorm
        self.output_attentions = output_attentions
        # forward() reads this flag; the original never stored it, which
        # raised AttributeError whenever output_attentions was True.
        self.output_hidden_feats = output_hidden_feats
        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
                                        hidden_size, fc_dropout_prob)
        self.encoder = BertEncoder(
            num_hidden_layers, hidden_size, intermediate_size,
            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
            act_fn, output_attentions, output_hidden_feats)
        self.pooler = BertPooler(hidden_size)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.img_embedding = nn.Linear(
            features_dims, hidden_size, bias_attr=True)
        self.dropout = nn.Dropout(fc_dropout_prob)
        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
        self.residual = nn.Linear(features_dims, output_feature_dim)
        if self.use_img_layernorm:
            # forward() applies self.layernorm when this flag is set, but the
            # layer was never created. It is only built on request so the
            # default parameter set stays unchanged.
            # NOTE(review): epsilon follows the other LayerNorms of this
            # file - confirm against the reference implementation.
            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)

        self.apply(self.init_weights)

    def init_weights(self, module):
        """ Initialize the weights.

        Linear and Embedding weights are drawn from N(0, 0.02), LayerNorm is
        reset to zero bias and unit weight, and Linear biases are zeroed.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.set_value(
                paddle.normal(
                    mean=0.0, std=0.02, shape=module.weight.shape))
        elif isinstance(module, nn.LayerNorm):
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
            module.weight.set_value(
                paddle.full(
                    shape=module.weight.shape, fill_value=1.0))
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))

    def forward(self, x):
        """Encode the feature tensor ``x``.

        Args:
            x: Feature tensor whose first two axes are (batch, sequence) and
                whose last axis has width ``features_dims``.
        Returns:
            The prediction tensor, or ``(prediction, encoder_outputs[1],
            encoder_outputs[-1])`` when both ``output_attentions`` and
            ``output_hidden_feats`` are set.
        """
        batchsize, seq_len = paddle.shape(x)[:2]
        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
        position_ids = paddle.arange(
            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)

        # every position is attended to, so the additive mask is all zeros
        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
        head_mask = [None] * self.num_hidden_layers

        position_embs = self.position_embeddings(position_ids)
        attention_mask = (1.0 - attention_mask) * -10000.0

        img_features = self.img_embedding(x)

        # We empirically observe that adding an additional learnable position embedding leads to more stable training
        embeddings = position_embs + img_features
        if self.use_img_layernorm:
            embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)

        encoder_outputs = self.encoder(
            embeddings, attention_mask, head_mask=head_mask)

        pred_score = self.cls_head(encoder_outputs[0])
        res_img_feats = self.residual(x)
        pred_score = pred_score + res_img_feats

        if self.output_attentions and self.output_hidden_feats:
            return pred_score, encoder_outputs[1], encoder_outputs[-1]
        else:
            return pred_score
def gelu(x):
    """Implementation of the gelu activation function.
        https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))


@register
class TransEncoder(nn.Layer):
    """Sequence of ``METROEncoder`` stages with shrinking feature width.

    Stage ``i`` maps features of width ``input_feat_dim[i]`` to the width of
    the next stage's input; the last stage outputs width 3.

    Args:
        vocab_size (int): Passed to every stage.
        num_hidden_layers (int): Encoder layers per stage.
        num_attention_heads (int): Attention heads per stage; every entry of
            ``hidden_feat_dim`` must be divisible by it.
        position_embeddings_size (int): Number of learned positions.
        intermediate_size (int): Hidden width of the feed-forward blocks.
        input_feat_dim (list | tuple): Input width of each stage.
        hidden_feat_dim (list | tuple): Encoder width of each stage.
        attention_probs_dropout_prob (float): Dropout rate on attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after linear layers.
        act_fn (str): Activation name for the feed-forward blocks.
        output_attentions (bool): Passed to every stage.
        output_hidden_feats (bool): Passed to every stage.
    """

    def __init__(self,
                 vocab_size=30522,
                 num_hidden_layers=4,
                 num_attention_heads=4,
                 position_embeddings_size=512,
                 intermediate_size=3072,
                 input_feat_dim=[2048, 512, 128],
                 hidden_feat_dim=[1024, 256, 128],
                 attention_probs_dropout_prob=0.1,
                 fc_dropout_prob=0.1,
                 act_fn='gelu',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(TransEncoder, self).__init__()
        # list(...) lets a tuple be passed as well; slicing a tuple and
        # adding a list raised TypeError.
        output_feat_dim = list(input_feat_dim[1:]) + [3]
        trans_encoder = []
        for i, output_feature_dim in enumerate(output_feat_dim):
            features_dims = input_feat_dim[i]
            hidden_size = hidden_feat_dim[i]

            # init a transformer encoder and append it to a list
            assert hidden_size % num_attention_heads == 0
            model = METROEncoder(vocab_size, num_hidden_layers, features_dims,
                                 position_embeddings_size, hidden_size,
                                 intermediate_size, output_feature_dim,
                                 num_attention_heads,
                                 attention_probs_dropout_prob, fc_dropout_prob,
                                 act_fn, output_attentions, output_hidden_feats)
            trans_encoder.append(model)
        self.trans_encoder = paddle.nn.Sequential(*trans_encoder)

    def forward(self, x):
        """Run ``x`` through all stages in order."""
        out = self.trans_encoder(x)
        return out
def drop_path(x, drop_prob=0., training=False):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...

    Args:
        x: Input tensor; the first axis indexes samples.
        drop_prob (float | None): Probability of dropping a sample's path.
            ``None`` and ``0`` both disable dropping.
        training (bool): Paths are only dropped when this is True.
    Returns:
        ``x`` itself when dropping is disabled. Otherwise ``x`` divided by
        the keep probability and multiplied by a random 0/1 factor per
        sample.
    """
    # None has to be treated like 0: DropPath defaults drop_prob to None,
    # and the arithmetic below would otherwise fail on `1 - None`.
    if drop_prob is None or drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
    # one random value per sample, broadcast over all remaining axes
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output
class DropPath(nn.Layer):
    """Layer wrapper around ``drop_path`` using the layer's training flag."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        rate, is_training = self.drop_prob, self.training
        return drop_path(x, rate, is_training)


class Identity(nn.Layer):
    """Layer that returns its input unchanged."""

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input


# common funcs


def to_2tuple(x):
    """Return lists and tuples unchanged; duplicate anything else into a pair."""
    return x if isinstance(x, (list, tuple)) else (x, x)


def add_parameter(layer, datas, name=None):
    """Create a parameter on ``layer`` initialised with ``datas``.

    The parameter is additionally registered under ``name`` when a non-empty
    name is given. The created parameter is returned.
    """
    created = layer.create_parameter(
        shape=(datas.shape), default_initializer=Assign(datas))
    if not name:
        return created
    layer.add_parameter(name, created)
    return created
def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
        (num_h, num_w): number of windows along the height and the width
    """
    B, H, W, C = paddle.shape(x)

    # amount needed to reach the next multiple of window_size (0 if aligned)
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    # The tensor is moved to channels-first for F.pad and moved back after.
    # NOTE(review): the 4-element pad list is assumed to address the last
    # two axes (W, then H) - confirm against paddle's F.pad documentation.
    x = F.pad(x.transpose([0, 3, 1, 2]),
              paddle.to_tensor(
                  [0, int(pad_w), 0, int(pad_h)],
                  dtype='int32')).transpose([0, 2, 3, 1])
    Hp, Wp = H + pad_h, W + pad_w

    num_h, num_w = Hp // window_size, Wp // window_size

    x = x.reshape([B, num_h, window_size, num_w, window_size, C])
    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
        [-1, window_size, window_size, C])
    return windows, (Hp, Wp), (num_h, num_w)


def window_unpartition(x, pad_hw, num_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        pad_hw (Tuple): padded height and width (Hp, Wp).
        num_hw (Tuple): number of windows along height and width
            (num_h, num_w), as returned by ``window_partition``.
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    num_h, num_w = num_hw
    H, W = hw
    B, window_size, _, C = paddle.shape(x)
    # the leading axis holds batch * windows; recover the batch size
    B = B // (num_h * num_w)
    x = x.reshape([B, num_h, num_w, window_size, window_size, C])
    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])

    # crop the padding away
    return x[:, :H, :W, :]
class Mlp(nn.Layer):
    """Two linear layers with an activation in between and shared dropout.

    Args:
        in_features (int): Input width.
        hidden_features (int, optional): Hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int, optional): Output width; falls back to
            ``in_features`` when falsy.
        act_layer: Callable returning the activation layer. Default: nn.GELU
        drop (float): Dropout rate used after both linear layers.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class Attention(nn.Layer):
    """Multi-head self-attention with an optional relative position bias.

    Args:
        dim (int): Feature width.
        num_heads (int): Number of attention heads. Default: 8
        qkv_bias (bool): If True, create learnable biases for the query and
            value projections. Default: False
        qk_scale (float | None): Score scale; ``head_dim ** -0.5`` is used
            when falsy. Default: None
        attn_drop (float): Dropout rate on the attention probabilities.
        proj_drop (float): Dropout rate after the output projection.
        window_size (tuple | None): If given, a relative position bias table
            and the matching index buffer are created for a window of this
            size plus one extra token. Default: None
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 window_size=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # The projection itself has no bias; the optional q/v biases are
        # kept as separate parameters and assembled in forward().
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)

        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        if window_size:
            self.window_size = window_size
            # the "+ 3" reserves three extra table rows, see the comment below
            self.num_relative_distance = (2 * window_size[0] - 1) * (
                2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = self.create_parameter(
                shape=(self.num_relative_distance, num_heads),
                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

            # get pair-wise relative position index for each token inside the window
            coords_h = paddle.arange(window_size[0])
            coords_w = paddle.arange(window_size[1])
            coords = paddle.stack(paddle.meshgrid(
                [coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
            coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
            coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
            relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
            )

            #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh
            relative_coords = relative_coords.transpose(
                (1, 2, 0))  #.contiguous() # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[
                0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            # Row and column 0 of the index belong to the extra token; they
            # point at the last three rows of the bias table.
            relative_position_index = \
                paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(
                -1)  # Wh*Ww, Wh*Ww
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index",
                                 relative_position_index)
            # trunc_normal_(self.relative_position_bias_table, std=.0)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None):
        """Apply self-attention.

        Args:
            x: Input tensor; axis 1 is the token axis and axis 2 the feature
                axis.
            rel_pos_bias: Optional additive bias applied to the attention
                scores in addition to the layer's own bias table.
        Returns:
            Tensor reshaped to (-1, N, C), with N and C taken from the input
            shape.
        """
        x_shape = paddle.shape(x)
        N, C = x_shape[1], x_shape[2]

        qkv_bias = None
        if self.q_bias is not None:
            # the key part of the bias is fixed at zero
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
        qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)

        qkv = qkv.reshape((-1, N, 3, self.num_heads,
                           C // self.num_heads)).transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale

        if self.relative_position_bias_table is not None:
            relative_position_bias = self.relative_position_bias_table[
                self.relative_position_index.reshape([-1])].reshape([
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1, -1
                ])  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.transpose(
                (2, 0, 1))  #.contiguous() # nH, Wh*Ww, Wh*Ww
            attn = attn + relative_position_bias.unsqueeze(0)
        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
class RelativePositionBias(nn.Layer):
    """Learned relative position bias over a (Wh, Ww) token grid plus one
    extra leading token.

    The table holds ``(2*Wh-1)*(2*Ww-1)`` entries for token-to-token offsets
    and three more for cls->token, token->cls and cls->cls.

    Args:
        window_size: ``(Wh, Ww)`` size of the token grid.
        num_heads: number of attention heads (one bias value per head).
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_relative_distance = (2 * window_size[0] - 1) * (
            2 * window_size[1] - 1) + 3
        # Fixed: the keyword is `default_initializer`, not `default_initialize`.
        self.relative_position_bias_table = self.create_parameter(
            shape=(self.num_relative_distance, num_heads),
            default_initializer=zeros_)
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(window_size[0])
        coords_w = paddle.arange(window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = coords.flatten(1)  # 2, Wh*Ww

        relative_coords = (coords_flatten[:, :, None] -
                           coords_flatten[:, None, :])  # 2, Wh*Ww, Wh*Ww
        # Fixed: `transpos` was a typo for `transpose`.
        relative_coords = relative_coords.transpose(
            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1

        # Fixed: paddle.zeros takes `shape=`, not `size=`.
        side = window_size[0] * window_size[1] + 1
        relative_position_index = paddle.zeros(
            shape=(side, ) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(
            -1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1
        self.register_buffer("relative_position_index",
                             relative_position_index)

    def forward(self):
        """Return the bias gathered from the table, laid out as
        ``(num_heads, Wh*Ww+1, Wh*Ww+1)``."""
        side = self.window_size[0] * self.window_size[1] + 1
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.reshape([-1])].reshape(
                [side, side, -1])  # Wh*Ww,Wh*Ww,nH
        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
@register
@serializable
class VisionTransformer(nn.Layer):
    """Vision Transformer backbone with support for patch input.

    The image is split into patches, a cls token is prepended, an optional
    positional embedding is added, and the tokens run through ``depth``
    transformer blocks. Feature maps are collected after the blocks listed in
    ``out_indices`` and, when ``with_fpn`` is set, passed through simple
    up/down-sampling heads (``fpn1`` .. ``fpn4``).

    Args:
        img_size: reference image size used to size the positional embedding;
            ``img_size[0]`` and ``img_size[1]`` are each divided by
            ``patch_size``.
        patch_size: side of a square patch; also the conv stride.
        in_chans: number of input image channels.
        embed_dim: token channel width.
        depth: number of transformer blocks.
        num_heads: attention heads per block.
        mlp_ratio: hidden width of the MLP relative to ``embed_dim``.
        qkv_bias, qk_scale, drop_rate, attn_drop_rate, norm_layer,
            init_values, epsilon: forwarded to every ``Block``.
        drop_path_rate: final stochastic-depth rate; rates are linearly
            spaced from 0 over the blocks.
        use_rel_pos_bias: give every block its own relative position bias.
        use_shared_rel_pos_bias: create one ``RelativePositionBias`` whose
            output is passed to all blocks.
        final_norm: stored on the instance; not otherwise read in this class.
        pretrained: local path or URL of weights to load, or None.
        out_indices: block indices whose outputs are returned (at most 4).
        use_abs_pos_emb: use a learnable absolute positional embedding.
        use_sincos_pos_emb: otherwise use a frozen 2D sin-cos embedding.
        with_fpn: build and apply the fpn heads.
        num_fpn_levels: how many of the last fpn heads to apply (at most 4).
        use_checkpoint: recompute blocks during training to save memory.
    """

    def __init__(self,
                 img_size=[672, 1092],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer='nn.LayerNorm',
                 init_values=None,
                 use_rel_pos_bias=False,
                 use_shared_rel_pos_bias=False,
                 epsilon=1e-5,
                 final_norm=False,
                 pretrained=None,
                 out_indices=[3, 5, 7, 11],
                 use_abs_pos_emb=False,
                 use_sincos_pos_emb=True,
                 with_fpn=True,
                 num_fpn_levels=4,
                 use_checkpoint=False,
                 **args):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.with_fpn = with_fpn
        self.use_checkpoint = use_checkpoint
        self.use_sincos_pos_emb = use_sincos_pos_emb
        self.use_rel_pos_bias = use_rel_pos_bias
        self.final_norm = final_norm
        self.out_indices = out_indices
        self.num_fpn_levels = num_fpn_levels

        if use_checkpoint:
            paddle.seed(0)

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim)

        self.pos_w = self.patch_embed.num_patches_in_w
        self.pos_h = self.patch_embed.num_patches_in_h

        self.cls_token = self.create_parameter(
            shape=(1, 1, embed_dim),
            default_initializer=paddle.nn.initializer.Constant(value=0.))

        if use_abs_pos_emb:
            self.pos_embed = self.create_parameter(
                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
                default_initializer=paddle.nn.initializer.TruncatedNormal(
                    std=.02))
        elif use_sincos_pos_emb:
            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)

            # Stored as a parameter so it is part of the state dict, but
            # frozen: the sin-cos table is not trained.
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
            self.pos_embed.set_value(pos_embed.numpy())
            self.pos_embed.stop_gradient = True
        else:
            self.pos_embed = None

        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(
                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        dpr = np.linspace(0, drop_path_rate, depth)

        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                init_values=init_values,
                window_size=self.patch_embed.patch_shape
                if use_rel_pos_bias else None,
                epsilon=epsilon) for i in range(depth)
        ])

        # Weights are loaded before the fpn heads exist, so only the
        # transformer parameters defined above can be restored here.
        self.pretrained = pretrained
        self.init_weight()

        assert len(out_indices) <= 4, \
            'out_indices must contain at most 4 entries'
        self.out_indices = out_indices
        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
            patch_size for _ in range(len(out_indices))
        ]

        self.norm = Identity()

        if self.with_fpn:
            assert num_fpn_levels <= 4, 'num_fpn_levels must be at most 4'
            self.init_fpn(
                embed_dim=embed_dim,
                patch_size=patch_size, )

    def init_weight(self):
        """Load ``self.pretrained`` (local path or URL) into the model.

        A ``pos_embed`` entry whose shape differs from the model's own
        embedding is resized to ``(pos_h, pos_w)`` before loading.
        """
        pretrained = self.pretrained
        if not pretrained:
            return

        if 'http' in pretrained:  # URL
            path = paddle.utils.download.get_weights_path_from_url(pretrained)
        else:  # model in local path
            path = pretrained

        load_state_dict = paddle.load(path)
        pos_embed_name = "pos_embed"

        # Guard against models built without any positional embedding: the
        # checkpoint may still carry one, and self.pos_embed is then None.
        if pos_embed_name in load_state_dict and self.pos_embed is not None:
            load_pos_embed = paddle.to_tensor(
                load_state_dict[pos_embed_name], dtype="float32")
            if self.pos_embed.shape != load_pos_embed.shape:
                # The checkpoint grid is taken to be square; -1 drops the cls slot.
                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
                load_state_dict[pos_embed_name] = self.resize_pos_embed(
                    load_pos_embed, (pos_size, pos_size),
                    (self.pos_h, self.pos_w))

                print("Load pos_embed and resize it from {} to {} .".format(
                    load_pos_embed.shape, self.pos_embed.shape))

        self.set_state_dict(load_state_dict)
        print("Load load_state_dict....")

    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
        """Create ``fpn1`` .. ``fpn4`` and the output norm.

        Raises:
            ValueError: if ``patch_size`` is neither 16 nor 8. No heads are
                defined for other sizes, so ``forward`` could not run.
        """
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.BatchNorm2D(embed_dim),
                nn.GELU(),
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn3 = Identity()

            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = Identity()

            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )

            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
        else:
            raise ValueError(
                "init_fpn supports patch_size 8 or 16, got {}".format(
                    patch_size))

        if not out_with_norm:
            self.norm = Identity()
        else:
            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)

    def interpolate_pos_encoding(self, x, w, h):
        """Return ``self.pos_embed`` resized to the token grid of ``x``.

        ``w`` and ``h`` are input extents in pixels. ``forward`` passes
        ``(h, w)`` in that order, so ``w`` here is matched against
        ``num_patches_w`` (derived from ``img_size[0]``) and ``h`` against
        ``num_patches_h`` (derived from ``img_size[1]``).
        """
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        if (npatch == N and w0 == self.patch_embed.num_patches_w and
                h0 == self.patch_embed.num_patches_h):
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape([
                1, self.patch_embed.num_patches_w,
                self.patch_embed.num_patches_h, dim
            ]).transpose((0, 3, 1, 2)),
            (w0, h0),
            mode='bicubic', )

        assert int(w0) == patch_pos_embed.shape[-2] and int(
            h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.transpose(
            (0, 2, 3, 1)).reshape([1, -1, dim])
        return paddle.concat(
            (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)

    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
        """
        Resize pos_embed weight.
        Args:
            pos_embed (Tensor): the pos_embed weight
            old_hw (list[int]): the height and width of old pos_embed
            new_hw (list[int]): the height and width of new pos_embed
        Returns:
            Tensor: the resized pos_embed weight
        """
        # The first entry belongs to the cls token and is kept untouched.
        cls_pos_embed = pos_embed[:, :1, :]
        pos_embed = pos_embed[:, 1:, :]

        pos_embed = pos_embed.transpose([0, 2, 1])
        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
        pos_embed = F.interpolate(
            pos_embed, new_hw, mode='bicubic', align_corners=False)
        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)

        return pos_embed

    def build_2d_sincos_position_embedding(
            self,
            embed_dim=768,
            temperature=10000., ):
        """Build a ``(1, h*w + 1, embed_dim)`` sin-cos table; the leading
        row (the cls slot) is all zeros."""
        h, w = self.patch_embed.patch_shape
        grid_w = paddle.arange(w, dtype=paddle.float32)
        grid_h = paddle.arange(h, dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**omega)

        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        pos_emb = paddle.concat(
            [
                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
                paddle.cos(out_h)
            ],
            axis=1)[None, :, :]

        pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
        pos_embed = paddle.concat([pe_token, pos_emb], axis=1)

        return pos_embed

    def forward(self, x):
        """Run the backbone.

        Args:
            x: an image tensor, or a dict holding it under ``'image'``.

        Returns:
            A list of feature maps: the fpn outputs when ``with_fpn`` is
            set, otherwise the block outputs collected at ``out_indices``.
        """
        x = x['image'] if isinstance(x, dict) else x
        _, _, h, w = x.shape

        x = self.patch_embed(x)

        B, D, Hp, Wp = x.shape  # b * c * h * w

        cls_tokens = self.cls_token.expand(
            (B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
        x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
        x = paddle.concat([cls_tokens, x], axis=1)

        if self.pos_embed is not None:
            x = x + self.interpolate_pos_encoding(x, h, w)

        x = self.pos_drop(x)

        rel_pos_bias = (self.rel_pos_bias()
                        if self.rel_pos_bias is not None else None)

        feats = []
        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    blk, x, rel_pos_bias, **{"preserve_rng_state": True})
            else:
                x = blk(x, rel_pos_bias)

            if idx in self.out_indices:
                # Drop the cls token and fold the tokens back into a map.
                xp = paddle.reshape(
                    paddle.transpose(
                        self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
                    shape=[B, D, Hp, Wp])
                feats.append(xp)

        if self.with_fpn:
            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
                -self.num_fpn_levels:]
            assert len(fpns) == len(feats) or len(feats) == 1, \
                'need one feature per fpn level, or a single shared feature'
            one_to_one = len(feats) == len(fpns)
            # With a single collected feature, every head consumes it.
            return [
                m(feats[i] if one_to_one else feats[-1])
                for i, m in enumerate(fpns)
            ]

        return feats

    @property
    def num_layers(self):
        return len(self.blocks)

    @property
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self.out_channels, self.out_strides)
        ]
class Attention(nn.Layer):
    """Multi-head self-attention over a 2D token grid, with optional
    decomposed relative position embeddings.

    Args:
        dim: channel width of the input.
        num_heads: number of attention heads.
        qkv_bias: create separate learnable q and v biases (k gets zeros).
            When set, ``forward`` uses them and ignores any bias owned by
            the fused ``qkv`` layer.
        attn_bias: give the fused ``qkv`` linear layer its own bias.
        attn_drop: dropout probability on the attention weights.
        proj_drop: accepted for interface compatibility; no dropout is
            applied after the projection in this class.
        use_rel_pos: add decomposed relative position terms to the logits.
        rel_pos_zero_init: keep the relative position tables at zero;
            otherwise they are drawn from a truncated normal (std 0.02).
        window_size: side of the attention window; falls back to
            ``input_size[0]`` when None.
        input_size: ``(h, w)`` of the token grid.
        qk_scale: replaces the default ``head_dim ** -0.5`` when truthy.
        lr_factor: learning-rate multiplier for this layer's parameters.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 attn_bias=False,
                 attn_drop=0.,
                 proj_drop=0.,
                 use_rel_pos=False,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 qk_scale=None,
                 lr_factor=1.0):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim**-0.5
        self.use_rel_pos = use_rel_pos
        self.input_size = input_size
        self.rel_pos_zero_init = rel_pos_zero_init
        self.window_size = window_size
        self.lr_factor = lr_factor

        self.qkv = nn.Linear(
            dim,
            dim * 3,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor)
            if attn_bias else False)
        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        self.proj = nn.Linear(
            dim,
            dim,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        self.attn_drop = nn.Dropout(attn_drop)
        if window_size is None:
            self.window_size = self.input_size[0]

        self._init_weights()

    def _init_weights(self):
        """Initialise the linear layers and, if enabled, the rel-pos tables."""
        linear_init_(self.qkv)
        linear_init_(self.proj)

        if self.use_rel_pos:
            self.rel_pos_h = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))
            self.rel_pos_w = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))

            if not self.rel_pos_zero_init:
                # Fixed: the initializer must be built first and then called
                # on the parameter. Passing the parameter to the constructor
                # never touched its values.
                trunc_normal_ = TruncatedNormal(std=0.02)
                trunc_normal_(self.rel_pos_h)
                trunc_normal_(self.rel_pos_w)

    def get_rel_pos(self, seq_size, rel_pos):
        """Gather a ``(seq_size, seq_size, head_dim)`` table of relative
        embeddings, linearly resizing ``rel_pos`` if its length differs
        from ``2 * seq_size - 1``."""
        max_rel_dist = int(2 * seq_size - 1)
        # Interpolate rel pos if needed.
        if rel_pos.shape[0] != max_rel_dist:
            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
            rel_pos = rel_pos.transpose([0, 2, 1])
            rel_pos_resized = F.interpolate(
                rel_pos,
                size=(max_rel_dist, ),
                mode="linear",
                data_format='NCW')
            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
            rel_pos_resized = rel_pos_resized.transpose([1, 0])
        else:
            rel_pos_resized = rel_pos

        coords = paddle.arange(seq_size, dtype='float32')
        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
        # Shift offsets from [-(n-1), n-1] to [0, 2n-2] so they index rows.
        relative_coords += (seq_size - 1)
        relative_coords = relative_coords.astype('int64').flatten()

        return paddle.index_select(rel_pos_resized, relative_coords).reshape(
            [seq_size, seq_size, self.head_dim])

    def add_decomposed_rel_pos(self, attn, q, h, w):
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        Args:
            attn (Tensor): attention map.
            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
            h (int): height of the token grid.
            w (int): width of the token grid.
        Returns:
            attn (Tensor): attention map with added relative positional embeddings.
        """
        Rh = self.get_rel_pos(h, self.rel_pos_h)
        Rw = self.get_rel_pos(w, self.rel_pos_w)

        B, _, dim = q.shape
        r_q = q.reshape([B, h, w, dim])
        # bhwc, hch->bhwh1
        # bwhc, wcw->bhw1w
        rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
        rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)

        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
        return attn.reshape([B, h * w, h * w])

    def forward(self, x):
        """Apply attention to ``x`` of shape (B, H, W, C); same shape out."""
        B, H, W, C = paddle.shape(x)

        if self.q_bias is not None:
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
        else:
            qkv = self.qkv(x)

        # Fixed: the split into (3, B*heads, H*W, head_dim) must happen on
        # both paths. It used to be skipped when q/v biases were enabled.
        qkv = qkv.reshape(
            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
                [2, 0, 3, 1, 4]).reshape(
                    [3, B * self.num_heads, H * W, self.head_dim])

        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale

        if self.use_rel_pos:
            attn = self.add_decomposed_rel_pos(attn, q, H, W)

        attn = F.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)
        x = attn.matmul(v).reshape(
            [B, self.num_heads, H * W, self.head_dim]).transpose(
                [0, 2, 1, 3]).reshape([B, H, W, C])
        x = self.proj(x)
        return x
class PatchEmbed(nn.Layer):
    """Image to Patch Embedding.

    A single strided convolution whose kernel and stride both equal
    ``patch_size``, so each output position embeds one patch.

    Args:
        img_size: two-element reference image size.
        patch_size: side of a square patch.
        in_chans: number of input channels.
        embed_dim: number of output channels.
        lr_factor: learning-rate multiplier for the conv weight and bias.
    """

    def __init__(self,
                 img_size=(224, 224),
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 lr_factor=0.01):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size

        conv_weight_attr = ParamAttr(learning_rate=lr_factor)
        conv_bias_attr = ParamAttr(learning_rate=lr_factor)
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            weight_attr=conv_weight_attr,
            bias_attr=conv_bias_attr)

    @property
    def num_patches_in_h(self):
        # Patch count along the second entry of img_size.
        extent = self.img_size[1]
        return extent // self.patch_size

    @property
    def num_patches_in_w(self):
        # Patch count along the first entry of img_size.
        extent = self.img_size[0]
        return extent // self.patch_size

    def forward(self, x):
        """Project ``x`` with the patch convolution and return the result."""
        return self.proj(x)
self.patch_w) + + self.pos_embed = pos_embed + self.pos_embed = self.create_parameter(shape=pos_embed.shape) + self.pos_embed.set_value(pos_embed.numpy()) + self.pos_embed.stop_gradient = True + else: + self.pos_embed = None + + self.blocks = nn.LayerList([ + Block( + embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + attn_bias=attn_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=None + if i in self.global_attn_indexes else window_size, + input_size=[self.patch_h, self.patch_w], + act_layer=act_layer, + lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate), + norm_layer=norm_layer, + init_values=init_values, + epsilon=epsilon) for i in range(depth) + ]) + + assert len(out_indices) <= 4, 'out_indices out of bound' + self.out_indices = out_indices + self.pretrained = pretrained + self.init_weight() + + self.out_channels = [embed_dim for _ in range(len(out_indices))] + self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ + patch_size for _ in range(len(out_indices)) + ] + self.norm = Identity() + if self.with_fpn: + self.init_fpn( + embed_dim=embed_dim, + patch_size=patch_size, + out_with_norm=final_norm) + + def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate): + return lr_decay_rate**(self.depth - layer_id) + + def init_weight(self): + pretrained = self.pretrained + if pretrained: + if 'http' in pretrained: + path = paddle.utils.download.get_weights_path_from_url( + pretrained) + else: + path = pretrained + + load_state_dict = paddle.load(path) + model_state_dict = self.state_dict() + pos_embed_name = "pos_embed" + + if pos_embed_name in load_state_dict.keys( + ) and self.use_abs_pos_emb: + load_pos_embed = paddle.to_tensor( + load_state_dict[pos_embed_name], dtype="float32") + if self.pos_embed.shape != load_pos_embed.shape: + pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) + 
model_state_dict[pos_embed_name] = self.resize_pos_embed( + load_pos_embed, (pos_size, pos_size), + (self.pos_h, self.pos_w)) + + # self.set_state_dict(model_state_dict) + load_state_dict[pos_embed_name] = model_state_dict[ + pos_embed_name] + + print("Load pos_embed and resize it from {} to {} .".format( + load_pos_embed.shape, self.pos_embed.shape)) + + self.set_state_dict(load_state_dict) + print("Load load_state_dict....") + + def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.BatchNorm2D(embed_dim), + nn.GELU(), + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn3 = Identity() + + self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.Conv2DTranspose( + embed_dim, embed_dim, kernel_size=2, stride=2), ) + + self.fpn2 = Identity() + + self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) + + self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) + + if not out_with_norm: + self.norm = Identity() + else: + self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon) + + def resize_pos_embed(self, pos_embed, old_hw, new_hw): + """ + Resize pos_embed weight. 
+ Args: + pos_embed (Tensor): the pos_embed weight + old_hw (list[int]): the height and width of old pos_embed + new_hw (list[int]): the height and width of new pos_embed + Returns: + Tensor: the resized pos_embed weight + """ + cls_pos_embed = pos_embed[:, :1, :] + pos_embed = pos_embed[:, 1:, :] + + pos_embed = pos_embed.transpose([0, 2, 1]) + pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) + pos_embed = F.interpolate( + pos_embed, new_hw, mode='bicubic', align_corners=False) + pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) + pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) + + return pos_embed + + def get_2d_sincos_position_embedding(self, h, w, temperature=10000.): + grid_y, grid_x = paddle.meshgrid( + paddle.arange( + h, dtype=paddle.float32), + paddle.arange( + w, dtype=paddle.float32)) + assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = self.embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = (1. 
class LayerNorm(nn.Layer):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
    variance normalization over the channel dimension for inputs that have shape
    (batch_size, channels, height, width).
    Note that the modified LayerNorm is only used in ResBlock and SimpleFeaturePyramid.

    In ViT, we use the nn.LayerNorm

    Args:
        normalized_shape (int): number of channels (size of axis 1).
        eps (float): value added to the variance for numerical stability.
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        # Fixed: start as an identity affine transform (scale 1, shift 0).
        # Without an explicit initializer both parameters received the
        # framework's default weight initialization.
        self.weight = self.create_parameter(
            [normalized_shape], default_initializer=Constant(value=1.))
        self.bias = self.create_parameter(
            [normalized_shape],
            is_bias=True,
            default_initializer=Constant(value=0.))
        self.eps = eps
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalize ``x`` over axis 1, then apply the per-channel affine."""
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / paddle.sqrt(s + self.eps)
        # Broadcast the (C,) parameters over the trailing two axes.
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x
+ """ + super(SimpleFeaturePyramid, self).__init__() + + self.in_channels = in_channels[0] + self.out_channels = out_channels + self.num_levels = num_levels + + self.stages = [] + dim = self.in_channels + if num_levels == 4: + scale_factors = [2.0, 1.0, 0.5] + elif num_levels == 5: + scale_factors = [4.0, 2.0, 1.0, 0.5] + else: + raise NotImplementedError( + f"num_levels={num_levels} is not supported yet.") + + dim = in_channels[0] + for idx, scale in enumerate(scale_factors): + out_dim = dim + if scale == 4.0: + layers = [ + nn.Conv2DTranspose( + dim, dim // 2, kernel_size=2, stride=2), + nn.LayerNorm(dim // 2), + nn.GELU(), + nn.Conv2DTranspose( + dim // 2, dim // 4, kernel_size=2, stride=2), + ] + out_dim = dim // 4 + elif scale == 2.0: + layers = [ + nn.Conv2DTranspose( + dim, dim // 2, kernel_size=2, stride=2) + ] + out_dim = dim // 2 + elif scale == 1.0: + layers = [] + elif scale == 0.5: + layers = [nn.MaxPool2D(kernel_size=2, stride=2)] + + layers.extend([ + nn.Conv2D( + out_dim, + out_channels, + kernel_size=1, + bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D( + out_channels, + out_channels, + kernel_size=3, + padding=1, + bias_attr=use_bias, ), LayerNorm(out_channels) + ]) + layers = nn.Sequential(*layers) + + stage = -int(math.log2(spatial_scales[0] * scale_factors[idx])) + self.add_sublayer(f"simfp_{stage}", layers) + self.stages.append(layers) + + # top block output feature maps. + self.top_block = nn.Sequential( + nn.MaxPool2D( + kernel_size=1, stride=2, padding=0)) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + @property + def out_shape(self): + return [ + ShapeSpec(channels=self.out_channels) + for _ in range(self.num_levels) + ] + + def forward(self, feats): + """ + Args: + x: Tensor of shape (N,C,H,W). 
+ """ + features = feats[0] + results = [] + + for stage in self.stages: + results.append(stage(features)) + + top_block_in_feature = results[-1] + results.append(self.top_block(top_block_in_feature)) + assert self.num_levels == len(results) + + return results diff --git a/rtdetr_paddle/ppdet/modeling/bbox_utils.py b/rtdetr_paddle/ppdet/modeling/bbox_utils.py new file mode 100644 index 0000000..576cbbf --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/bbox_utils.py @@ -0,0 +1,607 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import numpy as np + + +def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]): + """Encode bboxes to deltas. + """ + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + wx, wy, ww, wh = weights + dx = wx * (tgt_ctr_x - src_ctr_x) / src_w + dy = wy * (tgt_ctr_y - src_ctr_y) / src_h + dw = ww * paddle.log(tgt_w / src_w) + dh = wh * paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + return deltas + + +def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): + """Decode deltas to boxes. 
Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. + Note: return tensor shape [n,1,4] + If you want to add a reshape, please add after the calling code instead of here. + """ + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + # Prevent sending too large values into paddle.exp() + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + + pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) + pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + if max_shape is not None: + pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( + min=0, max=max_shape[1]) + pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( + min=0, max=max_shape[0]) + return pred_boxes + + +def bbox2delta_v2(src_boxes, + tgt_boxes, + delta_mean=[0.0, 0.0, 0.0, 0.0], + delta_std=[1.0, 1.0, 1.0, 1.0]): + """Encode bboxes to deltas. + Modified from bbox2delta() which just use weight parameters to multiply deltas. 
+ """ + src_w = src_boxes[:, 2] - src_boxes[:, 0] + src_h = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_w + src_ctr_y = src_boxes[:, 1] + 0.5 * src_h + + tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] + tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] + tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w + tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h + + dx = (tgt_ctr_x - src_ctr_x) / src_w + dy = (tgt_ctr_y - src_ctr_y) / src_h + dw = paddle.log(tgt_w / src_w) + dh = paddle.log(tgt_h / src_h) + + deltas = paddle.stack((dx, dy, dw, dh), axis=1) + deltas = ( + deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std) + return deltas + + +def delta2bbox_v2(deltas, + boxes, + delta_mean=[0.0, 0.0, 0.0, 0.0], + delta_std=[1.0, 1.0, 1.0, 1.0], + max_shape=None, + ctr_clip=32.0): + """Decode deltas to bboxes. + Modified from delta2bbox() which just use weight parameters to be divided by deltas. + Used in YOLOFHead. + Note: return tensor shape [n,1,4] + If you want to add a reshape, please add after the calling code instead of here. 
+ """ + clip_scale = math.log(1000.0 / 16) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean) + dx = deltas[:, 0::4] + dy = deltas[:, 1::4] + dw = deltas[:, 2::4] + dh = deltas[:, 3::4] + + # Prevent sending too large values into paddle.exp() + dx = dx * widths.unsqueeze(1) + dy = dy * heights.unsqueeze(1) + if ctr_clip is not None: + dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip) + dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip) + dw = paddle.clip(dw, max=clip_scale) + dh = paddle.clip(dh, max=clip_scale) + else: + dw = dw.clip(min=-clip_scale, max=clip_scale) + dh = dh.clip(min=-clip_scale, max=clip_scale) + + pred_ctr_x = dx + ctr_x.unsqueeze(1) + pred_ctr_y = dy + ctr_y.unsqueeze(1) + pred_w = paddle.exp(dw) * widths.unsqueeze(1) + pred_h = paddle.exp(dh) * heights.unsqueeze(1) + + pred_boxes = [] + pred_boxes.append(pred_ctr_x - 0.5 * pred_w) + pred_boxes.append(pred_ctr_y - 0.5 * pred_h) + pred_boxes.append(pred_ctr_x + 0.5 * pred_w) + pred_boxes.append(pred_ctr_y + 0.5 * pred_h) + pred_boxes = paddle.stack(pred_boxes, axis=-1) + + if max_shape is not None: + pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( + min=0, max=max_shape[1]) + pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( + min=0, max=max_shape[0]) + return pred_boxes + + +def expand_bbox(bboxes, scale): + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) + bboxes_exp[:, 0] = x_c - w_half + bboxes_exp[:, 2] = x_c + w_half + bboxes_exp[:, 1] = y_c - h_half + bboxes_exp[:, 3] = y_c + h_half + + return bboxes_exp + + +def clip_bbox(boxes, im_shape): + h, w = im_shape[0], im_shape[1] + x1 = 
boxes[:, 0].clip(0, w) + y1 = boxes[:, 1].clip(0, h) + x2 = boxes[:, 2].clip(0, w) + y2 = boxes[:, 3].clip(0, h) + return paddle.stack([x1, y1, x2, y2], axis=1) + + +def nonempty_bbox(boxes, min_size=0, return_mask=False): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + mask = paddle.logical_and(h > min_size, w > min_size) + if return_mask: + return mask + keep = paddle.nonzero(mask).flatten() + return keep + + +def bbox_area(boxes): + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +def bbox_overlaps(boxes1, boxes2): + """ + Calculate overlaps between boxes1 and boxes2 + + Args: + boxes1 (Tensor): boxes with shape [M, 4] + boxes2 (Tensor): boxes with shape [N, 4] + + Return: + overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] + """ + M = boxes1.shape[0] + N = boxes2.shape[0] + if M * N == 0: + return paddle.zeros([M, N], dtype='float32') + area1 = bbox_area(boxes1) + area2 = bbox_area(boxes2) + + xy_max = paddle.minimum( + paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) + xy_min = paddle.maximum( + paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) + width_height = xy_max - xy_min + width_height = width_height.clip(min=0) + inter = width_height.prod(axis=2) + + overlaps = paddle.where(inter > 0, inter / + (paddle.unsqueeze(area1, 1) + area2 - inter), + paddle.zeros_like(inter)) + return overlaps + + +def batch_bbox_overlaps(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6): + """Calculate overlap between two set of bboxes. + If ``is_aligned `` is ``False``, then calculate the overlaps between each + bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned + pair of bboxes1 and bboxes2. + Args: + bboxes1 (Tensor): shape (B, m, 4) in format or empty. + bboxes2 (Tensor): shape (B, n, 4) in format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned `` is ``True``, then m and n must be equal. 
+ mode (str): "iou" (intersection over union) or "iof" (intersection over + foreground). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-6. + Returns: + Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) + """ + assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode) + # Either the boxes are empty or the length of boxes's last dimenstion is 4 + assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0) + assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 + cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return paddle.full(batch_shape + (rows, ), 1) + else: + return paddle.full(batch_shape + (rows, cols), 1) + + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + + if is_aligned: + lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2]) # [B, rows, 2] + rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:]) # [B, rows, 2] + + wh = (rb - lt).clip(min=0) # [B, rows, 2] + overlap = wh[:, 0] * wh[:, 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2]) + enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:]) + else: + lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]), + bboxes2[:, :2]) # [B, rows, cols, 2] + rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]), + bboxes2[:, 2:]) # [B, rows, cols, 2] + + wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] + overlap = wh[:, :, 0] * 
wh[:, :, 1] + + if mode in ['iou', 'giou']: + union = area1.reshape([rows,1]) \ + + area2.reshape([1,cols]) - overlap + else: + union = area1[:, None] + if mode == 'giou': + enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]), + bboxes2[:, :2]) + enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]), + bboxes2[:, 2:]) + + eps = paddle.to_tensor([eps]) + union = paddle.maximum(union, eps) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) + enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1] + enclose_area = paddle.maximum(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return 1 - gious + + +def xywh2xyxy(box): + x, y, w, h = box + x1 = x - w * 0.5 + y1 = y - h * 0.5 + x2 = x + w * 0.5 + y2 = y + h * 0.5 + return [x1, y1, x2, y2] + + +def make_grid(h, w, dtype): + yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)]) + return paddle.stack((xv, yv), 2).cast(dtype=dtype) + + +def decode_yolo(box, anchor, downsample_ratio): + """decode yolo box + + Args: + box (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + anchor (list): anchor with the shape [na, 2] + downsample_ratio (int): downsample ratio, default 32 + scale (float): scale, default 1. 
+ + Return: + box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1] + """ + x, y, w, h = box + na, grid_h, grid_w = x.shape[1:4] + grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2)) + x1 = (x + grid[:, :, :, :, 0:1]) / grid_w + y1 = (y + grid[:, :, :, :, 1:2]) / grid_h + + anchor = paddle.to_tensor(anchor, dtype=x.dtype) + anchor = anchor.reshape((1, na, 1, 1, 2)) + w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w) + h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h) + + return [x1, y1, w1, h1] + + +def batch_iou_similarity(box1, box2, eps=1e-9): + """Calculate iou of box1 and box2 in batch + + Args: + box1 (Tensor): box with the shape [N, M1, 4] + box2 (Tensor): box with the shape [N, M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] + """ + box1 = box1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + box2 = box2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4] + gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union + + +def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9): + """calculate the iou of box1 and box2 + + Args: + box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1] + giou (bool): whether use giou or not, default False + diou (bool): whether use diou or not, default False + ciou (bool): whether use ciou or not, default False + eps (float): epsilon to avoid divide by zero + + Return: + iou (Tensor): iou of box1 and box1, with the shape [b, na, h, w, 1] + """ + px1, py1, px2, py2 = box1 + gx1, gy1, gx2, gy2 = box2 + x1 = 
paddle.maximum(px1, gx1) + y1 = paddle.maximum(py1, gy1) + x2 = paddle.minimum(px2, gx2) + y2 = paddle.minimum(py2, gy2) + + overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0)) + + area1 = (px2 - px1) * (py2 - py1) + area1 = area1.clip(0) + + area2 = (gx2 - gx1) * (gy2 - gy1) + area2 = area2.clip(0) + + union = area1 + area2 - overlap + eps + iou = overlap / union + + if giou or ciou or diou: + # convex w, h + cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1) + ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1) + if giou: + c_area = cw * ch + eps + return iou - (c_area - union) / c_area + else: + # convex diagonal squared + c2 = cw**2 + ch**2 + eps + # center distance + rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4 + if diou: + return iou - rho2 / c2 + else: + w1, h1 = px2 - px1, py2 - py1 + eps + w2, h2 = gx2 - gx1, gy2 - gy1 + eps + delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2) + v = (4 / math.pi**2) * paddle.pow(delta, 2) + alpha = v / (1 + eps - iou + v) + alpha.stop_gradient = True + return iou - (rho2 / c2 + v * alpha) + else: + return iou + + +def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16): + """ + Calculate the iou of box1 and box2 with numpy. 
+ + Args: + box1 (ndarray): [N, 4] + box2 (ndarray): [M, 4], usually N != M + x1y1x2y2 (bool): whether in x1y1x2y2 stype, default True + eps (float): epsilon to avoid divide by zero + Return: + iou (ndarray): iou of box1 and box2, [N, M] + """ + N, M = len(box1), len(box2) # usually N != M + if x1y1x2y2: + b1_x1, b1_y1 = box1[:, 0], box1[:, 1] + b1_x2, b1_y2 = box1[:, 2], box1[:, 3] + b2_x1, b2_y1 = box2[:, 0], box2[:, 1] + b2_x2, b2_y2 = box2[:, 2], box2[:, 3] + else: + # cxcywh style + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = np.zeros((N, M), dtype=np.float32) + inter_rect_y1 = np.zeros((N, M), dtype=np.float32) + inter_rect_x2 = np.zeros((N, M), dtype=np.float32) + inter_rect_y2 = np.zeros((N, M), dtype=np.float32) + for i in range(len(box2)): + inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i]) + inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i]) + inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i]) + inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i]) + # Intersection area + inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum( + inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = np.repeat( + ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1) + b2_area = np.repeat( + ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0) + + ious = inter_area / (b1_area + b2_area - inter_area + eps) + return ious + + +def bbox2distance(points, bbox, max_dis=None, eps=0.1): + """Decode bounding box based on distances. + Args: + points (Tensor): Shape (n, 2), [x, y]. + bbox (Tensor): Shape (n, 4), "xyxy" format + max_dis (float): Upper bound of the distance. 
+ eps (float): a small value to ensure target < max_dis, instead <= + Returns: + Tensor: Decoded distances. + """ + left = points[:, 0] - bbox[:, 0] + top = points[:, 1] - bbox[:, 1] + right = bbox[:, 2] - points[:, 0] + bottom = bbox[:, 3] - points[:, 1] + if max_dis is not None: + left = left.clip(min=0, max=max_dis - eps) + top = top.clip(min=0, max=max_dis - eps) + right = right.clip(min=0, max=max_dis - eps) + bottom = bottom.clip(min=0, max=max_dis - eps) + return paddle.stack([left, top, right, bottom], -1) + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + Returns: + Tensor: Decoded bboxes. + """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clip(min=0, max=max_shape[1]) + y1 = y1.clip(min=0, max=max_shape[0]) + x2 = x2.clip(min=0, max=max_shape[1]) + y2 = y2.clip(min=0, max=max_shape[0]) + return paddle.stack([x1, y1, x2, y2], -1) + + +def bbox_center(boxes): + """Get bbox centers from boxes. + Args: + boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format. + Returns: + Tensor: boxes centers with shape (..., 2), "cx, cy" format. + """ + boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2 + boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2 + return paddle.stack([boxes_cx, boxes_cy], axis=-1) + + +def batch_distance2bbox(points, distance, max_shapes=None): + """Decode distance prediction to bounding box for batch. + Args: + points (Tensor): [B, ..., 2], "xy" format + distance (Tensor): [B, ..., 4], "ltrb" format + max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image. + Returns: + Tensor: Decoded bboxes, "x1y1x2y2" format. 
+ """ + lt, rb = paddle.split(distance, 2, -1) + # while tensor add parameters, parameters should be better placed on the second place + x1y1 = -lt + points + x2y2 = rb + points + out_bbox = paddle.concat([x1y1, x2y2], -1) + if max_shapes is not None: + max_shapes = max_shapes.flip(-1).tile([1, 2]) + delta_dim = out_bbox.ndim - max_shapes.ndim + for _ in range(delta_dim): + max_shapes.unsqueeze_(1) + out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes) + out_bbox = paddle.where(out_bbox > 0, out_bbox, + paddle.zeros_like(out_bbox)) + return out_bbox + + +def iou_similarity(box1, box2, eps=1e-10): + """Calculate iou of box1 and box2 + + Args: + box1 (Tensor): box with the shape [M1, 4] + box2 (Tensor): box with the shape [M2, 4] + + Return: + iou (Tensor): iou between box1 and box2 with the shape [M1, M2] + """ + box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] + box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] + px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] + gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] + x1y1 = paddle.maximum(px1y1, gx1y1) + x2y2 = paddle.minimum(px2y2, gx2y2) + overlap = (x2y2 - x1y1).clip(0).prod(-1) + area1 = (px2y2 - px1y1).clip(0).prod(-1) + area2 = (gx2y2 - gx1y1).clip(0).prod(-1) + union = area1 + area2 - overlap + eps + return overlap / union diff --git a/rtdetr_paddle/ppdet/modeling/cls_utils.py b/rtdetr_paddle/ppdet/modeling/cls_utils.py new file mode 100644 index 0000000..3ae8d11 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/cls_utils.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _get_class_default_kwargs(cls, *args, **kwargs): + """ + Get default arguments of a class in dict format, if args and + kwargs is specified, it will replace default arguments + """ + varnames = cls.__init__.__code__.co_varnames + argcount = cls.__init__.__code__.co_argcount + keys = varnames[:argcount] + assert keys[0] == 'self' + keys = keys[1:] + + values = list(cls.__init__.__defaults__) + assert len(values) == len(keys) + + if len(args) > 0: + for i, arg in enumerate(args): + values[i] = arg + + default_kwargs = dict(zip(keys, values)) + + if len(kwargs) > 0: + for k, v in kwargs.items(): + default_kwargs[k] = v + + return default_kwargs diff --git a/rtdetr_paddle/ppdet/modeling/heads/__init__.py b/rtdetr_paddle/ppdet/modeling/heads/__init__.py new file mode 100644 index 0000000..ccd9c24 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/heads/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .detr_head import * + diff --git a/rtdetr_paddle/ppdet/modeling/heads/detr_head.py b/rtdetr_paddle/ppdet/modeling/heads/detr_head.py new file mode 100644 index 0000000..fde4bb4 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/heads/detr_head.py @@ -0,0 +1,534 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register +from ..initializer import linear_init_, constant_ +from ..transformers.utils import inverse_sigmoid + +import pycocotools.mask as mask_util + +__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead'] + + +class MLP(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/detr.py + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.LayerList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + self._reset_parameters() + + def _reset_parameters(self): + for l in self.layers: + linear_init_(l) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class 
MultiHeadAttentionMap(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/segmentation.py + + This is a 2D attention module, which only returns the attention softmax (no multiplication by value) + """ + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, + bias=True): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform()) + bias_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.Constant()) if bias else False + + self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr) + self.k_proj = nn.Conv2D( + query_dim, + hidden_dim, + 1, + weight_attr=weight_attr, + bias_attr=bias_attr) + + self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5 + + def forward(self, q, k, mask=None): + q = self.q_proj(q) + k = self.k_proj(k) + bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\ + self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] + qh = q.reshape([bs, num_queries, n, c]) + kh = k.reshape([bs, n, c, h, w]) + # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) + qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c]) + kh = kh.reshape([-1, c, h * w]) + weights = paddle.bmm(qh * self.normalize_fact, kh).reshape( + [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4]) + + if mask is not None: + weights += mask + # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247 + weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape) + weights = self.dropout(weights) + return weights + + +class MaskHeadFPNConv(nn.Layer): + """This code is based on + https://github.com/facebookresearch/detr/blob/main/models/segmentation.py + + Simple convolutional head, using group norm. 
+ Upsampling is done using a FPN approach + """ + + def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): + super().__init__() + + inter_dims = [input_dim, + ] + [context_dim // (2**i) for i in range(1, 5)] + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()) + bias_attr = paddle.framework.ParamAttr( + initializer=paddle.nn.initializer.Constant()) + + self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups, + weight_attr, bias_attr) + self.conv_inter = nn.LayerList() + for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): + self.conv_inter.append( + self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr, + bias_attr)) + + self.conv_out = nn.Conv2D( + inter_dims[-1], + 1, + 3, + padding=1, + weight_attr=weight_attr, + bias_attr=bias_attr) + + self.adapter = nn.LayerList() + for i in range(len(fpn_dims)): + self.adapter.append( + nn.Conv2D( + fpn_dims[i], + inter_dims[i + 1], + 1, + weight_attr=weight_attr, + bias_attr=bias_attr)) + + def _make_layers(self, + in_dims, + out_dims, + kernel_size, + num_groups, + weight_attr=None, + bias_attr=None): + return nn.Sequential( + nn.Conv2D( + in_dims, + out_dims, + kernel_size, + padding=kernel_size // 2, + weight_attr=weight_attr, + bias_attr=bias_attr), + nn.GroupNorm(num_groups, out_dims), + nn.ReLU()) + + def forward(self, x, bbox_attention_map, fpns): + x = paddle.concat([ + x.tile([bbox_attention_map.shape[1], 1, 1, 1]), + bbox_attention_map.flatten(0, 1) + ], 1) + x = self.conv0(x) + for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1], + self.adapter, fpns): + feat = adapter_layer(feat).tile( + [bbox_attention_map.shape[1], 1, 1, 1]) + x = inter_layer(x) + x = feat + F.interpolate(x, size=feat.shape[-2:]) + + x = self.conv_inter[-1](x) + x = self.conv_out(x) + return x + + +@register +class DETRHead(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss'] + __inject__ = ['loss'] + + def __init__(self, + 
num_classes=80, + hidden_dim=256, + nhead=8, + num_mlp_layers=3, + loss='DETRLoss', + fpn_dims=[1024, 512, 256], + with_mask_head=False, + use_focal_loss=False): + super(DETRHead, self).__init__() + # add background class + self.num_classes = num_classes if use_focal_loss else num_classes + 1 + self.hidden_dim = hidden_dim + self.loss = loss + self.with_mask_head = with_mask_head + self.use_focal_loss = use_focal_loss + + self.score_head = nn.Linear(hidden_dim, self.num_classes) + self.bbox_head = MLP(hidden_dim, + hidden_dim, + output_dim=4, + num_layers=num_mlp_layers) + if self.with_mask_head: + self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, + nhead) + self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, + hidden_dim) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.score_head) + + @classmethod + def from_config(cls, cfg, hidden_dim, nhead, input_shape): + + return { + 'hidden_dim': hidden_dim, + 'nhead': nhead, + 'fpn_dims': [i.channels for i in input_shape[::-1]][1:] + } + + @staticmethod + def get_gt_mask_from_polygons(gt_poly, pad_mask): + out_gt_mask = [] + for polygons, padding in zip(gt_poly, pad_mask): + height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) + masks = [] + for obj_poly in polygons: + rles = mask_util.frPyObjects(obj_poly, height, width) + rle = mask_util.merge(rles) + masks.append( + paddle.to_tensor(mask_util.decode(rle)).astype('float32')) + masks = paddle.stack(masks) + masks_pad = paddle.zeros( + [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]]) + masks_pad[:, :height, :width] = masks + out_gt_mask.append(masks_pad) + return out_gt_mask + + def forward(self, out_transformer, body_feats, inputs=None): + r""" + Args: + out_transformer (Tuple): (feats: [num_levels, batch_size, + num_queries, hidden_dim], + memory: [batch_size, hidden_dim, h, w], + src_proj: [batch_size, h*w, hidden_dim], + src_mask: [batch_size, 1, 1, h, w]) + body_feats (List(Tensor)): 
list[[B, C, H, W]] + inputs (dict): dict(inputs) + """ + feats, memory, src_proj, src_mask = out_transformer + outputs_logit = self.score_head(feats) + outputs_bbox = F.sigmoid(self.bbox_head(feats)) + outputs_seg = None + if self.with_mask_head: + bbox_attention_map = self.bbox_attention(feats[-1], memory, + src_mask) + fpn_feats = [a for a in body_feats[::-1]][1:] + outputs_seg = self.mask_head(src_proj, bbox_attention_map, + fpn_feats) + outputs_seg = outputs_seg.reshape([ + feats.shape[1], feats.shape[2], outputs_seg.shape[-2], + outputs_seg.shape[-1] + ]) + + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + gt_mask = self.get_gt_mask_from_polygons( + inputs['gt_poly'], + inputs['pad_mask']) if 'gt_poly' in inputs else None + return self.loss( + outputs_bbox, + outputs_logit, + inputs['gt_bbox'], + inputs['gt_class'], + masks=outputs_seg, + gt_mask=gt_mask) + else: + return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) + + +@register +class DeformableDETRHead(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim'] + __inject__ = ['loss'] + + def __init__(self, + num_classes=80, + hidden_dim=512, + nhead=8, + num_mlp_layers=3, + loss='DETRLoss'): + super(DeformableDETRHead, self).__init__() + self.num_classes = num_classes + self.hidden_dim = hidden_dim + self.nhead = nhead + self.loss = loss + + self.score_head = nn.Linear(hidden_dim, self.num_classes) + self.bbox_head = MLP(hidden_dim, + hidden_dim, + output_dim=4, + num_layers=num_mlp_layers) + + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.score_head) + constant_(self.score_head.bias, -4.595) + constant_(self.bbox_head.layers[-1].weight) + + with paddle.no_grad(): + bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) + bias[2:] = -2.0 + self.bbox_head.layers[-1].bias.set_value(bias) + + @classmethod + def from_config(cls, cfg, hidden_dim, nhead, input_shape): + return {'hidden_dim': hidden_dim, 'nhead': nhead} + + 
def forward(self, out_transformer, body_feats, inputs=None): + r""" + Args: + out_transformer (Tuple): (feats: [num_levels, batch_size, + num_queries, hidden_dim], + memory: [batch_size, + \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], + reference_points: [batch_size, num_queries, 2]) + body_feats (List(Tensor)): list[[B, C, H, W]] + inputs (dict): dict(inputs) + """ + feats, memory, reference_points = out_transformer + reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) + outputs_bbox = self.bbox_head(feats) + + # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", + # but the gradient is wrong in paddle. + outputs_bbox = paddle.concat( + [ + outputs_bbox[:, :, :, :2] + reference_points, + outputs_bbox[:, :, :, 2:] + ], + axis=-1) + + outputs_bbox = F.sigmoid(outputs_bbox) + outputs_logit = self.score_head(feats) + + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + + return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'], + inputs['gt_class']) + else: + return (outputs_bbox[-1], outputs_logit[-1], None) + + +@register +class DINOHead(nn.Layer): + __inject__ = ['loss'] + + def __init__(self, loss='DINOLoss'): + super(DINOHead, self).__init__() + self.loss = loss + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, + dn_meta) = out_transformer + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + + if dn_meta is not None: + if isinstance(dn_meta, list): + dual_groups = len(dn_meta) - 1 + dec_out_bboxes = paddle.split( + dec_out_bboxes, dual_groups + 1, axis=2) + dec_out_logits = paddle.split( + dec_out_logits, dual_groups + 1, axis=2) + enc_topk_bboxes = paddle.split( + enc_topk_bboxes, dual_groups + 1, axis=1) + enc_topk_logits = paddle.split( + enc_topk_logits, dual_groups + 1, axis=1) + + dec_out_bboxes_list = [] + dec_out_logits_list = [] + 
dn_out_bboxes_list = [] + dn_out_logits_list = [] + loss = {} + for g_id in range(dual_groups + 1): + if dn_meta[g_id] is not None: + dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( + dec_out_bboxes[g_id], + dn_meta[g_id]['dn_num_split'], + axis=2) + dn_out_logits_gid, dec_out_logits_gid = paddle.split( + dec_out_logits[g_id], + dn_meta[g_id]['dn_num_split'], + axis=2) + else: + dn_out_bboxes_gid, dn_out_logits_gid = None, None + dec_out_bboxes_gid = dec_out_bboxes[g_id] + dec_out_logits_gid = dec_out_logits[g_id] + out_bboxes_gid = paddle.concat([ + enc_topk_bboxes[g_id].unsqueeze(0), + dec_out_bboxes_gid + ]) + out_logits_gid = paddle.concat([ + enc_topk_logits[g_id].unsqueeze(0), + dec_out_logits_gid + ]) + loss_gid = self.loss( + out_bboxes_gid, + out_logits_gid, + inputs['gt_bbox'], + inputs['gt_class'], + dn_out_bboxes=dn_out_bboxes_gid, + dn_out_logits=dn_out_logits_gid, + dn_meta=dn_meta[g_id]) + # sum loss + for key, value in loss_gid.items(): + loss.update({ + key: loss.get(key, paddle.zeros([1])) + value + }) + + # average across (dual_groups + 1) + for key, value in loss.items(): + loss.update({key: value / (dual_groups + 1)}) + return loss + else: + dn_out_bboxes, dec_out_bboxes = paddle.split( + dec_out_bboxes, dn_meta['dn_num_split'], axis=2) + dn_out_logits, dec_out_logits = paddle.split( + dec_out_logits, dn_meta['dn_num_split'], axis=2) + else: + dn_out_bboxes, dn_out_logits = None, None + + out_bboxes = paddle.concat( + [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) + out_logits = paddle.concat( + [enc_topk_logits.unsqueeze(0), dec_out_logits]) + + return self.loss( + out_bboxes, + out_logits, + inputs['gt_bbox'], + inputs['gt_class'], + dn_out_bboxes=dn_out_bboxes, + dn_out_logits=dn_out_logits, + dn_meta=dn_meta) + else: + return (dec_out_bboxes[-1], dec_out_logits[-1], None) + + +@register +class MaskDINOHead(nn.Layer): + __inject__ = ['loss'] + + def __init__(self, loss='DINOLoss'): + super(MaskDINOHead, self).__init__() + self.loss = 
loss + + def forward(self, out_transformer, body_feats, inputs=None): + (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out, + dn_meta) = out_transformer + if self.training: + assert inputs is not None + assert 'gt_bbox' in inputs and 'gt_class' in inputs + assert 'gt_segm' in inputs + + if dn_meta is not None: + dn_out_logits, dec_out_logits = paddle.split( + dec_out_logits, dn_meta['dn_num_split'], axis=2) + dn_out_bboxes, dec_out_bboxes = paddle.split( + dec_out_bboxes, dn_meta['dn_num_split'], axis=2) + dn_out_masks, dec_out_masks = paddle.split( + dec_out_masks, dn_meta['dn_num_split'], axis=2) + if init_out is not None: + init_out_logits, init_out_bboxes, init_out_masks = init_out + init_out_logits_dn, init_out_logits = paddle.split( + init_out_logits, dn_meta['dn_num_split'], axis=1) + init_out_bboxes_dn, init_out_bboxes = paddle.split( + init_out_bboxes, dn_meta['dn_num_split'], axis=1) + init_out_masks_dn, init_out_masks = paddle.split( + init_out_masks, dn_meta['dn_num_split'], axis=1) + + dec_out_logits = paddle.concat( + [init_out_logits.unsqueeze(0), dec_out_logits]) + dec_out_bboxes = paddle.concat( + [init_out_bboxes.unsqueeze(0), dec_out_bboxes]) + dec_out_masks = paddle.concat( + [init_out_masks.unsqueeze(0), dec_out_masks]) + + dn_out_logits = paddle.concat( + [init_out_logits_dn.unsqueeze(0), dn_out_logits]) + dn_out_bboxes = paddle.concat( + [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes]) + dn_out_masks = paddle.concat( + [init_out_masks_dn.unsqueeze(0), dn_out_masks]) + else: + dn_out_bboxes, dn_out_logits = None, None + dn_out_masks = None + + enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out + out_logits = paddle.concat( + [enc_out_logits.unsqueeze(0), dec_out_logits]) + out_bboxes = paddle.concat( + [enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) + out_masks = paddle.concat( + [enc_out_masks.unsqueeze(0), dec_out_masks]) + + return self.loss( + out_bboxes, + out_logits, + inputs['gt_bbox'], + inputs['gt_class'], + 
masks=out_masks, + gt_mask=inputs['gt_segm'], + dn_out_logits=dn_out_logits, + dn_out_bboxes=dn_out_bboxes, + dn_out_masks=dn_out_masks, + dn_meta=dn_meta) + else: + return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) diff --git a/rtdetr_paddle/ppdet/modeling/initializer.py b/rtdetr_paddle/ppdet/modeling/initializer.py new file mode 100644 index 0000000..308c51b --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/initializer.py @@ -0,0 +1,325 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
+""" + +import math +import numpy as np + +import paddle +import paddle.nn as nn + +__all__ = [ + 'uniform_', + 'normal_', + 'constant_', + 'ones_', + 'zeros_', + 'xavier_uniform_', + 'xavier_normal_', + 'kaiming_uniform_', + 'kaiming_normal_', + 'linear_init_', + 'conv_init_', + 'reset_initialized_parameter', +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0., std=1.): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0., std=1.): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. 
+ Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == 'fan_in' else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format( + param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + raise 
ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, + a=0, + mode='fan_in', + nonlinearity='leaky_relu', + reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, + a=0, + mode='fan_in', + nonlinearity='leaky_relu', + reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + if hasattr(module, "bias") and module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1. / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0., std=1.) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.) 
+ if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_fill_(m.bias, 0) diff --git a/rtdetr_paddle/ppdet/modeling/keypoint_utils.py b/rtdetr_paddle/ppdet/modeling/keypoint_utils.py new file mode 100644 index 0000000..377f1d7 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/keypoint_utils.py @@ -0,0 +1,403 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +this code is based on https://github.com/open-mmlab/mmpose +""" + +import cv2 +import numpy as np +import paddle.nn.functional as F + + +def get_affine_mat_kernel(h, w, s, inv=False): + if w < h: + w_ = s + h_ = int(np.ceil((s / w * h) / 64.) * 64) + scale_w = w + scale_h = h_ / w_ * w + + else: + h_ = s + w_ = int(np.ceil((s / h * w) / 64.) * 64) + scale_h = h + scale_w = w_ / h_ * h + + center = np.array([np.round(w / 2.), np.round(h / 2.)]) + + size_resized = (w_, h_) + trans = get_affine_transform( + center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) + + return trans, size_resized + + +def get_affine_transform(center, + input_size, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + input_size (np.ndarray[2, ]): Size of input feature (width, height). + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ]): Size of the destination heatmaps. 
+ shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + if not isinstance(input_size, (np.ndarray, list)): + input_size = np.array([input_size, input_size], dtype=np.float32) + scale_tmp = input_size + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """This code is based on + https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py + + Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + matrix (np.ndarray): A matrix for transformation. 
+ """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = np.cos(theta) * scale_x + matrix[0, 1] = -np.sin(theta) * scale_x + matrix[0, 2] = scale_x * ( + -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * + np.sin(theta) + 0.5 * size_target[0]) + matrix[1, 0] = np.sin(theta) * scale_y + matrix[1, 1] = np.cos(theta) * scale_y + matrix[1, 2] = scale_y * ( + -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * + np.cos(theta) + 0.5 * size_target[1]) + return matrix + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len( + a) == 2, 'input of _get_3rd_point should be point with length of 2' + assert len( + b) == 2, 'input of _get_3rd_point should be point with length of 2' + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. + """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def transpred(kpts, h, w, s): + trans, _ = get_affine_mat_kernel(h, w, s, inv=True) + + return warp_affine_joints(kpts[..., :2].copy(), trans) + + +def warp_affine_joints(joints, mat): + """Apply affine transformation defined by the transform matrix on the + joints. 
+ + Args: + joints (np.ndarray[..., 2]): Origin coordinate of joints. + mat (np.ndarray[3, 2]): The affine matrix. + + Returns: + matrix (np.ndarray[..., 2]): Result coordinate of joints. + """ + joints = np.array(joints) + shape = joints.shape + joints = joints.reshape(-1, 2) + return np.dot(np.concatenate( + (joints, joints[:, 0:1] * 0 + 1), axis=1), + mat.T).reshape(shape) + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): + if not isinstance(sigmas, np.ndarray): + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + ious = np.zeros((d.shape[0])) + for n_d in range(0, d.shape[0]): + xd = d[n_d, 0::3] + yd = d[n_d, 1::3] + vd = d[n_d, 2::3] + dx = xd - xg + dy = yd - yg + e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 + if in_vis_thre is not None: + ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) + e = e[ind] + ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 + return ious + + +def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): + """greedily select boxes with high confidence and overlap with current maximum <= thresh + rule out overlap >= thresh + + Args: + kpts_db (list): The predicted keypoints within the image + thresh (float): The threshold to select the boxes + sigmas (np.array): The variance to calculate the oks iou + Default: None + in_vis_thre (float): The threshold to select the high confidence boxes + Default: None + 
+ Return: + keep (list): indexes to keep + """ + + if len(kpts_db) == 0: + return [] + + scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) + kpts = np.array( + [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) + areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) + + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, in_vis_thre) + + inds = np.where(oks_ovr <= thresh)[0] + order = order[inds + 1] + + return keep + + +def rescore(overlap, scores, thresh, type='gaussian'): + assert overlap.shape[0] == scores.shape[0] + if type == 'linear': + inds = np.where(overlap >= thresh)[0] + scores[inds] = scores[inds] * (1 - overlap[inds]) + else: + scores = scores * np.exp(-overlap**2 / thresh) + + return scores + + +def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): + """greedily select boxes with high confidence and overlap with current maximum <= thresh + rule out overlap >= thresh + + Args: + kpts_db (list): The predicted keypoints within the image + thresh (float): The threshold to select the boxes + sigmas (np.array): The variance to calculate the oks iou + Default: None + in_vis_thre (float): The threshold to select the high confidence boxes + Default: None + + Return: + keep (list): indexes to keep + """ + + if len(kpts_db) == 0: + return [] + + scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) + kpts = np.array( + [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) + areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) + + order = scores.argsort()[::-1] + scores = scores[order] + + # max_dets = order.size + max_dets = 20 + keep = np.zeros(max_dets, dtype=np.intp) + keep_cnt = 0 + while order.size > 0 and keep_cnt < max_dets: + i = order[0] + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + 
sigmas, in_vis_thre) + + order = order[1:] + scores = rescore(oks_ovr, scores[1:], thresh) + + tmp = scores.argsort()[::-1] + order = order[tmp] + scores = scores[tmp] + + keep[keep_cnt] = i + keep_cnt += 1 + + keep = keep[:keep_cnt] + + return keep + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 and + input_w > 1) and (output_h - 1) % (input_h - 1) and + (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): + """Flip the flipped heatmaps back to the original form. + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert len(output_flipped.shape) == 4, \ + 'output_flipped should be [batch_size, num_keypoints, height, width]' + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == 'CombinedTarget'.lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] 
+ output_flipped = output_flipped.reshape((shape_ori[0], -1, channels, + shape_ori[2], shape_ori[3])) + output_flipped_back = output_flipped.clone() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] + output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back diff --git a/rtdetr_paddle/ppdet/modeling/layers.py b/rtdetr_paddle/ppdet/modeling/layers.py new file mode 100644 index 0000000..86c6d96 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/layers.py @@ -0,0 +1,1346 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import six +import numpy as np +from numbers import Integral + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle import to_tensor +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant, XavierUniform +from paddle.regularizer import L2Decay + +from ppdet.core.workspace import register, serializable +from ppdet.modeling.bbox_utils import delta2bbox +from . 
def _to_list(l):
    """Return ``l`` as a list: copy a list/tuple, wrap anything else."""
    if isinstance(l, (list, tuple)):
        return list(l)
    return [l]


class AlignConv(nn.Layer):
    """Deformable convolution + ReLU whose offsets come from rotated anchors.

    Args:
        in_channels (int): Input channels of the deformable convolution.
        out_channels (int): Output channels of the deformable convolution.
        kernel_size (int): Square kernel size; padding is
            ``(kernel_size - 1) // 2``.
        groups (int): Forwarded to ``DeformConv2D``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, groups=1):
        super(AlignConv, self).__init__()
        self.kernel_size = kernel_size
        self.align_conv = paddle.vision.ops.DeformConv2D(
            in_channels,
            out_channels,
            kernel_size=self.kernel_size,
            padding=(self.kernel_size - 1) // 2,
            groups=groups,
            weight_attr=ParamAttr(initializer=Normal(0, 0.01)),
            bias_attr=None)

    @paddle.no_grad()
    def get_offset(self, anchors, featmap_size, stride):
        """Compute deformable-conv offsets that align the kernel to anchors.

        Args:
            anchors: [B, L, 5] xc,yc,w,h,angle
            featmap_size: (feat_h, feat_w)
            stride: 8
        Returns:
            Tensor reshaped to ``[B, feat_h, feat_w, 2 * k * k]`` and then
            transposed to ``[B, 2 * k * k, feat_h, feat_w]``, where each pair
            is stacked as ``(offset_y, offset_x)``.
        """
        batch = anchors.shape[0]
        dtype = anchors.dtype
        feat_h, feat_w = featmap_size
        pad = (self.kernel_size - 1) // 2
        idx = paddle.arange(-pad, pad + 1, dtype=dtype)

        # Relative kernel tap coordinates, flattened to k*k entries.
        yy, xx = paddle.meshgrid(idx, idx)
        xx = paddle.reshape(xx, [-1])
        yy = paddle.reshape(yy, [-1])

        # get sampling locations of default conv
        xc = paddle.arange(0, feat_w, dtype=dtype)
        yc = paddle.arange(0, feat_h, dtype=dtype)
        yc, xc = paddle.meshgrid(yc, xc)

        xc = paddle.reshape(xc, [-1, 1])
        yc = paddle.reshape(yc, [-1, 1])
        x_conv = xc + xx
        y_conv = yc + yy

        # get sampling locations of anchors
        x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1)
        # Bring anchor geometry from image scale to feature-map scale.
        x_ctr = x_ctr / stride
        y_ctr = y_ctr / stride
        w_s = w / stride
        h_s = h / stride
        cos, sin = paddle.cos(a), paddle.sin(a)
        dw, dh = w_s / self.kernel_size, h_s / self.kernel_size
        x, y = dw * xx, dh * yy
        # Rotate the scaled kernel taps by the anchor angle.
        xr = cos * x - sin * y
        yr = sin * x + cos * y
        x_anchor, y_anchor = xr + x_ctr, yr + y_ctr
        # get offset field
        offset_x = x_anchor - x_conv
        offset_y = y_anchor - y_conv
        offset = paddle.stack([offset_y, offset_x], axis=-1)
        offset = offset.reshape(
            [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2])
        offset = offset.transpose([0, 3, 1, 2])

        return offset

    def forward(self, x, refine_anchors, featmap_size, stride):
        """Apply the anchor-aligned deformable convolution followed by ReLU.

        Args:
            x: Input feature map passed to the deformable convolution.
            refine_anchors: Anchors forwarded to :meth:`get_offset`.
            featmap_size: ``(feat_h, feat_w)`` forwarded to :meth:`get_offset`.
            stride: Feature stride forwarded to :meth:`get_offset`.

        Returns:
            ``relu(align_conv(x, offset))``.
        """
        # The previous unused ``paddle.shape(x)[0].numpy()`` lookup was
        # dropped: it only forced a tensor-to-host conversion.
        offset = self.get_offset(refine_anchors, featmap_size, stride)
        if self.training:
            x = F.relu(self.align_conv(x, offset.detach()))
        else:
            x = F.relu(self.align_conv(x, offset))
        return x
= F.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + use_dcn=False, + bias_on=False, + lr_scale=1., + freeze_norm=False, + initializer=Normal( + mean=0., std=0.01), + skip_quant=False, + dcn_lr_scale=2., + dcn_regularizer=L2Decay(0.)): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn', 'gn', None] + + if bias_on: + bias_attr = ParamAttr( + initializer=Constant(value=0.), learning_rate=lr_scale) + else: + bias_attr = False + + if not use_dcn: + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=bias_attr) + if skip_quant: + self.conv.skip_quant = True + else: + # in FCOS-DCN head, specifically need learning_rate and regularizer + self.conv = DeformableConvV2( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=True, + lr_scale=dcn_lr_scale, + regularizer=dcn_regularizer, + dcn_bias_regularizer=dcn_regularizer, + dcn_bias_lr_scale=dcn_lr_scale, + skip_quant=skip_quant) + + norm_lr = 0. if freeze_norm else 1. 
class LiteConv(nn.Layer):
    """Stack of four ``ConvNormLayer`` blocks interleaved with ``ReLU6``.

    Layer order inside ``self.lite_conv``:
    ``conv1`` (5x5, ``groups=in_channels``) -> ``relu6_1`` ->
    ``conv2`` (1x1, in -> out) -> [``relu6_2``] ->
    ``conv3`` (1x1, out -> out) -> ``relu6_3`` ->
    ``conv4`` (5x5, ``groups=out_channels``) -> [``relu6_4``],
    where the bracketed activations exist only when ``with_act`` is true.

    Args:
        in_channels (int): Channels of the input feature map.
        out_channels (int): Channels produced from ``conv2`` onwards.
        stride (int): Passed to every one of the four convolutions.
        with_act (bool): Whether to add ``relu6_2`` and ``relu6_4``.
        norm_type (str): Forwarded to each ``ConvNormLayer``.
        name: Accepted but not used by this class.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride=1,
                 with_act=True,
                 norm_type='sync_bn',
                 name=None):
        super(LiteConv, self).__init__()
        self.lite_conv = nn.Sequential()
        # Layers are created in conv1..conv4 order before registration so the
        # parameter creation order stays as it was.
        conv1 = ConvNormLayer(
            in_channels,
            in_channels,
            filter_size=5,
            stride=stride,
            groups=in_channels,
            norm_type=norm_type,
            initializer=XavierUniform())
        conv2 = ConvNormLayer(
            in_channels,
            out_channels,
            filter_size=1,
            stride=stride,
            norm_type=norm_type,
            initializer=XavierUniform())
        conv3 = ConvNormLayer(
            out_channels,
            out_channels,
            filter_size=1,
            stride=stride,
            norm_type=norm_type,
            initializer=XavierUniform())
        conv4 = ConvNormLayer(
            out_channels,
            out_channels,
            filter_size=5,
            stride=stride,
            groups=out_channels,
            norm_type=norm_type,
            initializer=XavierUniform())
        # The unused ``conv_list`` local was removed.
        self.lite_conv.add_sublayer('conv1', conv1)
        self.lite_conv.add_sublayer('relu6_1', nn.ReLU6())
        self.lite_conv.add_sublayer('conv2', conv2)
        if with_act:
            self.lite_conv.add_sublayer('relu6_2', nn.ReLU6())
        self.lite_conv.add_sublayer('conv3', conv3)
        self.lite_conv.add_sublayer('relu6_3', nn.ReLU6())
        self.lite_conv.add_sublayer('conv4', conv4)
        if with_act:
            self.lite_conv.add_sublayer('relu6_4', nn.ReLU6())

    def forward(self, inputs):
        """Run ``inputs`` through the sequential stack and return the result."""
        out = self.lite_conv(inputs)
        return out
@register
@serializable
class AnchorGeneratorSSD(object):
    """Generate prior boxes for each feature level through ``ops.prior_box``.

    If both ``min_sizes`` and ``max_sizes`` are empty lists, they are derived
    from ``min_ratio``, ``max_ratio``, ``base_size`` and the number of
    entries in ``aspect_ratios``.

    Args:
        steps (list): Per-level step; each is passed as ``[step, step]``.
        aspect_ratios (list[list[float]]): Per-level aspect ratios.
        min_ratio (int): Lower ratio (percent of ``base_size``) used when
            sizes are auto-generated.
        max_ratio (int): Upper ratio (percent of ``base_size``) used when
            sizes are auto-generated.
        base_size (int): Reference size for auto-generated sizes.
        min_sizes (list): Per-level minimum sizes, or ``[]`` to auto-generate.
        max_sizes (list): Per-level maximum sizes, or ``[]`` to auto-generate.
        offset (float): Forwarded to ``ops.prior_box``.
        flip (bool): Forwarded to ``ops.prior_box``.
        clip (bool): Forwarded to ``ops.prior_box``.
        min_max_aspect_ratios_order (bool): Forwarded to ``ops.prior_box``.
    """

    # NOTE: the list defaults are kept as-is for interface compatibility;
    # this class never mutates them.
    def __init__(self,
                 steps=[8, 16, 32, 64, 100, 300],
                 aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
                 min_ratio=15,
                 max_ratio=90,
                 base_size=300,
                 min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0],
                 max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0],
                 offset=0.5,
                 flip=True,
                 clip=False,
                 min_max_aspect_ratios_order=False):
        self.steps = steps
        self.aspect_ratios = aspect_ratios
        self.min_ratio = min_ratio
        self.max_ratio = max_ratio
        self.base_size = base_size
        self.min_sizes = min_sizes
        self.max_sizes = max_sizes
        self.offset = offset
        self.flip = flip
        self.clip = clip
        self.min_max_aspect_ratios_order = min_max_aspect_ratios_order

        if self.min_sizes == [] and self.max_sizes == []:
            num_layer = len(aspect_ratios)
            step = int(
                math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2
                                                                  )))
            # Fixed: collect into fresh lists instead of appending to the
            # lists the caller passed in.
            generated_min, generated_max = [], []
            for ratio in range(self.min_ratio, self.max_ratio + 1, step):
                generated_min.append(self.base_size * ratio / 100.)
                generated_max.append(self.base_size * (ratio + step) / 100.)
            self.min_sizes = [self.base_size * .10] + generated_min
            self.max_sizes = [self.base_size * .20] + generated_max

        self.num_priors = []
        for aspect_ratio, min_size, max_size in zip(
                aspect_ratios, self.min_sizes, self.max_sizes):
            if isinstance(min_size, (list, tuple)):
                self.num_priors.append(
                    len(_to_list(min_size)) + len(_to_list(max_size)))
            else:
                self.num_priors.append((len(aspect_ratio) * 2 + 1) * len(
                    _to_list(min_size)) + len(_to_list(max_size)))

    def __call__(self, inputs, image):
        """Return one ``[-1, 4]`` prior-box tensor per entry of ``inputs``.

        Args:
            inputs: Iterable of per-level feature maps.
            image: Image tensor forwarded to ``ops.prior_box``.

        Returns:
            list: Prior boxes per level, each reshaped to ``[-1, 4]``.
        """
        boxes = []
        for feat, min_size, max_size, aspect_ratio, step in zip(
                inputs, self.min_sizes, self.max_sizes, self.aspect_ratios,
                self.steps):
            box, _ = ops.prior_box(
                input=feat,
                image=image,
                min_sizes=_to_list(min_size),
                max_sizes=_to_list(max_size),
                aspect_ratios=aspect_ratio,
                flip=self.flip,
                clip=self.clip,
                steps=[step, step],
                offset=self.offset,
                min_max_aspect_ratios_order=self.min_max_aspect_ratios_order)
            boxes.append(paddle.reshape(box, [-1, 4]))
        return boxes
@register
@serializable
class MultiClassNMS(object):
    """Configurable wrapper around ``ops.multiclass_nms``.

    Every constructor argument is stored as an instance attribute. All of
    them except ``trt`` are forwarded as keyword arguments to
    ``ops.multiclass_nms`` on each call.
    """

    def __init__(self,
                 score_threshold=.05,
                 nms_top_k=-1,
                 keep_top_k=100,
                 nms_threshold=.5,
                 normalized=True,
                 nms_eta=1.0,
                 return_index=False,
                 return_rois_num=True,
                 trt=False):
        super(MultiClassNMS, self).__init__()
        self.score_threshold = score_threshold
        self.nms_top_k = nms_top_k
        self.keep_top_k = keep_top_k
        self.nms_threshold = nms_threshold
        self.normalized = normalized
        self.nms_eta = nms_eta
        self.return_index = return_index
        self.return_rois_num = return_rois_num
        self.trt = trt

    def __call__(self, bboxes, score, background_label=-1):
        """Run multi-class NMS.

        Args:
            bboxes (Tensor|tuple): Either
                1. (Tensor) predicted bboxes with shape [N, M, 4], N is the
                   batch size and M is the number of bboxes, or
                2. (tuple) ``(bboxes, bbox_num)`` where bboxes have shape
                   [M, C, 4], C is the class number and bbox_num means the
                   number of bboxes of each batch with shape [N,].
            score (Tensor): Predicted scores with shape [N, C, M] or [M, C].
            background_label (int): Ignore the background label; for example,
                RCNN is num_classes and YOLO is -1. Only forwarded when
                greater than -1.

        Returns:
            On the TensorRT path ``(bbox, bbox_num, None)``, where rows whose
            first column equals -1 have been removed; otherwise the return
            value of ``ops.multiclass_nms``.
        """
        # All instance attributes double as keyword arguments for the op.
        kwargs = self.__dict__.copy()
        if isinstance(bboxes, tuple):
            bboxes, bbox_num = bboxes
            kwargs.update({'rois_num': bbox_num})
        if background_label > -1:
            kwargs.update({'background_label': background_label})
        kwargs.pop('trt')

        # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt
        use_trt_nms = False
        if self.trt:
            major = int(paddle.version.major)
            minor = int(paddle.version.minor)
            # Fixed: the old ``major >= 2 and minor >= 3`` test rejected any
            # later major with minor < 3 (e.g. 3.0). Major 0 is still
            # accepted, matching the "develop" case in the TODO above.
            use_trt_nms = major == 0 or (major, minor) >= (2, 3)

        if use_trt_nms:
            # TODO(wangxinxin08): tricky switch to run nms on tensorrt
            kwargs.update({'nms_eta': 1.1})
            bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs)
            bbox = bbox.reshape([1, -1, 6])
            idx = paddle.nonzero(bbox[..., 0] != -1)
            bbox = paddle.gather_nd(bbox, idx)
            return bbox, bbox_num, None
        else:
            return ops.multiclass_nms(bboxes, score, **kwargs)
@register
@serializable
class YOLOBox(object):
    """Decode multi-level YOLO head outputs with ``paddle.vision.ops.yolo_box``.

    Args:
        num_classes (int): Number of classes passed to ``yolo_box``.
        conf_thresh (float): Confidence threshold passed to ``yolo_box``.
        downsample_ratio (int): Downsample ratio of the first head output;
            level ``i`` uses ``downsample_ratio // 2**i``.
        clip_bbox (bool): Passed to ``yolo_box``.
        scale_x_y (float): Passed to ``yolo_box``.
    """
    __shared__ = ['num_classes']

    def __init__(self,
                 num_classes=80,
                 conf_thresh=0.005,
                 downsample_ratio=32,
                 clip_bbox=True,
                 scale_x_y=1.):
        self.num_classes = num_classes
        self.conf_thresh = conf_thresh
        self.downsample_ratio = downsample_ratio
        self.clip_bbox = clip_bbox
        self.scale_x_y = scale_x_y

    def __call__(self,
                 yolo_head_out,
                 anchors,
                 im_shape,
                 scale_factor,
                 var_weight=None):
        """Decode every head level and concatenate the results.

        Args:
            yolo_head_out: Iterable of per-level head outputs.
            anchors: Per-level anchors, indexed like ``yolo_head_out``.
            im_shape: Image shape; divided by ``scale_factor`` and cast to
                int32 before being passed to ``yolo_box``.
            scale_factor: Scale factor applied to the image.
            var_weight: Accepted but not used.

        Returns:
            tuple: ``(boxes, scores)``; boxes are concatenated on axis 1,
            scores are transposed with ``perm=[0, 2, 1]`` and concatenated on
            axis 2.
        """
        origin_shape = paddle.cast(im_shape / scale_factor, 'int32')

        per_level_boxes = []
        per_level_scores = []
        for level, head_out in enumerate(yolo_head_out):
            level_ratio = self.downsample_ratio // 2**level
            level_boxes, level_scores = paddle.vision.ops.yolo_box(
                head_out,
                origin_shape,
                anchors[level],
                self.num_classes,
                self.conf_thresh,
                level_ratio,
                self.clip_bbox,
                scale_x_y=self.scale_x_y)
            per_level_boxes.append(level_boxes)
            per_level_scores.append(
                paddle.transpose(
                    level_scores, perm=[0, 2, 1]))

        return (paddle.concat(
            per_level_boxes, axis=1), paddle.concat(
                per_level_scores, axis=2))
@register
class TTFBox(object):
    """Decode heatmap / box-size predictions into ``[cls, score, box]`` rows.

    Args:
        max_per_img (int): ``k`` used by both top-k selections in ``_topk``.
        score_thresh (float): Rows with score not greater than this value are
            dropped in ``_decode``.
        down_ratio (int): Multiplier applied to the selected x/y indices.
    """
    __shared__ = ['down_ratio']

    def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4):
        super(TTFBox, self).__init__()
        self.max_per_img = max_per_img
        self.score_thresh = score_thresh
        self.down_ratio = down_ratio

    def _simple_nms(self, heat, kernel=3):
        """
        Use maxpool to filter the max score, get local peaks.

        Positions whose value equals the stride-1 max-pooled value are kept;
        all other positions are multiplied by zero.
        """
        pad = (kernel - 1) // 2
        hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)
        keep = paddle.cast(hmax == heat, 'float32')
        return heat * keep

    def _topk(self, scores):
        """
        Select top k scores and decode to get xy coordinates.

        Top-k is taken first per row of the ``[cat, -1]`` reshaped scores and
        then once more over all flattened candidates.

        Returns:
            tuple: ``(topk_score, topk_inds, topk_clses, topk_ys, topk_xs)``.
        """
        k = self.max_per_img
        shape_fm = paddle.shape(scores)
        shape_fm.stop_gradient = True
        cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3]
        # batch size is 1
        scores_r = paddle.reshape(scores, [cat, -1])
        topk_scores, topk_inds = paddle.topk(scores_r, k)
        # Flat spatial index -> (row, column) on the feature map.
        topk_ys = topk_inds // width
        topk_xs = topk_inds % width

        topk_score_r = paddle.reshape(topk_scores, [-1])
        topk_score, topk_ind = paddle.topk(topk_score_r, k)
        k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64')
        # Candidates are laid out as [cat, k], so index // k is the class row.
        topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32')

        topk_inds = paddle.reshape(topk_inds, [-1])
        topk_ys = paddle.reshape(topk_ys, [-1, 1])
        topk_xs = paddle.reshape(topk_xs, [-1, 1])
        topk_inds = paddle.gather(topk_inds, topk_ind)
        topk_ys = paddle.gather(topk_ys, topk_ind)
        topk_xs = paddle.gather(topk_xs, topk_ind)

        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs

    def _decode(self, hm, wh, im_shape, scale_factor):
        """Decode one image's predictions.

        Args:
            hm: Heatmap logits; sigmoid is applied here.
            wh: Box-size map; channels 0..3 are subtracted from / added to
                the peak coordinates to form x1, y1, x2, y2.
            im_shape: Not used by this method.
            scale_factor: Columns 0 and 1 are used as the y and x scale.

        Returns:
            tuple: ``(results, num)`` where ``results`` rows are
            ``[cls, score, x1, y1, x2, y2]`` and ``num`` is
            ``paddle.shape(results)[0:1]``.
        """
        heatmap = F.sigmoid(hm)
        heat = self._simple_nms(heatmap)
        scores, inds, clses, ys, xs = self._topk(heat)
        ys = paddle.cast(ys, 'float32') * self.down_ratio
        xs = paddle.cast(xs, 'float32') * self.down_ratio
        scores = paddle.tensor.unsqueeze(scores, [1])
        clses = paddle.tensor.unsqueeze(clses, [1])

        # Move channels last and flatten so rows can be gathered by ``inds``.
        wh_t = paddle.transpose(wh, [0, 2, 3, 1])
        wh = paddle.reshape(wh_t, [-1, paddle.shape(wh_t)[-1]])
        wh = paddle.gather(wh, inds)

        x1 = xs - wh[:, 0:1]
        y1 = ys - wh[:, 1:2]
        x2 = xs + wh[:, 2:3]
        y2 = ys + wh[:, 3:4]

        bboxes = paddle.concat([x1, y1, x2, y2], axis=1)

        scale_y = scale_factor[:, 0:1]
        scale_x = scale_factor[:, 1:2]
        scale_expand = paddle.concat(
            [scale_x, scale_y, scale_x, scale_y], axis=1)
        boxes_shape = paddle.shape(bboxes)
        boxes_shape.stop_gradient = True
        scale_expand = paddle.expand(scale_expand, shape=boxes_shape)
        bboxes = paddle.divide(bboxes, scale_expand)
        results = paddle.concat([clses, scores, bboxes], axis=1)
        # hack: append result with cls=-1 and score=1. to avoid all scores
        # are less than score_thresh which may cause error in gather.
        fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]]))
        fill_r = paddle.cast(fill_r, results.dtype)
        results = paddle.concat([results, fill_r])
        scores = results[:, 1]
        valid_ind = paddle.nonzero(scores > self.score_thresh)
        results = paddle.gather(results, valid_ind)
        return results, paddle.shape(results)[0:1]

    def __call__(self, hm, wh, im_shape, scale_factor):
        """Decode each image of the batch separately and concatenate.

        The batch size is taken from ``scale_factor.shape[0]``.

        Returns:
            tuple: ``(results, results_num)``, both concatenated on axis 0.
        """
        results = []
        results_num = []
        for i in range(scale_factor.shape[0]):
            result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ],
                                       im_shape[i:i + 1, ],
                                       scale_factor[i:i + 1, ])
            results.append(result)
            results_num.append(num)
        results = paddle.concat(results, axis=0)
        results_num = paddle.concat(results_num, axis=0)
        return results, results_num
+ gx = pw * dx + px + gy = ph * dy + py + gw = pw * paddle.exp(dw) + gh = ph * paddle.exp(dh) + gx1 = gx - gw * 0.5 + gy1 = gy - gh * 0.5 + gx2 = gx + gw * 0.5 + gy2 = gy + gh * 0.5 + return paddle.stack([gx1, gy1, gx2, gy2], axis=1) + + def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec): + anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec) + anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) + pred_list = self.decode_delta( + paddle.reshape( + delta_map, shape=[-1, 4]), + paddle.reshape( + anchor_mesh, shape=[-1, 4])) + pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4]) + return pred_map + + def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec): + boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw] + nGh, nGw = boxes_shape[-2], boxes_shape[-1] + nB = 1 # TODO: only support bs=1 now + boxes_list, scores_list = [], [] + for idx in range(nB): + p = paddle.reshape( + head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw]) + p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6] + delta_map = p[:, :, :, :4] + boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec) + # [nA * nGh * nGw, 4] + boxes_list.append(boxes * stride) + + p_conf = paddle.transpose( + p[:, :, :, 4:6], perm=[3, 0, 1, 2]) # [2, nA, nGh, nGw] + p_conf = F.softmax( + p_conf, axis=0)[1, :, :, :].unsqueeze(-1) # [nA, nGh, nGw, 1] + scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1]) + scores_list.append(scores) + + boxes_results = paddle.stack(boxes_list) + scores_results = paddle.stack(scores_list) + return boxes_results, scores_results + + def __call__(self, yolo_head_out, anchors): + bbox_pred_list = [] + for i, head_out in enumerate(yolo_head_out): + stride = self.downsample_ratio // 2**i + anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] + anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride + nA = len(anc_w) + boxes, scores = self._postprocessing_by_level(nA, stride, head_out, + anchor_vec) + 
bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1)) + + yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1) + boxes_idx_over_conf_thr = paddle.nonzero( + yolo_boxes_scores[:, :, -1] > self.conf_thresh) + boxes_idx_over_conf_thr.stop_gradient = True + + return boxes_idx_over_conf_thr, yolo_boxes_scores + + +@register +@serializable +class MaskMatrixNMS(object): + """ + Matrix NMS for multi-class masks. + Args: + update_threshold (float): Updated threshold of categroy score in second time. + pre_nms_top_n (int): Number of total instance to be kept per image before NMS + post_nms_top_n (int): Number of total instance to be kept per image after NMS. + kernel (str): 'linear' or 'gaussian'. + sigma (float): std in gaussian method. + Input: + seg_preds (Variable): shape (n, h, w), segmentation feature maps + seg_masks (Variable): shape (n, h, w), segmentation feature maps + cate_labels (Variable): shape (n), mask labels in descending order + cate_scores (Variable): shape (n), mask scores in descending order + sum_masks (Variable): a float tensor of the sum of seg_masks + Returns: + Variable: cate_scores, tensors of shape (n) + """ + + def __init__(self, + update_threshold=0.05, + pre_nms_top_n=500, + post_nms_top_n=100, + kernel='gaussian', + sigma=2.0): + super(MaskMatrixNMS, self).__init__() + self.update_threshold = update_threshold + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.kernel = kernel + self.sigma = sigma + + def _sort_score(self, scores, top_num): + if paddle.shape(scores)[0] > top_num: + return paddle.topk(scores, top_num)[1] + else: + return paddle.argsort(scores, descending=True) + + def __call__(self, + seg_preds, + seg_masks, + cate_labels, + cate_scores, + sum_masks=None): + # sort and keep top nms_pre + sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n) + seg_masks = paddle.gather(seg_masks, index=sort_inds) + seg_preds = paddle.gather(seg_preds, index=sort_inds) + sum_masks = 
paddle.gather(sum_masks, index=sort_inds) + cate_scores = paddle.gather(cate_scores, index=sort_inds) + cate_labels = paddle.gather(cate_labels, index=sort_inds) + + seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1) + # inter. + inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0])) + n_samples = paddle.shape(cate_labels) + # union. + sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples]) + # iou. + iou_matrix = (inter_matrix / ( + sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix)) + iou_matrix = paddle.triu(iou_matrix, diagonal=1) + # label_specific matrix. + cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples]) + label_matrix = paddle.cast( + (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])), + 'float32') + label_matrix = paddle.triu(label_matrix, diagonal=1) + + # IoU compensation + compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0) + compensate_iou = paddle.expand( + compensate_iou, shape=[n_samples, n_samples]) + compensate_iou = paddle.transpose(compensate_iou, [1, 0]) + + # IoU decay + decay_iou = iou_matrix * label_matrix + + # matrix nms + if self.kernel == 'gaussian': + decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2)) + compensate_matrix = paddle.exp(-1 * self.sigma * + (compensate_iou**2)) + decay_coefficient = paddle.min(decay_matrix / compensate_matrix, + axis=0) + elif self.kernel == 'linear': + decay_matrix = (1 - decay_iou) / (1 - compensate_iou) + decay_coefficient = paddle.min(decay_matrix, axis=0) + else: + raise NotImplementedError + + # update the score. 
def Conv2d(in_channels,
           out_channels,
           kernel_size,
           stride=1,
           padding=0,
           dilation=1,
           groups=1,
           bias=True,
           weight_init=Normal(std=0.001),
           bias_init=Constant(0.)):
    """Build an ``nn.Conv2D`` with the given weight and bias initializers.

    Args:
        in_channels, out_channels, kernel_size, stride, padding, dilation,
            groups: Forwarded to ``nn.Conv2D``.
        bias (bool): If false, the layer is created with ``bias_attr=False``.
        weight_init: Initializer for the weight ``ParamAttr``.
        bias_init: Initializer for the bias ``ParamAttr`` (when ``bias``).

    Returns:
        nn.Conv2D: The constructed layer.
    """
    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
    if bias:
        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
    else:
        bias_attr = False
    # Keywords keep each value bound to the intended parameter.
    conv = nn.Conv2D(
        in_channels,
        out_channels,
        kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        groups=groups,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return conv


def ConvTranspose2d(in_channels,
                    out_channels,
                    kernel_size,
                    stride=1,
                    padding=0,
                    output_padding=0,
                    groups=1,
                    bias=True,
                    dilation=1,
                    weight_init=Normal(std=0.001),
                    bias_init=Constant(0.)):
    """Build an ``nn.Conv2DTranspose`` with the given initializers.

    Args:
        in_channels, out_channels, kernel_size, stride, padding,
            output_padding, groups, dilation: Forwarded to
            ``nn.Conv2DTranspose``.
        bias (bool): If false, the layer is created with ``bias_attr=False``.
        weight_init: Initializer for the weight ``ParamAttr``.
        bias_init: Initializer for the bias ``ParamAttr`` (when ``bias``).

    Returns:
        nn.Conv2DTranspose: The constructed layer.
    """
    weight_attr = paddle.framework.ParamAttr(initializer=weight_init)
    if bias:
        bias_attr = paddle.framework.ParamAttr(initializer=bias_init)
    else:
        bias_attr = False
    # Fixed: ``dilation, groups`` used to be passed positionally, but
    # ``nn.Conv2DTranspose`` takes ``groups`` before ``dilation``, so the two
    # were swapped whenever they differed. Keywords remove the dependency on
    # positional order.
    conv = nn.Conv2DTranspose(
        in_channels,
        out_channels,
        kernel_size,
        stride=stride,
        padding=padding,
        output_padding=output_padding,
        groups=groups,
        dilation=dilation,
        weight_attr=weight_attr,
        bias_attr=bias_attr)
    return conv
def ReLU():
    """Return a new ``nn.ReLU`` layer."""
    return nn.ReLU()


def Upsample(scale_factor=None, mode='nearest', align_corners=False):
    """Return an ``nn.Upsample`` layer; the first positional argument is ``None``."""
    return nn.Upsample(None, scale_factor, mode, align_corners)


def MaxPool(kernel_size, stride, padding, ceil_mode=False):
    """Return an ``nn.MaxPool2D`` layer built from the given arguments."""
    return nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode)


class Concat(nn.Layer):
    """Layer that concatenates its inputs along a fixed axis.

    Args:
        dim (int): Axis passed to ``paddle.concat``.
    """

    def __init__(self, dim=0):
        super(Concat, self).__init__()
        self.dim = dim

    def forward(self, inputs):
        """Concatenate ``inputs`` along ``self.dim``."""
        return paddle.concat(inputs, axis=self.dim)

    def extra_repr(self):
        """Return the ``dim=...`` string for this layer."""
        return 'dim={}'.format(self.dim)
+ """ + return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) + + +@register +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + + Examples: + + .. 
code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim: + self.in_proj_weight = self.create_parameter( + shape=[embed_dim, 3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=False) + self.in_proj_bias = self.create_parameter( + shape=[3 * embed_dim], + attr=None, + dtype=self._dtype, + is_bias=True) + else: + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(self.kdim, embed_dim) + self.v_proj = nn.Linear(self.vdim, embed_dim) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + self._type_list = ('q_proj', 'k_proj', 'v_proj') + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + else: + constant_(p) + + def compute_qkv(self, tensor, index): + if self._qkv_same_embed_dim: + tensor = F.linear( + x=tensor, + weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) + * self.embed_dim], + bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * + self.embed_dim] + if self.in_proj_bias is not None 
else None) + else: + tensor = getattr(self, self._type_list[index])(tensor) + tensor = tensor.reshape( + [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + return tensor + + def forward(self, query, key=None, value=None, attn_mask=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. 
If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = (self.compute_qkv(t, i) + for i, t in enumerate([query, key, value])) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + scaling = float(self.head_dim)**-0.5 + product = product * scaling + + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, + self.dropout, + training=self.training, + mode="upscale_in_train") + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + return out if len(outs) == 1 else tuple(outs) + + +@register +class ConvMixer(nn.Layer): + def __init__( + self, + dim, + depth, + kernel_size=3, ): + super().__init__() + self.dim = dim + self.depth = depth + self.kernel_size = kernel_size + + self.mixer = self.conv_mixer(dim, depth, kernel_size) + + def forward(self, x): + return self.mixer(x) + + @staticmethod + def conv_mixer( + dim, + depth, + kernel_size, ): + Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim)) + Residual = type('Residual', (Seq, ), + {'forward': lambda self, x: self[0](x) + 
x}) + return Seq(* [ + Seq(Residual( + ActBn( + nn.Conv2D( + dim, dim, kernel_size, groups=dim, padding="same"))), + ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth) + ]) diff --git a/rtdetr_paddle/ppdet/modeling/losses/__init__.py b/rtdetr_paddle/ppdet/modeling/losses/__init__.py new file mode 100644 index 0000000..1f633cc --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/losses/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .iou_loss import * +from .gfocal_loss import * +from .detr_loss import * +from .focal_loss import * +from .smooth_l1_loss import * diff --git a/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py new file mode 100644 index 0000000..24f14c3 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py @@ -0,0 +1,578 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@register
class DETRLoss(nn.Layer):
    """Set-prediction loss for DETR-style detectors.

    Predictions are assigned to ground truths by ``matcher``; classification,
    box (L1 + GIoU) and optional mask (focal + dice) losses are then computed
    on the assigned pairs, for the last decoder layer and optionally for all
    auxiliary layers.
    """
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'no_object': 0.1,
                     'mask': 1,
                     'dice': 1
                 },
                 aux_loss=True,
                 use_focal_loss=False,
                 use_vfl=False,
                 use_uni_match=False,
                 uni_match_ind=0):
        r"""
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): It computes an assignment between the targets
                and the predictions of the network.
            loss_coeff (dict): The coefficient of loss. It is copied, never
                modified in place.
            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
            use_focal_loss (bool): Use focal loss or not.
            use_vfl (bool): With focal loss, use the varifocal loss whose
                target score is the IoU of each matched prediction.
            use_uni_match (bool): For auxiliary layers, reuse the matching
                computed on layer ``uni_match_ind`` instead of matching every
                layer separately.
            uni_match_ind (int): Index of the layer used when
                ``use_uni_match`` is True.
        """
        super(DETRLoss, self).__init__()

        self.num_classes = num_classes
        self.matcher = matcher
        # Work on a private copy: the 'class' entry is replaced by a tensor
        # below, and doing that on the argument itself would corrupt the
        # shared default dict (and any config dict handed in by the caller)
        # for every loss created afterwards.
        self.loss_coeff = dict(loss_coeff)
        self.aux_loss = aux_loss
        self.use_focal_loss = use_focal_loss
        self.use_vfl = use_vfl
        self.use_uni_match = use_uni_match
        self.uni_match_ind = uni_match_ind

        if not self.use_focal_loss:
            # Cross-entropy path: per-class weights with the last
            # (background) class weighted by 'no_object'.
            class_weight = paddle.full([num_classes + 1], loss_coeff['class'])
            class_weight[-1] = loss_coeff['no_object']
            self.loss_coeff['class'] = class_weight
        self.giou_loss = GIoULoss()

    def _get_loss_class(self,
                        logits,
                        gt_class,
                        match_indices,
                        bg_index,
                        num_gts,
                        postfix="",
                        iou_score=None):
        """Classification loss, returned as ``{"loss_class" + postfix: loss}``.

        Unmatched queries are labelled ``bg_index``. ``iou_score`` is only
        used on the focal-loss path when ``use_vfl`` is enabled.
        """
        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
        name_class = "loss_class" + postfix

        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
        bs, num_query_objects = target_label.shape
        num_gt = sum(len(a) for a in gt_class)
        if num_gt > 0:
            # Write the matched ground-truth labels into the flattened
            # [b * query] label map.
            index, updates = self._get_index_updates(num_query_objects,
                                                     gt_class, match_indices)
            target_label = paddle.scatter(
                target_label.reshape([-1, 1]), index, updates.astype('int64'))
            target_label = target_label.reshape([bs, num_query_objects])
        if self.use_focal_loss:
            # One-hot with an extra background slot that is then dropped, so
            # background queries become all-zero rows.
            target_label = F.one_hot(target_label,
                                     self.num_classes + 1)[..., :-1]
            if iou_score is not None and self.use_vfl:
                target_score = paddle.zeros([bs, num_query_objects])
                if num_gt > 0:
                    target_score = paddle.scatter(
                        target_score.reshape([-1, 1]), index, iou_score)
                target_score = target_score.reshape(
                    [bs, num_query_objects, 1]) * target_label
                loss_ = self.loss_coeff['class'] * varifocal_loss_with_logits(
                    logits, target_score, target_label,
                    num_gts / num_query_objects)
            else:
                loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
                    logits, target_label, num_gts / num_query_objects)
        else:
            loss_ = F.cross_entropy(
                logits, target_label, weight=self.loss_coeff['class'])
        return {name_class: loss_}

    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
                       postfix=""):
        """L1 and GIoU box losses over matched pairs, normalized by ``num_gts``.

        Returns a dict with ``"loss_bbox" + postfix`` and
        ``"loss_giou" + postfix``; both are zero when there is no ground truth.
        """
        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
        name_bbox = "loss_bbox" + postfix
        name_giou = "loss_giou" + postfix

        loss = dict()
        if sum(len(a) for a in gt_bbox) == 0:
            loss[name_bbox] = paddle.to_tensor([0.])
            loss[name_giou] = paddle.to_tensor([0.])
            return loss

        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
                                                            match_indices)
        loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
            src_bbox, target_bbox, reduction='sum') / num_gts
        loss[name_giou] = self.giou_loss(
            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
        loss[name_giou] = loss[name_giou].sum() / num_gts
        loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
        return loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
                       postfix=""):
        """Focal and dice mask losses over matched pairs.

        Returns a dict with ``"loss_mask" + postfix`` and
        ``"loss_dice" + postfix``; both are zero when there is no ground truth.
        """
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        name_mask = "loss_mask" + postfix
        name_dice = "loss_dice" + postfix

        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss[name_mask] = paddle.to_tensor([0.])
            loss[name_dice] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # Resize the predicted masks to the ground-truth resolution.
        src_masks = F.interpolate(
            src_masks.unsqueeze(0),
            size=target_masks.shape[-2:],
            mode="bilinear")[0]
        loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
            src_masks,
            target_masks,
            paddle.to_tensor(
                [num_gts], dtype='float32'))
        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _dice_loss(self, inputs, targets, num_gts):
        """Dice loss of ``sigmoid(inputs)`` against ``targets``, summed over
        masks and divided by ``num_gts``."""
        inputs = F.sigmoid(inputs)
        inputs = inputs.flatten(1)
        targets = targets.flatten(1)
        numerator = 2 * (inputs * targets).sum(1)
        denominator = inputs.sum(-1) + targets.sum(-1)
        # +1 in numerator and denominator smooths the ratio for empty masks.
        loss = 1 - (numerator + 1) / (denominator + 1)
        return loss.sum() / num_gts

    def _get_iou_score(self, boxes, gt_bbox, match_indices):
        """IoU between matched predicted and ground-truth boxes (VFL target).

        Returns None when ``use_vfl`` is off or there is no ground truth.
        The predictions are detached before the IoU is computed.
        """
        if not self.use_vfl or sum(len(a) for a in gt_bbox) == 0:
            return None
        src_bbox, target_bbox = self._get_src_target_assign(
            boxes.detach(), gt_bbox, match_indices)
        return bbox_iou(
            bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
            bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))

    def _get_loss_aux(self,
                      boxes,
                      logits,
                      gt_bbox,
                      gt_class,
                      bg_index,
                      num_gts,
                      dn_match_indices=None,
                      postfix="",
                      masks=None,
                      gt_mask=None):
        """Sum the per-layer losses of the auxiliary decoder outputs.

        ``boxes``, ``logits`` (and ``masks``) are iterated along their first
        axis, one entry per auxiliary layer. Matching comes from
        ``dn_match_indices`` when given, from layer ``uni_match_ind`` when
        ``use_uni_match`` is set, and from a per-layer matcher call otherwise.

        Returns a dict keyed ``"loss_<kind>_aux" + postfix``.
        """
        loss_class = []
        loss_bbox, loss_giou = [], []
        loss_mask, loss_dice = [], []
        if dn_match_indices is not None:
            match_indices = dn_match_indices
        elif self.use_uni_match:
            match_indices = self.matcher(
                boxes[self.uni_match_ind],
                logits[self.uni_match_ind],
                gt_bbox,
                gt_class,
                masks=masks[self.uni_match_ind] if masks is not None else None,
                gt_mask=gt_mask)
        for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
            aux_masks = masks[i] if masks is not None else None
            if not self.use_uni_match and dn_match_indices is None:
                match_indices = self.matcher(
                    aux_boxes,
                    aux_logits,
                    gt_bbox,
                    gt_class,
                    masks=aux_masks,
                    gt_mask=gt_mask)
            iou_score = self._get_iou_score(aux_boxes, gt_bbox, match_indices)
            loss_class.append(
                self._get_loss_class(aux_logits, gt_class, match_indices,
                                     bg_index, num_gts, postfix, iou_score)[
                                         'loss_class' + postfix])
            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
                                        num_gts, postfix)
            loss_bbox.append(loss_['loss_bbox' + postfix])
            loss_giou.append(loss_['loss_giou' + postfix])
            if masks is not None and gt_mask is not None:
                loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
                                            num_gts, postfix)
                loss_mask.append(loss_['loss_mask' + postfix])
                loss_dice.append(loss_['loss_dice' + postfix])
        loss = {
            "loss_class_aux" + postfix: paddle.add_n(loss_class),
            "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
            "loss_giou_aux" + postfix: paddle.add_n(loss_giou)
        }
        if masks is not None and gt_mask is not None:
            loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
            loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
        return loss

    def _get_index_updates(self, num_query_objects, target, match_indices):
        """Return flat query indices (into ``[b * num_query_objects]``) of the
        matched predictions and the matched target values, batch-concatenated."""
        batch_idx = paddle.concat([
            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
        ])
        src_idx = paddle.concat([src for (src, _) in match_indices])
        # Offset each query index by its image's position in the batch.
        src_idx += (batch_idx * num_query_objects)
        target_assign = paddle.concat([
            paddle.gather(
                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
        ])
        return src_idx, target_assign

    def _get_src_target_assign(self, src, target, match_indices):
        """Gather matched predictions and targets per image and concatenate
        them over the batch. Images without matches contribute an empty
        ``[0, last_dim]`` tensor."""
        src_assign = paddle.concat([
            paddle.gather(
                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (I, _) in zip(src, match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def _get_num_gts(self, targets, dtype="float32"):
        """Number of ground truths in the batch as a 1-element tensor.

        In distributed runs the count is all-reduced and divided by the world
        size; the result is clipped to a minimum of 1 so it can be used as a
        divisor.
        """
        num_gts = sum(len(a) for a in targets)
        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(num_gts)
            num_gts /= paddle.distributed.get_world_size()
        num_gts = paddle.clip(num_gts, min=1.)
        return num_gts

    def _get_prediction_loss(self,
                             boxes,
                             logits,
                             gt_bbox,
                             gt_class,
                             masks=None,
                             gt_mask=None,
                             postfix="",
                             dn_match_indices=None,
                             num_gts=1):
        """Losses for a single set of predictions (one decoder layer).

        Uses ``dn_match_indices`` when given, otherwise runs the matcher.
        Returns a dict with class and box losses, plus mask losses when both
        ``masks`` and ``gt_mask`` are provided.
        """
        if dn_match_indices is None:
            match_indices = self.matcher(
                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
        else:
            match_indices = dn_match_indices

        iou_score = self._get_iou_score(boxes, gt_bbox, match_indices)

        loss = dict()
        loss.update(
            self._get_loss_class(logits, gt_class, match_indices,
                                 self.num_classes, num_gts, postfix, iou_score))
        loss.update(
            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
                                postfix))
        if masks is not None and gt_mask is not None:
            loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
                                    postfix))
        return loss

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                **kwargs):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [l, b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]
            postfix (str): postfix of loss name
            **kwargs: Optional ``dn_match_indices`` (precomputed matching)
                and ``num_gts`` (normalizer; computed from ``gt_class`` when
                missing).

        Returns:
            dict: Losses of the last layer and, when ``aux_loss`` is set,
            the summed losses of all earlier layers.
        """

        dn_match_indices = kwargs.get("dn_match_indices", None)
        num_gts = kwargs.get("num_gts", None)
        if num_gts is None:
            num_gts = self._get_num_gts(gt_class)

        total_loss = self._get_prediction_loss(
            boxes[-1],
            logits[-1],
            gt_bbox,
            gt_class,
            masks=masks[-1] if masks is not None else None,
            gt_mask=gt_mask,
            postfix=postfix,
            dn_match_indices=dn_match_indices,
            num_gts=num_gts)

        if self.aux_loss:
            total_loss.update(
                self._get_loss_aux(
                    boxes[:-1],
                    logits[:-1],
                    gt_bbox,
                    gt_class,
                    self.num_classes,
                    num_gts,
                    dn_match_indices,
                    postfix,
                    masks=masks[:-1] if masks is not None else None,
                    gt_mask=gt_mask))

        return total_loss


@register
class DINOLoss(DETRLoss):
    """DETR loss extended with the denoising ("_dn") branch used by DINO."""

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                dn_out_bboxes=None,
                dn_out_logits=None,
                dn_meta=None,
                **kwargs):
        """Compute the matching loss plus the denoising loss.

        ``masks``, ``gt_mask`` and ``postfix`` are accepted for interface
        compatibility but are not forwarded to the base loss here.

        When ``dn_meta`` is None every ``*_dn`` entry is a zero tensor so the
        set of returned keys stays the same.
        """
        num_gts = self._get_num_gts(gt_class)
        total_loss = super(DINOLoss, self).forward(
            boxes, logits, gt_bbox, gt_class, num_gts=num_gts)

        if dn_meta is not None:
            dn_positive_idx, dn_num_group = \
                dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
            assert len(gt_class) == len(dn_positive_idx)

            # denoising match indices
            dn_match_indices = self.get_dn_match_indices(
                gt_class, dn_positive_idx, dn_num_group)

            # compute denoising training loss; every ground truth appears
            # once per denoising group, so the normalizer scales accordingly.
            dn_num_gts = num_gts * dn_num_group
            dn_loss = super(DINOLoss, self).forward(
                dn_out_bboxes,
                dn_out_logits,
                gt_bbox,
                gt_class,
                postfix="_dn",
                dn_match_indices=dn_match_indices,
                num_gts=dn_num_gts)
            total_loss.update(dn_loss)
        else:
            total_loss.update(
                {k + '_dn': paddle.to_tensor([0.])
                 for k in total_loss.keys()})

        return total_loss

    @staticmethod
    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
        """Build per-image ``(query_idx, gt_idx)`` pairs for denoising queries.

        For an image with ``n`` ground truths the gt indices are
        ``arange(n)`` tiled ``dn_num_group`` times and paired with
        ``dn_positive_idx[i]``; images without ground truth get empty pairs.
        """
        dn_match_indices = []
        for i in range(len(labels)):
            num_gt = len(labels[i])
            if num_gt > 0:
                gt_idx = paddle.arange(end=num_gt, dtype="int64")
                gt_idx = gt_idx.tile([dn_num_group])
                assert len(dn_positive_idx[i]) == len(gt_idx)
                dn_match_indices.append((dn_positive_idx[i], gt_idx))
            else:
                dn_match_indices.append((paddle.zeros(
                    [0], dtype="int64"), paddle.zeros(
                        [0], dtype="int64")))
        return dn_match_indices


@register
class MaskDINOLoss(DETRLoss):
    """DETR loss with a denoising branch and point-sampled mask losses."""
    __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 4,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 5,
                     'dice': 5
                 },
                 aux_loss=True,
                 use_focal_loss=False,
                 num_sample_points=12544,
                 oversample_ratio=3.0,
                 important_sample_ratio=0.75):
        """
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): Assigns predictions to targets.
            loss_coeff (dict): The coefficient of each loss term.
            aux_loss (bool): Whether auxiliary decoder layers are supervised.
            use_focal_loss (bool): Use focal loss or not.
            num_sample_points (int): Points sampled per mask for the mask
                losses.
            oversample_ratio (float): Candidate points drawn per kept point
                (must be >= 1).
            important_sample_ratio (float): Fraction in [0, 1] of the sampled
                points picked by uncertainty; the rest are uniformly random.
        """
        super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,
                                           aux_loss, use_focal_loss)
        assert oversample_ratio >= 1
        assert important_sample_ratio <= 1 and important_sample_ratio >= 0

        self.num_sample_points = num_sample_points
        self.oversample_ratio = oversample_ratio
        self.important_sample_ratio = important_sample_ratio
        self.num_oversample_points = int(num_sample_points * oversample_ratio)
        self.num_important_points = int(num_sample_points *
                                        important_sample_ratio)
        self.num_random_points = num_sample_points - self.num_important_points

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                dn_out_bboxes=None,
                dn_out_logits=None,
                dn_out_masks=None,
                dn_meta=None,
                **kwargs):
        """Compute the matching loss plus the denoising loss, masks included.

        When ``dn_meta`` is None every ``*_dn`` entry is a zero tensor so the
        set of returned keys stays the same.
        """
        num_gts = self._get_num_gts(gt_class)
        total_loss = super(MaskDINOLoss, self).forward(
            boxes,
            logits,
            gt_bbox,
            gt_class,
            masks=masks,
            gt_mask=gt_mask,
            num_gts=num_gts)

        if dn_meta is not None:
            dn_positive_idx, dn_num_group = \
                dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
            assert len(gt_class) == len(dn_positive_idx)

            # denoising match indices
            dn_match_indices = DINOLoss.get_dn_match_indices(
                gt_class, dn_positive_idx, dn_num_group)

            # compute denoising training loss; every ground truth appears
            # once per denoising group, so the normalizer scales accordingly.
            dn_num_gts = num_gts * dn_num_group
            dn_loss = super(MaskDINOLoss, self).forward(
                dn_out_bboxes,
                dn_out_logits,
                gt_bbox,
                gt_class,
                masks=dn_out_masks,
                gt_mask=gt_mask,
                postfix="_dn",
                dn_match_indices=dn_match_indices,
                num_gts=dn_num_gts)
            total_loss.update(dn_loss)
        else:
            total_loss.update(
                {k + '_dn': paddle.to_tensor([0.])
                 for k in total_loss.keys()})

        return total_loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
                       postfix=""):
        """Mask BCE and dice losses evaluated on sampled points only.

        Returns a dict with ``"loss_mask" + postfix`` and
        ``"loss_dice" + postfix``; both are zero when there is no ground truth.
        """
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        name_mask = "loss_mask" + postfix
        name_dice = "loss_dice" + postfix

        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss[name_mask] = paddle.to_tensor([0.])
            loss[name_dice] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # sample points; coordinates are mapped from [0, 1] to the [-1, 1]
        # range expected by grid_sample.
        sample_points = self._get_point_coords_by_uncertainty(src_masks)
        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0

        src_masks = F.grid_sample(
            src_masks.unsqueeze(1), sample_points,
            align_corners=False).squeeze([1, 2])

        target_masks = F.grid_sample(
            target_masks.unsqueeze(1), sample_points,
            align_corners=False).squeeze([1, 2]).detach()

        loss[name_mask] = self.loss_coeff[
            'mask'] * F.binary_cross_entropy_with_logits(
                src_masks, target_masks,
                reduction='none').mean(1).sum() / num_gts
        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _get_point_coords_by_uncertainty(self, masks):
        """Pick ``num_sample_points`` point coordinates in [0, 1] per mask.

        ``num_oversample_points`` random candidates are scored by
        ``-|logit|`` (logits near 0 are the most uncertain); the top
        ``num_important_points`` are kept and padded with
        ``num_random_points`` fresh random points.
        """
        # Sample points based on their uncertainty.
        masks = masks.detach()
        num_masks = masks.shape[0]
        sample_points = paddle.rand(
            [num_masks, 1, self.num_oversample_points, 2])

        out_mask = F.grid_sample(
            masks.unsqueeze(1), 2.0 * sample_points - 1.0,
            align_corners=False).squeeze([1, 2])
        out_mask = -paddle.abs(out_mask)

        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
        # Pair every top-k index with its mask index for gather_nd.
        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
        if self.num_random_points > 0:
            sample_points = paddle.concat(
                [
                    sample_points,
                    paddle.rand([num_masks, self.num_random_points, 2])
                ],
                axis=1)
        return sample_points
@register
class FocalLoss(nn.Layer):
    """A wrapper around paddle.nn.functional.sigmoid_focal_loss.
    Args:
        use_sigmoid (bool): currently only support use_sigmoid=True
        alpha (float): parameter alpha in Focal Loss
        gamma (float): parameter gamma in Focal Loss
        loss_weight (float): final loss will be multiplied by this
    """
    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.25,
                 gamma=2.0,
                 loss_weight=1.0):
        super(FocalLoss, self).__init__()
        assert use_sigmoid == True, \
            'Focal Loss only supports sigmoid at the moment'
        self.use_sigmoid = use_sigmoid
        self.alpha = alpha
        self.gamma = gamma
        self.loss_weight = loss_weight

    def forward(self, pred, target, reduction='none'):
        """forward function.
        Args:
            pred (Tensor): logits of class prediction, of shape (N, num_classes)
            target (Tensor): target class label, of shape (N, )
            reduction (str): the way to reduce loss, one of (none, sum, mean)
        Returns:
            Tensor: the focal loss multiplied by ``loss_weight``.
        """
        num_classes = pred.shape[1]
        # One-hot with one extra slot that is dropped again, so a label equal
        # to ``num_classes`` becomes an all-zero row.
        target = F.one_hot(target, num_classes+1).cast(pred.dtype)
        target = target[:, :-1].detach()
        loss = F.sigmoid_focal_loss(
            pred, target, alpha=self.alpha, gamma=self.gamma,
            reduction=reduction)
        return loss * self.loss_weight


@register
class Weighted_FocalLoss(FocalLoss):
    """Focal loss with optional per-sample weights and an averaging factor.

    A wrapper around paddle.nn.functional.sigmoid_focal_loss.
    Args:
        use_sigmoid (bool): currently only support use_sigmoid=True
        alpha (float): parameter alpha in Focal Loss
        gamma (float): parameter gamma in Focal Loss
        loss_weight (float): final loss will be multiplied by this
        reduction (str): default reduction, one of (none, sum, mean)
    """
    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.25,
                 gamma=2.0,
                 loss_weight=1.0,
                 reduction="mean"):
        # Delegate to FocalLoss.__init__ rather than skipping it with
        # ``super(FocalLoss, self)`` and re-implementing its body here; the
        # parent performs the same validation and sets the same attributes.
        super(Weighted_FocalLoss, self).__init__(
            use_sigmoid=use_sigmoid,
            alpha=alpha,
            gamma=gamma,
            loss_weight=loss_weight)
        self.reduction = reduction

    def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
        """forward function.
        Args:
            pred (Tensor): logits of class prediction, of shape (N, num_classes)
            target (Tensor): target class label, of shape (N, )
            weight (Tensor, optional): element-wise loss weight; reshaped to
                broadcast against the (N, num_classes) loss when its shape
                differs.
            avg_factor (float, optional): divisor used instead of the element
                count when the reduction is 'mean'.
            reduction_override (str, optional): one of (none, sum, mean);
                replaces ``self.reduction`` for this call when given.
        Returns:
            Tensor: the reduced loss multiplied by ``loss_weight``.
        Raises:
            ValueError: if ``avg_factor`` is given together with 'sum'.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        num_classes = pred.shape[1]
        target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
        target = target[:, :-1].detach()
        # Always compute the unreduced loss; weighting and reduction follow.
        loss = F.sigmoid_focal_loss(
            pred, target, alpha=self.alpha, gamma=self.gamma,
            reduction='none')

        if weight is not None:
            if weight.shape != loss.shape:
                if weight.shape[0] == loss.shape[0]:
                    # For most cases, weight is of shape (num_priors, ),
                    # which means it does not have the second axis num_class
                    weight = weight.reshape((-1, 1))
                else:
                    # Sometimes, weight per anchor per class is also needed. e.g.
                    # in FSAF. But it may be flattened of shape
                    # (num_priors x num_class, ), while loss is still of shape
                    # (num_priors, num_class).
                    assert weight.numel() == loss.numel()
                    weight = weight.reshape((loss.shape[0], -1))
            assert weight.ndim == loss.ndim
            loss = loss * weight

        # if avg_factor is not specified, just reduce the loss
        if avg_factor is None:
            if reduction == 'mean':
                loss = loss.mean()
            elif reduction == 'sum':
                loss = loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if reduction == 'mean':
                # Avoid causing ZeroDivisionError when avg_factor is 0.0,
                # i.e., all labels of an image belong to ignore index.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif reduction != 'none':
                raise ValueError('avg_factor can not be used with reduction="sum"')

        return loss * self.loss_weight
def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
    """
    Quality Focal Loss (QFL) from `Generalized Focal Loss: Learning
    Qualified and Distributed Bounding Boxes for Dense Object Detection
    <https://arxiv.org/abs/2006.04388>`_.

    Every (sample, class) entry is first treated as a negative supervised by
    a quality score of 0; entries that correspond to the ground-truth class
    of a foreground sample are then replaced by a term supervised by the
    sample's quality (IoU) score.

    Args:
        pred (Tensor): Predicted joint representation of classification
            and quality (IoU) estimation with shape (N, C), C is the number
            of classes.
        target (tuple([Tensor])): Target category label with shape (N,)
            and target quality label with shape (N,).
        beta (float): The beta parameter for calculating the modulating
            factor. Defaults to 2.0.
        use_sigmoid (bool): If True, ``pred`` is passed through a sigmoid
            and the loss uses ``binary_cross_entropy_with_logits``;
            otherwise ``pred`` is used as-is with ``binary_cross_entropy``.
            Defaults to True.

    Returns:
        Tensor: Loss tensor with shape (N,).
    """
    assert len(target) == 2, """target for QFL must be a tuple of two elements,
        including category label and quality label, respectively"""
    # first element: category id, second element: quality score
    label, score = target

    if use_sigmoid:
        bce = F.binary_cross_entropy_with_logits
        prob = F.sigmoid(pred)
    else:
        bce = F.binary_cross_entropy
        prob = pred

    # Number of classes; also the label id used for background.
    num_classes = pred.shape[1]

    # Negative branch: target quality 0, modulated by prob ** beta.
    neg_target = paddle.zeros(pred.shape, dtype='float32')
    loss = bce(pred, neg_target, reduction='none') * prob.pow(beta)

    # Foreground ids lie in [0, num_classes - 1].
    is_foreground = paddle.logical_and(label >= 0, label < num_classes)
    pos = is_foreground.nonzero().squeeze(1)
    if pos.shape[0] == 0:
        # No positive sample: only the negative branch contributes.
        return loss.sum(axis=1)

    # Boolean (N, C) mask marking the ground-truth class of each positive
    # sample; it is assembled with NumPy and converted back to a tensor.
    pos_label = paddle.gather(label, pos, axis=0)
    mask_np = np.zeros(pred.shape, dtype=np.int32)
    mask_np[pos.numpy(), pos_label.numpy()] = 1
    pos_mask = paddle.to_tensor(mask_np, dtype='bool')

    # Positive branch: target is the quality score, modulated by
    # |score - prob| ** beta.
    quality = score.unsqueeze(-1).expand([-1, num_classes]).cast('float32')
    modulator = (quality - prob).abs().pow(beta)
    loss_pos = bce(pred, quality, reduction='none') * modulator

    loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask
    return loss.sum(axis=1)
@register
@serializable
class QualityFocalLoss(nn.Layer):
    r"""Quality Focal Loss (QFL) layer, see `Generalized Focal Loss:
    Learning Qualified and Distributed Bounding Boxes for Dense Object
    Detection <https://arxiv.org/abs/2006.04388>`_.

    Args:
        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
            Defaults to True.
        beta (float): The beta parameter for calculating the modulating
            factor. Defaults to 2.0.
        reduction (str): Options are "none", "mean" and "sum".
        loss_weight (float): Loss weight of current loss.
    """

    def __init__(self,
                 use_sigmoid=True,
                 beta=2.0,
                 reduction='mean',
                 loss_weight=1.0):
        super(QualityFocalLoss, self).__init__()
        self.use_sigmoid = use_sigmoid
        self.beta = beta
        assert reduction in ('none', 'mean', 'sum')
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): Predicted joint representation of
                classification and quality (IoU) estimation with shape
                (N, C), C is the number of classes.
            target (tuple([Tensor])): Target category label with shape
                (N,) and target quality label with shape (N,).
            weight (Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to
                average the loss. Defaults to None.

        Returns:
            Tensor: The per-sample loss for ``reduction='none'``, otherwise
            the reduced loss.

        Raises:
            ValueError: If ``avg_factor`` is given together with
                ``reduction='sum'``.
        """

        loss = self.loss_weight * quality_focal_loss(
            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)

        if weight is not None:
            loss = loss * weight
        if avg_factor is None:
            if self.reduction == 'none':
                return loss
            elif self.reduction == 'mean':
                return loss.mean()
            elif self.reduction == 'sum':
                return loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if self.reduction == 'mean':
                # Guard against avg_factor == 0 (e.g. no positive sample),
                # which would otherwise produce inf/NaN.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif self.reduction != 'none':
                raise ValueError(
                    'avg_factor can not be used with reduction="sum"')
        return loss
+ """ + loss = self.loss_weight * distribution_focal_loss(pred, target) + if weight is not None: + loss = loss * weight + if avg_factor is None: + if self.reduction == 'none': + return loss + elif self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + else: + # if reduction is mean, then average the loss by avg_factor + if self.reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif self.reduction != 'none': + raise ValueError( + 'avg_factor can not be used with reduction="sum"') + return loss diff --git a/rtdetr_paddle/ppdet/modeling/losses/iou_loss.py b/rtdetr_paddle/ppdet/modeling/losses/iou_loss.py new file mode 100644 index 0000000..b5cac22 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/losses/iou_loss.py @@ -0,0 +1,295 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@register
@serializable
class IouLoss(object):
    """
    IoU loss, see https://arxiv.org/abs/1908.03851

    loss = loss_weight * (1 - iou * iou)   if loss_square
    loss = loss_weight * (1 - iou)         otherwise

    Args:
        loss_weight (float): factor the loss is multiplied by, default 2.5
        giou (bool): forwarded to ``bbox_iou`` as its ``giou`` flag
        diou (bool): forwarded to ``bbox_iou`` as its ``diou`` flag
        ciou (bool): forwarded to ``bbox_iou`` as its ``ciou`` flag
        loss_square (bool): whether to square the iou term
    """

    def __init__(self,
                 loss_weight=2.5,
                 giou=False,
                 diou=False,
                 ciou=False,
                 loss_square=True):
        self.loss_weight = loss_weight
        self.giou = giou
        self.diou = diou
        self.ciou = ciou
        self.loss_square = loss_square

    def __call__(self, pbox, gbox):
        """Return the weighted IoU loss between ``pbox`` and ``gbox``."""
        iou = bbox_iou(
            pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)
        overlap_term = iou * iou if self.loss_square else iou
        return (1 - overlap_term) * self.loss_weight
default as none + """ + + def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): + self.loss_weight = loss_weight + self.eps = eps + assert reduction in ('none', 'mean', 'sum') + self.reduction = reduction + + def bbox_overlap(self, box1, box2, eps=1e-10): + """calculate the iou of box1 and box2 + Args: + box1 (Tensor): box1 with the shape (..., 4) + box2 (Tensor): box1 with the shape (..., 4) + eps (float): epsilon to avoid divide by zero + Return: + iou (Tensor): iou of box1 and box2 + overlap (Tensor): overlap of box1 and box2 + union (Tensor): union of box1 and box2 + """ + x1, y1, x2, y2 = box1 + x1g, y1g, x2g, y2g = box2 + + xkis1 = paddle.maximum(x1, x1g) + ykis1 = paddle.maximum(y1, y1g) + xkis2 = paddle.minimum(x2, x2g) + ykis2 = paddle.minimum(y2, y2g) + w_inter = (xkis2 - xkis1).clip(0) + h_inter = (ykis2 - ykis1).clip(0) + overlap = w_inter * h_inter + + area1 = (x2 - x1) * (y2 - y1) + area2 = (x2g - x1g) * (y2g - y1g) + union = area1 + area2 - overlap + eps + iou = overlap / union + + return iou, overlap, union + + def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None): + x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) + x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) + box1 = [x1, y1, x2, y2] + box2 = [x1g, y1g, x2g, y2g] + iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) + xc1 = paddle.minimum(x1, x1g) + yc1 = paddle.minimum(y1, y1g) + xc2 = paddle.maximum(x2, x2g) + yc2 = paddle.maximum(y2, y2g) + + area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps + miou = iou - ((area_c - union) / area_c) + if loc_reweight is not None: + loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) + loc_thresh = 0.9 + giou = 1 - (1 - loc_thresh + ) * miou - loc_thresh * miou * loc_reweight + else: + giou = 1 - miou + if self.reduction == 'none': + loss = giou + elif self.reduction == 'sum': + loss = paddle.sum(giou * iou_weight) + else: + loss = paddle.mean(giou * iou_weight) + return loss * 
@register
@serializable
class DIouLoss(GIoULoss):
    """
    Distance-IoU Loss, see https://arxiv.org/abs/1911.08287

    The loss is ``1 - iou + diou_term (+ ciou_term)``, multiplied by
    ``iou_weight``, averaged with ``paddle.mean`` and finally scaled by
    ``loss_weight``. The ``reduction`` attribute inherited from ``GIoULoss``
    is not consulted here.

    Args:
        loss_weight (float): diou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        use_complete_iou_loss (bool): whether to add the aspect-ratio
            (complete IoU) term
    """

    def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True):
        super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps)
        self.use_complete_iou_loss = use_complete_iou_loss

    def __call__(self, pbox, gbox, iou_weight=1.):
        """Compute the (C/D)IoU loss.

        Args:
            pbox (Tensor): predicted boxes, split into 4 parts along the
                last axis as (x1, y1, x2, y2).
            gbox (Tensor): target boxes, same layout as ``pbox``.
            iou_weight (float|Tensor): multiplier applied before the mean.

        Returns:
            Tensor: mean-reduced loss multiplied by ``loss_weight``.
        """
        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
        cx = (x1 + x2) / 2
        cy = (y1 + y2) / 2
        w = x2 - x1
        h = y2 - y1

        cxg = (x1g + x2g) / 2
        cyg = (y1g + y2g) / 2
        wg = x2g - x1g
        hg = y2g - y1g

        # Make sure the predicted box has x2 >= x1 and y2 >= y1.
        x2 = paddle.maximum(x1, x2)
        y2 = paddle.maximum(y1, y2)

        # A and B
        xkis1 = paddle.maximum(x1, x1g)
        ykis1 = paddle.maximum(y1, y1g)
        xkis2 = paddle.minimum(x2, x2g)
        ykis2 = paddle.minimum(y2, y2g)

        # A or B
        xc1 = paddle.minimum(x1, x1g)
        yc1 = paddle.minimum(y1, y1g)
        xc2 = paddle.maximum(x2, x2g)
        yc2 = paddle.maximum(y2, y2g)

        # Intersection area, zeroed where the boxes do not overlap.
        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)
        intsctk = intsctk * paddle.greater_than(
            xkis2, xkis1) * paddle.greater_than(ykis2, ykis1)
        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g
                                                        ) - intsctk + self.eps
        iouk = intsctk / unionk

        # DIOU term: squared centre distance over squared enclosing diagonal.
        dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg)
        dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1)
        diou_term = (dist_intersection + self.eps) / (dist_union + self.eps)

        # CIOU term
        ciou_term = 0
        if self.use_complete_iou_loss:
            # eps keeps the aspect ratios finite for zero-height boxes
            # (0 / 0 would otherwise turn the whole loss into NaN).
            ar_gt = wg / (hg + self.eps)
            ar_pred = w / (h + self.eps)
            arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred)
            ar_loss = 4. / np.pi / np.pi * arctan * arctan
            alpha = ar_loss / (1 - iouk + ar_loss + self.eps)
            # alpha is a weighting factor only; no gradient flows through it.
            alpha.stop_gradient = True
            ciou_term = alpha * ar_loss

        diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight)

        return diou * self.loss_weight
@register
class SmoothL1Loss(nn.Layer):
    """Smooth L1 Loss.

    Args:
        beta (float): width of the quadratic region; values below 1e-5
            make the loss a plain L1 loss
        loss_weight (float): the final loss will be multiplied by this
    """

    def __init__(self, beta=1.0, loss_weight=1.0):
        super(SmoothL1Loss, self).__init__()
        assert beta >= 0
        self.beta = beta
        self.loss_weight = loss_weight

    def forward(self, pred, target, reduction='none'):
        """forward function, based on fvcore.

        Args:
            pred (Tensor): prediction tensor
            target (Tensor): target tensor, pred.shape must be the same as
                target.shape; it is detached before use
            reduction (str): the way to reduce loss, one of
                (none, sum, mean)

        Returns:
            Tensor: the (optionally reduced) loss times ``loss_weight``.
        """
        assert reduction in ('none', 'sum', 'mean')
        target = target.detach()
        abs_err = paddle.abs(pred - target)

        if self.beta < 1e-5:
            # Degenerate smooth region: plain L1.
            loss = abs_err
        else:
            quadratic = 0.5 * abs_err ** 2 / self.beta
            linear = abs_err - 0.5 * self.beta
            loss = paddle.where(abs_err < self.beta, quadratic, linear)

        if reduction == 'sum':
            loss = loss.sum()
        elif reduction == 'mean':
            if loss.size > 0:
                loss = loss.mean()
            else:
                # Empty input: use a zero-scaled sum instead of mean().
                loss = 0.0 * loss.sum()
        return loss * self.loss_weight
+ +# The code is based on: +# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppdet.core.workspace import register, serializable +from ppdet.modeling import ops + +__all__ = ['VarifocalLoss'] + + +def varifocal_loss(pred, + target, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + use_sigmoid=True): + """`Varifocal Loss `_ + + Args: + pred (Tensor): The prediction with shape (N, C), C is the + number of classes + target (Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal Loss. + Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive example with the iou target. Defaults to True. 
@register
@serializable
class VarifocalLoss(nn.Layer):
    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.75,
                 gamma=2.0,
                 iou_weighted=True,
                 reduction='mean',
                 loss_weight=1.0):
        """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_

        Args:
            use_sigmoid (bool, optional): Whether the prediction is
                used for sigmoid or softmax. Defaults to True.
            alpha (float, optional): A balance factor for the negative part of
                Varifocal Loss, which is different from the alpha of Focal
                Loss. Defaults to 0.75.
            gamma (float, optional): The gamma for calculating the modulating
                factor. Defaults to 2.0.
            iou_weighted (bool, optional): Whether to weight the loss of the
                positive examples with the iou target. Defaults to True.
            reduction (str, optional): The method used to reduce the loss into
                a scalar. Defaults to 'mean'. Options are "none", "mean" and
                "sum".
            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
        """
        super(VarifocalLoss, self).__init__()
        assert alpha >= 0.0
        # Reject unknown reductions up front; otherwise forward() would fall
        # through every branch and silently return None.
        assert reduction in ('none', 'mean', 'sum')
        self.use_sigmoid = use_sigmoid
        self.alpha = alpha
        self.gamma = gamma
        self.iou_weighted = iou_weighted
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): The prediction.
            target (Tensor): The learning target of the prediction.
            weight (Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.

        Returns:
            Tensor: The calculated loss

        Raises:
            ValueError: If ``avg_factor`` is given together with
                ``reduction='sum'``.
        """
        loss = self.loss_weight * varifocal_loss(
            pred,
            target,
            alpha=self.alpha,
            gamma=self.gamma,
            iou_weighted=self.iou_weighted,
            use_sigmoid=self.use_sigmoid)

        if weight is not None:
            loss = loss * weight
        if avg_factor is None:
            if self.reduction == 'none':
                return loss
            elif self.reduction == 'mean':
                return loss.mean()
            elif self.reduction == 'sum':
                return loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if self.reduction == 'mean':
                # Guard against avg_factor == 0 (e.g. no positive sample),
                # which would otherwise produce inf/NaN.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif self.reduction != 'none':
                raise ValueError(
                    'avg_factor can not be used with reduction="sum"')
        return loss
def identity(x):
    """Return ``x`` unchanged; used when no activation is requested."""
    return x


def mish(x):
    """Mish activation, ``x * tanh(softplus(x))``.

    Uses ``F.mish`` when the functional module provides it and falls back
    to the explicit formula otherwise.
    """
    # hasattr() needs the attribute *name* as a string.
    return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x))


def silu(x):
    """SiLU activation, delegated to ``F.silu``."""
    return F.silu(x)


def swish(x):
    """Swish activation written out as ``x * sigmoid(x)``."""
    return x * F.sigmoid(x)


# Activations substituted when ``trt=True``: both names map to the explicit
# ``x * sigmoid(x)`` form.
TRT_ACT_SPEC = {'swish': swish, 'silu': swish}

# Activations implemented in this module; any other name is looked up on F.
ACT_SPEC = {'mish': mish, 'silu': silu}


def get_act_fn(act=None, trt=False):
    """Resolve an activation spec to a callable.

    Args:
        act (str|dict|None): Activation name, or a dict with a ``'name'``
            key whose remaining items are passed to the activation as
            keyword arguments. A falsy value selects ``identity``.
        trt (bool): If True, names listed in ``TRT_ACT_SPEC`` take
            precedence over ``ACT_SPEC``.

    Returns:
        callable: ``identity`` for a falsy ``act``, otherwise a function of
        one argument that applies the activation with the configured kwargs.

    Raises:
        KeyError: If ``act`` is a non-empty dict without a ``'name'`` key.
        AttributeError: If the name is in neither table nor on ``F``.
    """
    assert act is None or isinstance(act, (
        str, dict)), 'name of activation should be str, dict or None'
    if not act:
        return identity

    if isinstance(act, dict):
        # Work on a copy so the caller's config dict is left intact and can
        # be reused for further layers.
        kwargs = dict(act)
        name = kwargs.pop('name')
    else:
        name = act
        kwargs = dict()

    if trt and name in TRT_ACT_SPEC:
        fn = TRT_ACT_SPEC[name]
    elif name in ACT_SPEC:
        fn = ACT_SPEC[name]
    else:
        fn = getattr(F, name)

    return lambda x: fn(x, **kwargs)
@paddle.jit.not_to_static
def anchor_generator(input,
                     anchor_sizes=None,
                     aspect_ratios=None,
                     variance=[0.1, 0.1, 0.2, 0.2],
                     stride=None,
                     offset=0.5):
    """
    **Anchor generator operator**
    Generate anchors for Faster RCNN algorithm.
    Each position of the input produce N anchors, N =
    size(anchor_sizes) * size(aspect_ratios). The order of generated anchors
    is firstly aspect_ratios loop then anchor_sizes loop.
    Args:
        input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map.
        anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated
            anchors, given in absolute pixels e.g. [64., 128., 256., 512.].
            For instance, the anchor size of 64 means the area of this anchor
            equals to 64**2. None by default.
        aspect_ratios(float32|list|tuple, optional): The height / width ratios
            of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default.
        variance(list|tuple, optional): The variances to be used in box
            regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by
            default. The default list is only read, never modified, here.
        stride(list|tuple, optional): The anchors stride across width and height.
            The data type is float32. e.g. [16.0, 16.0]. Although the default
            is None, a list or tuple of length 2 is required.
        offset(float32, optional): Prior boxes center offset. 0.5 by default.
    Returns:
        Tuple:
        Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
            H is the height of input, W is the width of input,
            num_anchors is the box count of each position.
            Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.

        Variances(Variable): The expanded variances of anchors
            with a layout of [H, W, num_priors, 4].
            H is the height of input, W is the width of input
            num_anchors is the box count of each position.
            Each variance is in (xcenter, ycenter, w, h) format.
    Raises:
        ValueError: If ``stride`` is not a list or tuple of length 2.
    Examples:
        .. code-block:: python
            import paddle.fluid as fluid
            conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32')
            anchor, var = fluid.layers.anchor_generator(
                input=conv1,
                anchor_sizes=[64, 128, 256, 512],
                aspect_ratios=[0.5, 1.0, 2.0],
                variance=[0.1, 0.1, 0.2, 0.2],
                stride=[16.0, 16.0],
                offset=0.5)
    """

    def _is_list_or_tuple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))

    # Scalars are promoted to one-element lists.
    if not _is_list_or_tuple_(anchor_sizes):
        anchor_sizes = [anchor_sizes]
    if not _is_list_or_tuple_(aspect_ratios):
        aspect_ratios = [aspect_ratios]
    if not (_is_list_or_tuple_(stride) and len(stride) == 2):
        # One concatenated message; two separate arguments would make the
        # exception render as a tuple.
        raise ValueError('stride should be a list or tuple '
                         'with length 2, (stride_width, stride_height).')

    anchor_sizes = list(map(float, anchor_sizes))
    aspect_ratios = list(map(float, aspect_ratios))
    stride = list(map(float, stride))

    if in_dynamic_mode():
        # Dynamic mode: attributes are passed as a flat (name, value, ...)
        # sequence straight to the C op.
        attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios,
                 'variances', variance, 'stride', stride, 'offset', offset)
        anchor, var = C_ops.anchor_generator(input, *attrs)
        return anchor, var

    # NOTE: LayerHelper receives every local defined so far via locals(), so
    # keep this call before any further local variables are introduced.
    helper = LayerHelper("anchor_generator", **locals())
    dtype = helper.input_dtype()
    attrs = {
        'anchor_sizes': anchor_sizes,
        'aspect_ratios': aspect_ratios,
        'variances': variance,
        'stride': stride,
        'offset': offset
    }

    anchor = helper.create_variable_for_type_inference(dtype)
    var = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
        type="anchor_generator",
        inputs={"Input": input},
        outputs={"Anchors": anchor,
                 "Variances": var},
        attrs=attrs, )
    # Both outputs are marked as not requiring gradients.
    anchor.stop_gradient = True
    var.stop_gradient = True
    return anchor, var
+ + Returns: + Tuple: + + multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] + and data type of float32 and float64. The length is + max_level-min_level+1. The proposals in each FPN level. + + restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is + the number of total rois. The data type is int32. It is + used to restore the order of fpn_rois. + + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32. B is the number of images + + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + fpn_rois = paddle.static.data( + name='data', shape=[None, 4], dtype='float32', lod_level=1) + multi_rois, restore_ind = ops.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224) + """ + num_lvl = max_level - min_level + 1 + + if in_dynamic_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." 
+ attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', + refer_level, 'refer_scale', refer_scale, 'pixel_offset', + pixel_offset) + multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + + return multi_rois, restore_ind, rois_num_per_level + + else: + check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], + 'distribute_fpn_proposals') + helper = LayerHelper('distribute_fpn_proposals', **locals()) + dtype = helper.input_dtype('fpn_rois') + multi_rois = [ + helper.create_variable_for_type_inference(dtype) + for i in range(num_lvl) + ] + + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'FpnRois': fpn_rois} + outputs = { + 'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind, + } + + if rois_num is not None: + inputs['RoisNum'] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype='int32') + for i in range(num_lvl) + ] + outputs['MultiLevelRoIsNum'] = rois_num_per_level + else: + rois_num_per_level = None + + helper.append_op( + type='distribute_fpn_proposals', + inputs=inputs, + outputs=outputs, + attrs={ + 'min_level': min_level, + 'max_level': max_level, + 'refer_level': refer_level, + 'refer_scale': refer_scale, + 'pixel_offset': pixel_offset + }) + return multi_rois, restore_ind, rois_num_per_level + + +@paddle.jit.not_to_static +def prior_box(input, + image, + min_sizes, + max_sizes=None, + aspect_ratios=[1.], + variance=[0.1, 0.1, 0.2, 0.2], + flip=False, + clip=False, + steps=[0.0, 0.0], + offset=0.5, + min_max_aspect_ratios_order=False, + name=None): + """ + + This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. 
+ Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + + Parameters: + input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. + image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, + the data type should be float32 or float64. + min_sizes(list|tuple|float): the min sizes of generated prior boxes. + max_sizes(list|tuple|None): the max sizes of generated prior boxes. + Default: None. + aspect_ratios(list|tuple|float): the aspect ratios of generated + prior boxes. Default: [1.]. + variance(list|tuple): the variances to be encoded in prior boxes. + Default:[0.1, 0.1, 0.2, 0.2]. + flip(bool): Whether to flip aspect ratios. Default:False. + clip(bool): Whether to clip out-of-boundary boxes. Default: False. + step(list|tuple): Prior boxes step across width and height, If + step[0] equals to 0.0 or step[1] equals to 0.0, the prior boxes step across + height or weight of the input will be automatically calculated. + Default: [0., 0.] + offset(float): Prior boxes center offset. Default: 0.5 + min_max_aspect_ratios_order(bool): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the final + detection results. Default: False. + name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tuple: A tuple with two Variable (boxes, variances) + + boxes(Tensor): the output prior boxes of PriorBox. + 4-D tensor, the layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total box count of each position of input. 
+ + variances(Tensor): the expanded variances of PriorBox. + 4-D tensor, the layput is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total box count of each position of input + + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + + paddle.enable_static() + input = paddle.static.data(name="input", shape=[None,3,6,9]) + image = paddle.static.data(name="image", shape=[None,3,9,12]) + box, var = ops.prior_box( + input=input, + image=image, + min_sizes=[100.], + clip=True, + flip=True) + """ + helper = LayerHelper("prior_box", **locals()) + dtype = helper.input_dtype() + check_variable_and_dtype( + input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box') + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(min_sizes): + min_sizes = [min_sizes] + if not _is_list_or_tuple_(aspect_ratios): + aspect_ratios = [aspect_ratios] + if not (_is_list_or_tuple_(steps) and len(steps) == 2): + raise ValueError('steps should be a list or tuple ', + 'with length 2, (step_width, step_height).') + + min_sizes = list(map(float, min_sizes)) + aspect_ratios = list(map(float, aspect_ratios)) + steps = list(map(float, steps)) + + cur_max_sizes = None + if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: + if not _is_list_or_tuple_(max_sizes): + max_sizes = [max_sizes] + cur_max_sizes = max_sizes + + if in_dynamic_mode(): + attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios, + 'variances', variance, 'flip', flip, 'clip', clip, 'step_w', + steps[0], 'step_h', steps[1], 'offset', offset, + 'min_max_aspect_ratios_order', min_max_aspect_ratios_order) + if cur_max_sizes is not None: + attrs += ('max_sizes', cur_max_sizes) + box, var = C_ops.prior_box(input, image, *attrs) + return box, var + else: + attrs = { + 'min_sizes': min_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 
'flip': flip, + 'clip': clip, + 'step_w': steps[0], + 'step_h': steps[1], + 'offset': offset, + 'min_max_aspect_ratios_order': min_max_aspect_ratios_order + } + + if cur_max_sizes is not None: + attrs['max_sizes'] = cur_max_sizes + + box = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="prior_box", + inputs={"Input": input, + "Image": image}, + outputs={"Boxes": box, + "Variances": var}, + attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + +@paddle.jit.not_to_static +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=-1, + return_index=False, + return_rois_num=True, + rois_num=None, + name=None): + """ + This operator is to do multi-class non maximum suppression (NMS) on + boxes and scores. + In the NMS step, this operator greedily selects a subset of detection bounding + boxes that have high scores larger than score_threshold, if providing this + threshold, then selects the largest nms_top_k confidences scores if nms_top_k + is larger than -1. Then this operator pruns away boxes that have high IOU + (intersection over union) overlap with already selected boxes by adaptive + threshold NMS based on parameters of nms_threshold and nms_eta. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): Two types of bboxes are supported: + 1. (Tensor) A 3-D Tensor with shape + [N, M, 4 or 8 16 24 32] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + 2. 
(LoDTensor) A 3-D Tensor with shape [M, C, 4] + M is the number of bounding boxes, C is the + class number + scores (Tensor): Two types of scores are supported: + 1. (Tensor) A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. + 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. + M is the number of bbox, C is the class number. + In this case, input BBoxes should be the second + case with shape [M, C, 4]. + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. If not provided, + consider all boxes. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + nms_threshold (float): The threshold to be used in NMS. Default: 0.3 + nms_eta (float): The threshold to be used in NMS. Default: 1.0 + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. + name(str): Name of the multiclass nms op. Default: None. 
+ Returns: + A tuple with two Variables: (Out, Index) if return_index is True, + otherwise, a tuple with one Variable(Out) is returned. + Out: A 2-D LoDTensor with shape [No, 6] represents the detections. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + or A 2-D LoDTensor with shape [No, 10] represents the detections. + Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, + x4, y4]. No is the total number of detections. + If all images have not detected results, all elements in LoD will be + 0, and output tensor is empty (None). + Index: Only return when return_index is True. A 2-D LoDTensor with + shape [No, 1] represents the selected index which type is Integer. + The index is the absolute value cross batches. No is the same number + as Out. If the index is used to gather other attribute such as age, + one needs to reshape the input(N, M, 1) to (N * M, 1) as first, where + N is the batch size and M is the number of boxes. + Examples: + .. code-block:: python + + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[81], + dtype='float32', lod_level=1) + out, index = ops.multiclass_nms(bboxes=boxes, + scores=scores, + background_label=0, + score_threshold=0.5, + nms_top_k=400, + nms_threshold=0.3, + keep_top_k=200, + normalized=False, + return_index=True) + """ + helper = LayerHelper('multiclass_nms3', **locals()) + + if in_dynamic_mode(): + attrs = ('background_label', background_label, 'score_threshold', + score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', + nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, + 'normalized', normalized) + output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, + rois_num, *attrs) + if not return_index: + index = None + return output, nms_rois_num, index + + else: + output = 
helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'BBoxes': bboxes, 'Scores': scores} + outputs = {'Out': output, 'Index': index} + + if rois_num is not None: + inputs['RoisNum'] = rois_num + + if return_rois_num: + nms_rois_num = helper.create_variable_for_type_inference( + dtype='int32') + outputs['NmsRoisNum'] = nms_rois_num + + helper.append_op( + type="multiclass_nms3", + inputs=inputs, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + index.stop_gradient = True + if not return_index: + index = None + if not return_rois_num: + nms_rois_num = None + + return output, nms_rois_num, index + + +@paddle.jit.not_to_static +def matrix_nms(bboxes, + scores, + score_threshold, + post_threshold, + nms_top_k, + keep_top_k, + use_gaussian=False, + gaussian_sigma=2., + background_label=0, + normalized=True, + return_index=False, + return_rois_num=True, + name=None): + """ + **Matrix NMS** + This operator does matrix non maximum suppression (NMS). + First selects a subset of candidate bounding boxes that have higher scores + than score_threshold (if provided), then the top k candidate is selected if + nms_top_k is larger than -1. Score of the remaining candidate are then + decayed according to the Matrix NMS scheme. + Aftern NMS step, at most keep_top_k number of total bboxes are to be kept + per image if keep_top_k is larger than -1. + Args: + bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the + predicted locations of M bounding bboxes, + N is the batch size. Each bounding box has four + coordinate values and the layout is + [xmin, ymin, xmax, ymax], when box size equals to 4. + The data type is float32 or float64. 
+ scores (Tensor): A 3-D Tensor with shape [N, C, M] + represents the predicted confidence predictions. + N is the batch size, C is the class number, M is + number of bounding boxes. For each category there + are total M scores which corresponding M bounding + boxes. Please note, M is equal to the 2nd dimension + of BBoxes. The data type is float32 or float64. + score_threshold (float): Threshold to filter out bounding boxes with + low confidence score. + post_threshold (float): Threshold to filter out bounding boxes with + low confidence score AFTER decaying. + nms_top_k (int): Maximum number of detections to be kept according to + the confidences after the filtering detections based + on score_threshold. + keep_top_k (int): Number of total bboxes to be kept per image after NMS + step. -1 means keeping all bboxes after NMS step. + use_gaussian (bool): Use Gaussian as the decay function. Default: False + gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 + background_label (int): The index of background label, the background + label will be ignored. If set to -1, then all + categories will be considered. Default: 0 + normalized (bool): Whether detections are normalized. Default: True + return_index(bool): Whether return selected index. Default: False + return_rois_num(bool): whether return rois_num. Default: True + name(str): Name of the matrix nms op. Default: None. + Returns: + A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, + otherwise, a tuple with two Tensor (Out, RoisNum) is returned. + Out (Tensor): A 2-D Tensor with shape [No, 6] containing the + detection results. + Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] + (After version 1.3, when no boxes detected, the lod is changed + from {0} to {1}) + Index (Tensor): A 2-D Tensor with shape [No, 1] containing the + selected indices, which are absolute values cross batches. 
+ rois_num (Tensor): A 1-D Tensor with shape [N] containing + the number of detected boxes in each image. + Examples: + .. code-block:: python + import paddle + from ppdet.modeling import ops + boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], + dtype='float32', lod_level=1) + scores = paddle.static.data(name='scores', shape=[None,81], + dtype='float32', lod_level=1) + out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, + score_threshold=0.5, post_threshold=0.1, + nms_top_k=400, keep_top_k=200, normalized=False) + """ + check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], + 'matrix_nms') + check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], + 'matrix_nms') + check_type(score_threshold, 'score_threshold', float, 'matrix_nms') + check_type(post_threshold, 'post_threshold', float, 'matrix_nms') + check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms') + check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') + check_type(normalized, 'normalized', bool, 'matrix_nms') + check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') + check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') + check_type(background_label, 'background_label', int, 'matrix_nms') + + if in_dynamic_mode(): + attrs = ('background_label', background_label, 'score_threshold', + score_threshold, 'post_threshold', post_threshold, 'nms_top_k', + nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', + use_gaussian, 'keep_top_k', keep_top_k, 'normalized', + normalized) + out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return out, rois_num, index + else: + helper = LayerHelper('matrix_nms', **locals()) + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int32') + outputs = {'Out': output, 'Index': index} + if return_rois_num: + rois_num = 
helper.create_variable_for_type_inference(dtype='int32') + outputs['RoisNum'] = rois_num + + helper.append_op( + type="matrix_nms", + inputs={'BBoxes': bboxes, + 'Scores': scores}, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'post_threshold': post_threshold, + 'nms_top_k': nms_top_k, + 'gaussian_sigma': gaussian_sigma, + 'use_gaussian': use_gaussian, + 'keep_top_k': keep_top_k, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + + if not return_index: + index = None + if not return_rois_num: + rois_num = None + return output, rois_num, index + + +@paddle.jit.not_to_static +def box_coder(prior_box, + prior_box_var, + target_box, + code_type="encode_center_size", + box_normalized=True, + axis=0, + name=None): + r""" + **Box Coder Layer** + Encode/Decode the target bounding box with the priorbox information. + + The Encoding schema described below: + .. math:: + ox = (tx - px) / pw / pxv + oy = (ty - py) / ph / pyv + ow = \log(\abs(tw / pw)) / pwv + oh = \log(\abs(th / ph)) / phv + The Decoding schema described below: + + .. math:: + + ox = (pw * pxv * tx * + px) - tw / 2 + oy = (ph * pyv * ty * + py) - th / 2 + ow = \exp(pwv * tw) * pw + tw / 2 + oh = \exp(phv * th) * ph + th / 2 + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. 
+ + Args: + prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape + [M, 4] holds M boxes and data type is float32 or float64. Each box + is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the + left top coordinate of the anchor box, if the input is image feature + map, they are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the anchor box. + prior_box_var(List|Tensor|None): prior_box_var supports three types + of input. One is Tensor with shape [M, 4] which holds M group and + data type is float32 or float64. The second is list consist of + 4 elements shared by all boxes and data type is float32 or float64. + Other is None and not involved in calculation. + target_box(Tensor): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + code_type(str): The code type used with the target box. It can be + `encode_center_size` or `decode_center_size`. `encode_center_size` + by default. + box_normalized(bool): Whether treat the priorbox as a normalized box. + Set true by default. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and + PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: + output_box(Tensor): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. 
+ When code_type is 'decode_center_size', N represents the batch size + and M represents the number of decoded boxes. + + Examples: + + .. code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + # For encode + prior_box_encode = paddle.static.data(name='prior_box_encode', + shape=[512, 4], + dtype='float32') + target_box_encode = paddle.static.data(name='target_box_encode', + shape=[81, 4], + dtype='float32') + output_encode = ops.box_coder(prior_box=prior_box_encode, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box_encode, + code_type="encode_center_size") + # For decode + prior_box_decode = paddle.static.data(name='prior_box_decode', + shape=[512, 4], + dtype='float32') + target_box_decode = paddle.static.data(name='target_box_decode', + shape=[512, 81, 4], + dtype='float32') + output_decode = ops.box_coder(prior_box=prior_box_decode, + prior_box_var=[0.1,0.1,0.2,0.2], + target_box=target_box_decode, + code_type="decode_center_size", + box_normalized=False, + axis=1) + """ + check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], + 'box_coder') + check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], + 'box_coder') + + if in_dynamic_mode(): + if isinstance(prior_box_var, Variable): + output_box = C_ops.box_coder( + prior_box, prior_box_var, target_box, "code_type", code_type, + "box_normalized", box_normalized, "axis", axis) + + elif isinstance(prior_box_var, list): + output_box = C_ops.box_coder( + prior_box, None, target_box, "code_type", code_type, + "box_normalized", box_normalized, "axis", axis, "variance", + prior_box_var) + else: + raise TypeError( + "Input variance of box_coder must be Variable or list") + return output_box + else: + helper = LayerHelper("box_coder", **locals()) + + output_box = helper.create_variable_for_type_inference( + dtype=prior_box.dtype) + + inputs = {"PriorBox": prior_box, "TargetBox": target_box} + attrs = { + "code_type": code_type, + 
"box_normalized": box_normalized, + "axis": axis + } + if isinstance(prior_box_var, Variable): + inputs['PriorBoxVar'] = prior_box_var + elif isinstance(prior_box_var, list): + attrs['variance'] = prior_box_var + else: + raise TypeError( + "Input variance of box_coder must be Variable or list") + helper.append_op( + type="box_coder", + inputs=inputs, + attrs=attrs, + outputs={"OutputBox": output_box}) + return output_box + + +@paddle.jit.not_to_static +def generate_proposals(scores, + bbox_deltas, + im_shape, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + pixel_offset=False, + return_rois_num=False, + name=None): + """ + **Generate proposal Faster-RCNN** + This operation proposes RoIs according to each box with their + probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltais and scores + to be an object are the output of RPN. Final proposals + could be used to train detection net. + For generating proposals, this operation performs following steps: + 1. Transposes and resizes scores and bbox_deltas in size of + (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposals candidates. + 3. Clip boxes to image + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + Args: + scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents + the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and + width of the feature map. The data type must be float32. + bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] + represents the difference between predicted box location and + anchor location. The data type must be float32. + im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the + origin image size or input size. The data type can be float32 or + float64. + anchors(Tensor): A 4-D Tensor represents the anchors with a layout + of [H, W, A, 4]. 
H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is + in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32. + variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of + [H, W, num_priors, 4]. Each variance is in + (xcenter, ycenter, w, h) format. The data type must be float32. + pre_nms_top_n(float): Number of total bboxes to be kept per + image before NMS. The data type must be float32. `6000` by default. + post_nms_top_n(float): Number of total bboxes to be kept per + image after NMS. The data type must be float32. `1000` by default. + nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. + min_size(float): Remove predicted boxes with either height or + width < min_size. The data type must be float32. `0.1` by default. + eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, + `adaptive_threshold = adaptive_threshold * eta` in each iteration. + return_rois_num(bool): When setting True, it will return a 1D Tensor with shape [N, ] that includes Rois's + num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents + the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. + 'False' by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + tuple: + A tuple with format ``(rpn_rois, rpn_roi_probs)``. + - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. + - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. + + Examples: + .. 
code-block:: python + + import paddle + from ppdet.modeling import ops + paddle.enable_static() + scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') + bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') + im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') + anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') + variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') + rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, + im_shape, anchors, variances) + """ + if in_dynamic_mode(): + assert return_rois_num, "return_rois_num should be True in dygraph mode." + attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, + 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, + 'pixel_offset', pixel_offset) + rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( + scores, bbox_deltas, im_shape, anchors, variances, *attrs) + if not return_rois_num: + rpn_rois_num = None + return rpn_rois, rpn_roi_probs, rpn_rois_num + + else: + helper = LayerHelper('generate_proposals_v2', **locals()) + + check_variable_and_dtype(scores, 'scores', ['float32'], + 'generate_proposals_v2') + check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], + 'generate_proposals_v2') + check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'], + 'generate_proposals_v2') + check_variable_and_dtype(anchors, 'anchors', ['float32'], + 'generate_proposals_v2') + check_variable_and_dtype(variances, 'variances', ['float32'], + 'generate_proposals_v2') + + rpn_rois = helper.create_variable_for_type_inference( + dtype=bbox_deltas.dtype) + rpn_roi_probs = helper.create_variable_for_type_inference( + dtype=scores.dtype) + outputs = { + 'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs, + } + if return_rois_num: + rpn_rois_num = helper.create_variable_for_type_inference( + 
dtype='int32') + rpn_rois_num.stop_gradient = True + outputs['RpnRoisNum'] = rpn_rois_num + + helper.append_op( + type="generate_proposals_v2", + inputs={ + 'Scores': scores, + 'BboxDeltas': bbox_deltas, + 'ImShape': im_shape, + 'Anchors': anchors, + 'Variances': variances + }, + attrs={ + 'pre_nms_topN': pre_nms_top_n, + 'post_nms_topN': post_nms_top_n, + 'nms_thresh': nms_thresh, + 'min_size': min_size, + 'eta': eta, + 'pixel_offset': pixel_offset + }, + outputs=outputs) + rpn_rois.stop_gradient = True + rpn_roi_probs.stop_gradient = True + if not return_rois_num: + rpn_rois_num = None + + return rpn_rois, rpn_roi_probs, rpn_rois_num + + +def sigmoid_cross_entropy_with_logits(input, + label, + ignore_index=-100, + normalize=False): + output = F.binary_cross_entropy_with_logits(input, label, reduction='none') + mask_tensor = paddle.cast(label != ignore_index, 'float32') + output = paddle.multiply(output, mask_tensor) + if normalize: + sum_valid_mask = paddle.sum(mask_tensor) + output = output / sum_valid_mask + return output + + +def smooth_l1(input, label, inside_weight=None, outside_weight=None, + sigma=None): + input_new = paddle.multiply(input, inside_weight) + label_new = paddle.multiply(label, inside_weight) + delta = 1 / (sigma * sigma) + out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) + out = paddle.multiply(out, outside_weight) + out = out / delta + out = paddle.reshape(out, shape=[out.shape[0], -1]) + out = paddle.sum(out, axis=1) + return out + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + assert num_channels % groups == 0, 'num_channels should be divisible by groups' + channels_per_group = num_channels // groups + x = paddle.reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +def get_static_shape(tensor): + shape 
= paddle.shape(tensor) + shape.stop_gradient = True + return shape diff --git a/rtdetr_paddle/ppdet/modeling/post_process.py b/rtdetr_paddle/ppdet/modeling/post_process.py new file mode 100644 index 0000000..795bb5c --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/post_process.py @@ -0,0 +1,244 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F +from ppdet.core.workspace import register +from .transformers import bbox_cxcywh_to_xyxy + +__all__ = [ + 'DETRPostProcess', +] + +@register +class DETRPostProcess(object): + __shared__ = ['num_classes', 'use_focal_loss', 'with_mask'] + __inject__ = [] + + def __init__(self, + num_classes=80, + num_top_queries=100, + dual_queries=False, + dual_groups=0, + use_focal_loss=False, + with_mask=False, + mask_threshold=0.5, + use_avg_mask_score=False, + bbox_decode_type='origin'): + super(DETRPostProcess, self).__init__() + assert bbox_decode_type in ['origin', 'pad'] + + self.num_classes = num_classes + self.num_top_queries = num_top_queries + self.dual_queries = dual_queries + self.dual_groups = dual_groups + self.use_focal_loss = use_focal_loss + self.with_mask = with_mask + self.mask_threshold = mask_threshold + self.use_avg_mask_score = use_avg_mask_score + self.bbox_decode_type = bbox_decode_type + + def _mask_postprocess(self, mask_pred, score_pred, index): + mask_score = 
F.sigmoid(paddle.gather_nd(mask_pred, index)) + mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) + if self.use_avg_mask_score: + avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( + mask_pred.sum([-2, -1]) + 1e-6) + score_pred *= avg_mask_score + + return mask_pred[0].astype('int32'), score_pred + + def __call__(self, head_out, im_shape, scale_factor, pad_shape): + """ + Decode the bbox and mask. + + Args: + head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. + im_shape (Tensor): The shape of the input image without padding. + scale_factor (Tensor): The scale factor of the input image. + pad_shape (Tensor): The shape of the input image with padding. + Returns: + bbox_pred (Tensor): The output prediction with shape [N, 6], including + labels, scores and bboxes. The size of bboxes are corresponding + to the input image, the bboxes may be used in other branch. + bbox_num (Tensor): The number of prediction boxes of each batch with + shape [bs], and is N. 
+ """ + bboxes, logits, masks = head_out + if self.dual_queries: + num_queries = logits.shape[1] + logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \ + bboxes[:, :int(num_queries // (self.dual_groups + 1)), :] + + bbox_pred = bbox_cxcywh_to_xyxy(bboxes) + # calculate the original shape of the image + origin_shape = paddle.floor(im_shape / scale_factor + 0.5) + img_h, img_w = paddle.split(origin_shape, 2, axis=-1) + if self.bbox_decode_type == 'pad': + # calculate the shape of the image with padding + out_shape = pad_shape / im_shape * origin_shape + out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) + elif self.bbox_decode_type == 'origin': + out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) + else: + raise Exception( + f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.') + bbox_pred *= out_shape + + scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax( + logits)[:, :, :-1] + + if not self.use_focal_loss: + scores, labels = scores.max(-1), scores.argmax(-1) + if scores.shape[1] > self.num_top_queries: + scores, index = paddle.topk( + scores, self.num_top_queries, axis=-1) + batch_ind = paddle.arange( + end=scores.shape[0]).unsqueeze(-1).tile( + [1, self.num_top_queries]) + index = paddle.stack([batch_ind, index], axis=-1) + labels = paddle.gather_nd(labels, index) + bbox_pred = paddle.gather_nd(bbox_pred, index) + else: + scores, index = paddle.topk( + scores.flatten(1), self.num_top_queries, axis=-1) + labels = index % self.num_classes + index = index // self.num_classes + batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( + [1, self.num_top_queries]) + index = paddle.stack([batch_ind, index], axis=-1) + bbox_pred = paddle.gather_nd(bbox_pred, index) + + mask_pred = None + if self.with_mask: + assert masks is not None + masks = F.interpolate( + masks, scale_factor=4, mode="bilinear", align_corners=False) + # TODO: Support prediction with bs>1. 
+ # remove padding for input image + h, w = im_shape.astype('int32')[0] + masks = masks[..., :h, :w] + # get pred_mask in the original resolution. + img_h = img_h[0].astype('int32') + img_w = img_w[0].astype('int32') + masks = F.interpolate( + masks, + size=(img_h, img_w), + mode="bilinear", + align_corners=False) + mask_pred, scores = self._mask_postprocess(masks, scores, index) + + bbox_pred = paddle.concat( + [ + labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1), + bbox_pred + ], + axis=-1) + bbox_num = paddle.to_tensor( + self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]]) + bbox_pred = bbox_pred.reshape([-1, 6]) + return bbox_pred, bbox_num, mask_pred + + + +def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False): + """ + Paste the mask prediction to the original image. + """ + x0_int, y0_int = 0, 0 + x1_int, y1_int = im_w, im_h + x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) + N = masks.shape[0] + img_y = paddle.arange(y0_int, y1_int) + 0.5 + img_x = paddle.arange(x0_int, x1_int) + 0.5 + + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 + img_x = (img_x - x0) / (x1 - x0) * 2 - 1 + # img_x, img_y have shapes (N, w), (N, h) + + if assign_on_cpu: + paddle.set_device('cpu') + gx = img_x[:, None, :].expand( + [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) + gy = img_y[:, :, None].expand( + [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) + grid = paddle.stack([gx, gy], axis=3) + img_masks = F.grid_sample(masks, grid, align_corners=False) + return img_masks[:, 0] + + +def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): + final_boxes = [] + for c in range(num_classes): + idxs = bboxs[:, 0] == c + if np.count_nonzero(idxs) == 0: continue + r = nms(bboxs[idxs, 1:], match_threshold, match_metric) + final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) + return final_boxes + + +def nms(dets, match_threshold=0.6, match_metric='iou'): + """ Apply NMS to avoid detecting too many overlapping 
bounding boxes. + Args: + dets: shape [N, 5], [score, x1, y1, x2, y2] + match_metric: 'iou' or 'ios' + match_threshold: overlap thresh for match metric. + """ + if dets.shape[0] == 0: + return dets[[], :] + scores = dets[:, 0] + x1 = dets[:, 1] + y1 = dets[:, 2] + x2 = dets[:, 3] + y2 = dets[:, 4] + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int32) + + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + if match_metric == 'iou': + union = iarea + areas[j] - inter + match_value = inter / union + elif match_metric == 'ios': + smaller = min(iarea, areas[j]) + match_value = inter / smaller + else: + raise ValueError() + if match_value >= match_threshold: + suppressed[j] = 1 + keep = np.where(suppressed == 0)[0] + dets = dets[keep, :] + return dets diff --git a/rtdetr_paddle/ppdet/modeling/shape_spec.py b/rtdetr_paddle/ppdet/modeling/shape_spec.py new file mode 100644 index 0000000..81601fd --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/shape_spec.py @@ -0,0 +1,25 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on: +# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py + +from collections import namedtuple + + +class ShapeSpec( + namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): + def __new__(cls, channels=None, height=None, width=None, stride=None): + return super(ShapeSpec, cls).__new__(cls, channels, height, width, + stride) diff --git a/rtdetr_paddle/ppdet/modeling/transformers/__init__.py b/rtdetr_paddle/ppdet/modeling/transformers/__init__.py new file mode 100644 index 0000000..47f09bf --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .utils import * +from .matchers import * +from .position_encoding import * +from .rtdetr_transformer import * +from .dino_transformer import * +from .hybrid_encoder import * \ No newline at end of file diff --git a/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py b/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py new file mode 100644 index 0000000..ab05704 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py @@ -0,0 +1,537 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention +from .position_encoding import PositionEmbedding +from .utils import _get_clones, get_valid_ratio +from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ + +__all__ = ['DeformableTransformer'] + + +class MSDeformableAttention(nn.Layer): + def __init__(self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.attention_weights = 
nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + try: + # use cuda op + from deformable_detr_ops import ms_deformable_attn + except: + # use paddle func + from .utils import deformable_attention_core_func as ms_deformable_attn + self.ms_deformable_attn_core = ms_deformable_attn + + self._reset_parameters() + + def _reset_parameters(self): + # sampling_offsets + constant_(self.sampling_offsets.weight) + thetas = paddle.arange( + self.num_heads, + dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( + [1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange( + 1, self.num_points + 1, + dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init *= scaling + self.sampling_offsets.bias.set_value(grid_init.flatten()) + # attention_weights + constant_(self.attention_weights.weight) + constant_(self.attention_weights.bias) + # proj + xavier_uniform_(self.value_proj.weight) + constant_(self.value_proj.bias) + xavier_uniform_(self.output_proj.weight) + constant_(self.output_proj.bias) + + def forward(self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] 
+ value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + assert int(value_spatial_shapes.prod(1).sum()) == Len_v + + value = self.value_proj(value) + if value_mask is not None: + value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = value_spatial_shapes.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
+ format(reference_points.shape[-1])) + + output = self.ms_deformable_attn_core( + value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights) + output = self.output_proj(output) + + return output + + +class DeformableTransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_points=4, + lr_mult=0.1, + weight_attr=None, + bias_attr=None): + super(DeformableTransformerEncoderLayer, self).__init__() + # self attention + self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, lr_mult) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward(self, + src, + reference_points, + spatial_shapes, + level_start_index, + src_mask=None, + query_pos_embed=None): + # self attention + src2 = self.self_attn( + self.with_pos_embed(src, query_pos_embed), reference_points, src, + spatial_shapes, level_start_index, src_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + # ffn + src = self.forward_ffn(src) + + return src + + +class 
DeformableTransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers): + super(DeformableTransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): + valid_ratios = valid_ratios.unsqueeze(1) + reference_points = [] + for i, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = paddle.meshgrid( + paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) + ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * + H) + ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * + W) + reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) + reference_points = paddle.concat(reference_points, 1).unsqueeze(2) + reference_points = reference_points * valid_ratios + return reference_points + + def forward(self, + feat, + spatial_shapes, + level_start_index, + feat_mask=None, + query_pos_embed=None, + valid_ratios=None): + if valid_ratios is None: + valid_ratios = paddle.ones( + [feat.shape[0], spatial_shapes.shape[0], 2]) + reference_points = self.get_reference_points(spatial_shapes, + valid_ratios) + for layer in self.layers: + feat = layer(feat, reference_points, spatial_shapes, + level_start_index, feat_mask, query_pos_embed) + + return feat + + +class DeformableTransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_points=4, + lr_mult=0.1, + weight_attr=None, + bias_attr=None): + super(DeformableTransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, + n_points, lr_mult) + 
self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm( + d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + xavier_uniform_(self.linear1.weight) + xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt2 = self.self_attn(q, k, value=tgt) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, + memory_spatial_shapes, memory_level_start_index, memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +class DeformableTransformerDecoder(nn.Layer): + def __init__(self, decoder_layer, num_layers, return_intermediate=False): + super(DeformableTransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + memory_mask=None, + 
query_pos_embed=None): + output = tgt + intermediate = [] + for lid, layer in enumerate(self.layers): + output = layer(output, reference_points, memory, + memory_spatial_shapes, memory_level_start_index, + memory_mask, query_pos_embed) + + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return paddle.stack(intermediate) + + return output.unsqueeze(0) + + +@register +class DeformableTransformer(nn.Layer): + __shared__ = ['hidden_dim'] + + def __init__(self, + num_queries=300, + position_embed_type='sine', + return_intermediate_dec=True, + in_feats_channel=[512, 1024, 2048], + num_feature_levels=4, + num_encoder_points=4, + num_decoder_points=4, + hidden_dim=256, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0.1, + activation="relu", + lr_mult=0.1, + pe_temperature=10000, + pe_offset=-0.5): + super(DeformableTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
+ assert len(in_feats_channel) <= num_feature_levels + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.num_feature_levels = num_feature_levels + + encoder_layer = DeformableTransformerEncoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, + num_feature_levels, num_encoder_points, lr_mult) + self.encoder = DeformableTransformerEncoder(encoder_layer, + num_encoder_layers) + + decoder_layer = DeformableTransformerDecoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, + num_feature_levels, num_decoder_points) + self.decoder = DeformableTransformerDecoder( + decoder_layer, num_decoder_layers, return_intermediate_dec) + + self.level_embed = nn.Embedding(num_feature_levels, hidden_dim) + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) + + self.reference_points = nn.Linear( + hidden_dim, + 2, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + + self.input_proj = nn.LayerList() + for in_channels in in_feats_channel: + self.input_proj.append( + nn.Sequential( + nn.Conv2D( + in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim))) + in_channels = in_feats_channel[-1] + for _ in range(num_feature_levels - len(in_feats_channel)): + self.input_proj.append( + nn.Sequential( + nn.Conv2D( + in_channels, + hidden_dim, + kernel_size=3, + stride=2, + padding=1), + nn.GroupNorm(32, hidden_dim))) + in_channels = hidden_dim + + self.position_embedding = PositionEmbedding( + hidden_dim // 2, + temperature=pe_temperature, + normalize=True if position_embed_type == 'sine' else False, + embed_type=position_embed_type, + offset=pe_offset, + eps=1e-4) + + self._reset_parameters() + + def _reset_parameters(self): + normal_(self.level_embed.weight) + normal_(self.tgt_embed.weight) + normal_(self.query_pos_embed.weight) + xavier_uniform_(self.reference_points.weight) + constant_(self.reference_points.bias) + for l in 
self.input_proj: + xavier_uniform_(l[0].weight) + constant_(l[0].bias) + + @classmethod + def from_config(cls, cfg, input_shape): + return {'in_feats_channel': [i.channels for i in input_shape], } + + def forward(self, src_feats, src_mask=None, *args, **kwargs): + srcs = [] + for i in range(len(src_feats)): + srcs.append(self.input_proj[i](src_feats[i])) + if self.num_feature_levels > len(srcs): + len_srcs = len(srcs) + for i in range(len_srcs, self.num_feature_levels): + if i == len_srcs: + srcs.append(self.input_proj[i](src_feats[-1])) + else: + srcs.append(self.input_proj[i](srcs[-1])) + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + valid_ratios = [] + for level, src in enumerate(srcs): + src_shape = paddle.shape(src) + bs = src_shape[0:1] + h = src_shape[2:3] + w = src_shape[3:4] + spatial_shapes.append(paddle.concat([h, w])) + src = src.flatten(2).transpose([0, 2, 1]) + src_flatten.append(src) + if src_mask is not None: + mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] + else: + mask = paddle.ones([bs, h, w]) + valid_ratios.append(get_valid_ratio(mask)) + pos_embed = self.position_embedding(mask).flatten(1, 2) + lvl_pos_embed = pos_embed + self.level_embed.weight[level] + lvl_pos_embed_flatten.append(lvl_pos_embed) + mask = mask.flatten(1) + mask_flatten.append(mask) + src_flatten = paddle.concat(src_flatten, 1) + mask_flatten = None if src_mask is None else paddle.concat(mask_flatten, + 1) + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) + # [l, 2] + spatial_shapes = paddle.to_tensor( + paddle.stack(spatial_shapes).astype('int64')) + # [l], 每一个level的起始index + level_start_index = paddle.concat([ + paddle.zeros( + [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] + ]) + # [b, l, 2] + valid_ratios = paddle.stack(valid_ratios, 1) + + # encoder + memory = self.encoder(src_flatten, spatial_shapes, level_start_index, + mask_flatten, lvl_pos_embed_flatten, valid_ratios) + + # prepare 
input for decoder + bs, _, c = memory.shape + query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + reference_points = F.sigmoid(self.reference_points(query_embed)) + reference_points_input = reference_points.unsqueeze( + 2) * valid_ratios.unsqueeze(1) + + # decoder + hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes, + level_start_index, mask_flatten, query_embed) + + return (hs, memory, reference_points) diff --git a/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py b/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py new file mode 100644 index 0000000..efeb320 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py @@ -0,0 +1,359 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppdet.core.workspace import register +from ..layers import MultiHeadAttention, _convert_attention_mask +from .position_encoding import PositionEmbedding +from .utils import _get_clones +from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_ + +__all__ = ['DETRTransformer'] + + +class TransformerEncoderLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerEncoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not 
self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Layer): + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, pos_embed=None): + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoderLayer(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False): + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + self._reset_parameters() + + def _reset_parameters(self): + 
linear_init_(self.linear1) + linear_init_(self.linear2) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + pos_embed=None, + query_pos_embed=None): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + q = k = self.with_pos_embed(tgt, query_pos_embed) + tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + q = self.with_pos_embed(tgt, query_pos_embed) + k = self.with_pos_embed(memory, pos_embed) + tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt + + +class TransformerDecoder(nn.Layer): + def __init__(self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + pos_embed=None, + query_pos_embed=None): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + output = tgt + intermediate = [] + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + pos_embed=pos_embed, + query_pos_embed=query_pos_embed) + if self.return_intermediate: + intermediate.append(self.norm(output)) + 
@register
class DETRTransformer(nn.Layer):
    """Single-scale DETR transformer (encoder + decoder).

    Only the last backbone feature map is used. It is projected to
    ``hidden_dim`` channels with a 1x1 convolution, flattened into a token
    sequence, encoded, and then decoded with ``num_queries`` learned query
    position embeddings.
    """
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 pe_temperature=10000,
                 pe_offset=0.,
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        # Arguments shared by the encoder and decoder layer prototypes.
        layer_args = (hidden_dim, nhead, dim_feedforward, dropout, activation,
                      attn_dropout, act_dropout, normalize_before)

        # Encoder: a final LayerNorm is only attached in pre-norm mode.
        enc_layer = TransformerEncoderLayer(*layer_args)
        enc_norm = None
        if normalize_before:
            enc_norm = nn.LayerNorm(hidden_dim)
        self.encoder = TransformerEncoder(enc_layer, num_encoder_layers,
                                          enc_norm)

        # Decoder: always followed by a LayerNorm.
        dec_layer = TransformerDecoderLayer(*layer_args)
        dec_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            dec_layer,
            num_decoder_layers,
            dec_norm,
            return_intermediate=return_intermediate_dec)

        # 1x1 conv mapping backbone channels to the transformer width.
        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=(position_embed_type == 'sine'),
            embed_type=position_embed_type,
            offset=pe_offset)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise weights: Xavier for every parameter with more than one
        dimension, then dedicated init for the input projection and the query
        embedding (which overrides the Xavier pass for those two)."""
        for param in self.parameters():
            if param.dim() > 1:
                xavier_uniform_(param)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``backbone_num_channels`` from the last entry of
        ``input_shape`` (each entry must expose ``.channels``)."""
        channels = [shape.channels for shape in input_shape]
        return {'backbone_num_channels': channels[-1], }

    def _convert_attention_mask(self, mask):
        """Turn a 1/0 keep-mask into an additive mask: 1 -> 0, 0 -> -1e9."""
        return (mask - 1.0) * 1e9

    def forward(self, src, src_mask=None, *args, **kwargs):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
                Only the last one is used.
            src_mask (Tensor, optional): Padding mask with shape [bs, H, W];
                it is resized to the projected feature resolution. When None,
                an all-ones mask is used. In training mode the mask is
                converted to an additive attention mask; in eval mode no
                attention mask is passed to the encoder/decoder. Default None.

        Returns:
            output (Tensor): decoder output,
                [num_levels, batch_size, num_queries, hidden_dim]
            memory (Tensor): encoder output reshaped to
                [batch_size, hidden_dim, h, w]
            src_proj (Tensor): the projected last-level feature map
            src_mask (Tensor|None): additive mask reshaped to
                [batch_size, 1, 1, h, w] in training mode, otherwise None
        """
        # use last level feature map
        projected = self.input_proj(src[-1])
        bs, c, h, w = paddle.shape(projected)
        # flatten [B, C, H, W] to [B, HxW, C]
        tokens = projected.flatten(2).transpose([0, 2, 1])

        if src_mask is None:
            src_mask = paddle.ones([bs, h, w])
        else:
            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
        pos_embed = self.position_embedding(src_mask).flatten(1, 2)

        # The additive attention mask is only built while training.
        attn_mask = None
        if self.training:
            attn_mask = self._convert_attention_mask(src_mask)
            attn_mask = attn_mask.reshape([bs, 1, 1, h * w])

        memory = self.encoder(
            tokens, src_mask=attn_mask, pos_embed=pos_embed)

        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=attn_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        if self.training:
            out_mask = attn_mask.reshape([bs, 1, 1, h, w])
        else:
            out_mask = None

        memory_map = memory.transpose([0, 2, 1]).reshape([bs, c, h, w])
        return (output, memory_map, projected, out_mask)
class DINOTransformerDecoderLayer(nn.Layer):
    """One DINO decoder layer: query self-attention, multi-scale deformable
    cross-attention over the encoder memory, and a feed-forward network.
    Each sub-block is followed by a residual add and a LayerNorm."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=1.0,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # cross attention
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # The activation is looked up by name in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then overwrite their weights with
        Xavier-uniform values (biases keep the ``linear_init_`` result)."""
        ffn_linears = (self.linear1, self.linear2)
        for linear in ffn_linears:
            linear_init_(linear)
        for linear in ffn_linears:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """linear1 -> activation -> dropout3 -> linear2."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run the layer on the query tensor ``tgt`` and return the updated
        queries.

        ``attn_mask``, when given, is treated as a keep-mask: entries that
        cast to True become 0 and the rest become -inf before being handed to
        self-attention. The remaining arguments are forwarded unchanged to the
        deformable cross-attention module.
        """
        # self attention (queries and keys carry the positional embedding,
        # values do not)
        qk = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            keep = attn_mask.astype('bool')
            zeros = paddle.zeros(attn_mask.shape, tgt.dtype)
            neg_inf = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(keep, zeros, neg_inf)
        attended = self.self_attn(qk, qk, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(attended))

        # cross attention
        attended = self.cross_attn(
            self.with_pos_embed(tgt, query_pos_embed), reference_points,
            memory, memory_spatial_shapes, memory_level_start_index,
            memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attended))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
class DINOTransformerDecoder(nn.Layer):
    """Stack of cloned decoder layers with iterative box refinement.

    After every layer the reference boxes are refined by the matching entry
    of ``bbox_head``; the normalised layer output and the refined boxes of
    each layer are collected and returned stacked.
    """

    def __init__(self,
                 hidden_dim,
                 decoder_layer,
                 num_layers,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.norm = nn.LayerNorm(
            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                query_pos_head,
                valid_ratios=None,
                attn_mask=None,
                memory_mask=None):
        """Decode ``tgt`` against ``memory``.

        Args:
            tgt: initial query features.
            ref_points_unact: initial reference boxes before sigmoid.
            memory, memory_spatial_shapes, memory_level_start_index,
            attn_mask, memory_mask: forwarded to every decoder layer.
            bbox_head: indexable collection; ``bbox_head[i]`` predicts the
                box delta after layer ``i``.
            query_pos_head: maps the sine embedding of the reference boxes
                to the query positional embedding.
            valid_ratios: per-level valid ratios; defaults to all ones with
                shape [memory.shape[0], num_levels, 2].

        Returns:
            (stacked normalised layer outputs, stacked refined boxes)
        """
        if valid_ratios is None:
            n_batch = memory.shape[0]
            n_levels = memory_spatial_shapes.shape[0]
            valid_ratios = paddle.ones([n_batch, n_levels, 2])

        hidden = tgt
        layer_outputs = []
        layer_bboxes = []
        boxes = F.sigmoid(ref_points_unact)
        for idx, layer in enumerate(self.layers):
            # Gradients are not propagated through the reference boxes.
            frozen_boxes = boxes.detach()
            # Scale boxes per level by the valid ratios (tiled so that all
            # four box coordinates are covered).
            ratios = valid_ratios.tile([1, 1, 2]).unsqueeze(1)
            ref_input = frozen_boxes.unsqueeze(2) * ratios

            # Positional embedding is derived from the first level's boxes.
            pos = get_sine_pos_embed(ref_input[..., 0, :],
                                     self.hidden_dim // 2)
            pos = query_pos_head(pos)

            hidden = layer(hidden, ref_input, memory, memory_spatial_shapes,
                           memory_level_start_index, attn_mask, memory_mask,
                           pos)

            # Refine: add the predicted delta in logit space.
            delta = bbox_head[idx](hidden)
            boxes = F.sigmoid(delta + inverse_sigmoid(boxes.detach()))

            layer_outputs.append(self.norm(hidden))
            layer_bboxes.append(boxes)

        return paddle.stack(layer_outputs), paddle.stack(layer_bboxes)
@register
class DINOTransformer(nn.Layer):
    """DINO transformer: multi-scale deformable encoder, query selection from
    encoder outputs, contrastive denoising queries during training, and a
    decoder with per-layer class/box heads."""
    __shared__ = ['num_classes', 'hidden_dim']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=900,
                 position_embed_type='sine',
                 in_feats_channel=[512, 1024, 2048],
                 num_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 lr_mult=1.0,
                 pe_temperature=10000,
                 pe_offset=-0.5,
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eps=1e-2):
        super().__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers

        # Norm parameters are created with zero L2 decay.
        weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        # backbone feature projection
        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)

        # Transformer module
        enc_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr)
        self.encoder = DeformableTransformerEncoder(enc_layer,
                                                    num_encoder_layers)
        dec_layer = DINOTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr)
        self.decoder = DINOTransformerDecoder(hidden_dim, dec_layer,
                                              num_decoder_layers, weight_attr,
                                              bias_attr)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # position embedding
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=(position_embed_type == 'sine'),
            embed_type=position_embed_type,
            offset=pe_offset)
        self.level_embed = nn.Embedding(num_levels, hidden_dim)
        # decoder embedding (only created when queries are learned)
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(2 * hidden_dim,
                                  hidden_dim,
                                  hidden_dim,
                                  num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
        # decoder head: one classifier and one box regressor per layer
        score_heads = [
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ]
        self.dec_score_head = nn.LayerList(score_heads)
        bbox_heads = [
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ]
        self.dec_bbox_head = nn.LayerList(bbox_heads)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise heads, embeddings and input projections."""
        # class and bbox head init: classifier bias from a 0.01 prior,
        # last box-regression layer set by constant_ with its default value.
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        enc_last = self.enc_bbox_head.layers[-1]
        constant_(enc_last.weight)
        constant_(enc_last.bias)
        for score_head, bbox_head in zip(self.dec_score_head,
                                         self.dec_bbox_head):
            linear_init_(score_head)
            constant_(score_head.bias, bias_cls)
            last = bbox_head.layers[-1]
            constant_(last.weight)
            constant_(last.bias)

        enc_linear = self.enc_output[0]
        linear_init_(enc_linear)
        xavier_uniform_(enc_linear.weight)
        normal_(self.level_embed.weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for proj in self.input_proj:
            conv = proj[0]
            xavier_uniform_(conv.weight)
            constant_(conv.bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Collect the channel count of every entry in ``input_shape``."""
        channels = [shape.channels for shape in input_shape]
        return {'in_feats_channel': channels, }

    def _build_input_proj_layer(self,
                                in_feats_channel,
                                weight_attr=None,
                                bias_attr=None):
        """Create ``self.input_proj``: a 1x1 conv + GroupNorm per backbone
        level, followed by stride-2 3x3 conv + GroupNorm blocks for each
        additional level up to ``self.num_levels``."""

        def _group_norm():
            return nn.GroupNorm(
                32,
                self.hidden_dim,
                weight_attr=weight_attr,
                bias_attr=bias_attr)

        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            conv = nn.Conv2D(in_channels, self.hidden_dim, kernel_size=1)
            self.input_proj.append(
                nn.Sequential(('conv', conv), ('norm', _group_norm())))

        # The first extra level reads the last backbone map; later ones read
        # the previous extra level, which already has hidden_dim channels.
        in_channels = in_feats_channel[-1]
        num_extra = self.num_levels - len(in_feats_channel)
        for _ in range(num_extra):
            conv = nn.Conv2D(
                in_channels,
                self.hidden_dim,
                kernel_size=3,
                stride=2,
                padding=1)
            self.input_proj.append(
                nn.Sequential(('conv', conv), ('norm', _group_norm())))
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats, pad_mask=None):
        """Project the feature maps and flatten them into encoder inputs.

        Returns:
            (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
             lvl_pos_embed_flatten, valid_ratios); ``mask_flatten`` is None
            when ``pad_mask`` is None.
        """
        # get projection features
        proj_feats = [
            self.input_proj[lvl](feat) for lvl, feat in enumerate(feats)
        ]
        num_backbone = len(proj_feats)
        for lvl in range(num_backbone, self.num_levels):
            # First extra level comes from the raw last backbone map,
            # subsequent ones from the previously generated level.
            if lvl == num_backbone:
                source = feats[-1]
            else:
                source = proj_feats[-1]
            proj_feats.append(self.input_proj[lvl](source))

        # get encoder inputs
        has_mask = pad_mask is not None
        feat_list = []
        mask_list = []
        pos_list = []
        shape_list = []
        ratio_list = []
        for lvl, feat in enumerate(proj_feats):
            bs, _, h, w = paddle.shape(feat)
            shape_list.append(paddle.stack([h, w]))
            # [b,c,h,w] -> [b,h*w,c]
            feat_list.append(feat.flatten(2).transpose([0, 2, 1]))
            if has_mask:
                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            ratio_list.append(get_valid_ratio(mask))
            # [b, h*w, c]
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            pos_list.append(pos_embed + self.level_embed.weight[lvl])
            if has_mask:
                # [b, h*w]
                mask_list.append(mask.flatten(1))

        # [b, l, c]
        feat_flatten = paddle.concat(feat_list, 1)
        # [b, l]
        mask_flatten = paddle.concat(mask_list, 1) if has_mask else None
        # [b, l, c]
        lvl_pos_embed_flatten = paddle.concat(pos_list, 1)
        # [num_levels, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(shape_list).astype('int64'))
        # [l] start index of each level
        level_sizes = spatial_shapes.prod(1)
        level_start_index = paddle.concat(
            [paddle.zeros(
                [1], dtype='int64'), level_sizes.cumsum(0)[:-1]])
        # [b, num_levels, 2]
        valid_ratios = paddle.stack(ratio_list, 1)
        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
                lvl_pos_embed_flatten, valid_ratios)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Run encoder, query selection and decoder.

        Returns:
            (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
             dn_meta); ``dn_meta`` is None outside training.
        """
        # input projection and embedding
        encoder_inputs = self._get_encoder_input(feats, pad_mask)
        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
         lvl_pos_embed_flatten, valid_ratios) = encoder_inputs

        # encoder
        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten,
                              valid_ratios)

        # prepare denoising training
        denoising_class = denoising_bbox_unact = attn_mask = dn_meta = None
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(
                    gt_meta,
                    self.num_classes,
                    self.num_queries,
                    self.denoising_class_embed.weight,
                    self.num_denoising,
                    self.label_noise_ratio,
                    self.box_noise_scale)

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
                memory, spatial_shapes, mask_flatten, denoising_class,
                denoising_bbox_unact)

        # decoder
        inter_feats, inter_bboxes = self.decoder(
            target, init_ref_points_unact, memory, spatial_shapes,
            level_start_index, self.dec_bbox_head, self.query_pos_head,
            valid_ratios, attn_mask, mask_flatten)

        out_bboxes = []
        out_logits = []
        for lvl in range(self.num_decoder_layers):
            feat = inter_feats[lvl]
            out_logits.append(self.dec_score_head[lvl](feat))
            delta = self.dec_bbox_head[lvl](feat)
            # Layer 0 refines the initial reference; later layers refine the
            # previous layer's boxes (mapped back to logit space).
            if lvl == 0:
                prior = init_ref_points_unact
            else:
                prior = inverse_sigmoid(inter_bboxes[lvl - 1])
            out_bboxes.append(F.sigmoid(delta + prior))
        out_bboxes = paddle.stack(out_bboxes)
        out_logits = paddle.stack(out_logits)

        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _get_encoder_output_anchors(self,
                                    memory,
                                    spatial_shapes,
                                    memory_mask=None,
                                    grid_size=0.05):
        """Build per-position anchors in logit space and the projected
        encoder memory.

        For every level a grid of cell centres (normalised by the valid
        width/height) is combined with a box size of
        ``grid_size * 2**level``. Anchors outside ``(eps, 1 - eps)`` or at
        masked positions are replaced by ``inf`` and the memory there is
        zeroed before ``enc_output`` is applied.
        """
        anchors_per_level = []
        offset = 0
        for lvl, (h, w) in enumerate(spatial_shapes):
            if memory_mask is None:
                valid_H, valid_W = h, w
            else:
                level_mask = memory_mask[:, offset:offset + h * w].reshape(
                    [-1, h, w])
                valid_H = paddle.sum(level_mask[:, :, 0], 1)
                valid_W = paddle.sum(level_mask[:, 0, :], 1)

            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=h), paddle.arange(end=w))
            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)

            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(
                [-1, 1, 1, 2]).astype(grid_xy.dtype)
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            level_anchors = paddle.concat([grid_xy, wh], -1)
            anchors_per_level.append(level_anchors.reshape([-1, h * w, 4]))
            offset += h * w

        output_anchors = paddle.concat(anchors_per_level, 1)
        valid_mask = ((output_anchors > self.eps) *
                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)
        # inverse sigmoid of the anchors
        output_anchors = paddle.log(output_anchors / (1 - output_anchors))
        if memory_mask is not None:
            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0
        output_anchors = paddle.where(valid_mask, output_anchors,
                                      paddle.to_tensor(float("inf")))

        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)
        return output_memory, output_anchors

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           memory_mask=None,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top ``num_queries`` encoder positions and assemble the
        decoder queries.

        Returns:
            (target, reference_points_unact (detached), enc_topk_bboxes,
             enc_topk_logits). Denoising queries/boxes, when given, are
            prepended to ``target`` / the reference points.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        output_memory, output_anchors = self._get_encoder_output_anchors(
            memory, spatial_shapes, memory_mask)
        enc_logits = self.enc_score_head(output_memory)
        enc_boxes_unact = self.enc_bbox_head(output_memory) + output_anchors

        # Rank positions by their best class score.
        _, topk_ind = paddle.topk(
            enc_logits.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
        # unsigmoided.
        reference_points_unact = paddle.gather_nd(enc_boxes_unact, topk_ind)
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        enc_topk_logits = paddle.gather_nd(enc_logits, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind).detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return (target, reference_points_unact.detach(), enc_topk_bboxes,
                enc_topk_logits)
安装 +请在当前路径下进行编译安装 +``` +cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/ +python setup_ms_deformable_attn_op.py install +``` + +编译完成后即可使用,以下为`ms_deformable_attn`的使用示例 +``` +# 引入自定义op +from deformable_detr_ops import ms_deformable_attn + +# 构造fake input tensor +bs, n_heads, c = 2, 8, 8 +query_length, n_levels, n_points = 2, 2, 2 +spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) +level_start_index = paddle.concat((paddle.to_tensor( + [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) +value_length = sum([(H * W).item() for H, W in spatial_shapes]) + +def get_test_tensors(channels): + value = paddle.rand( + [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 + sampling_locations = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points, 2], + dtype=paddle.float32) + attention_weights = paddle.rand( + [bs, query_length, n_heads, n_levels, n_points], + dtype=paddle.float32) + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum( + -2, keepdim=True) + return [value, sampling_locations, attention_weights] + +value, sampling_locations, attention_weights = get_test_tensors(c) + +output = ms_deformable_attn(value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights) +``` + +## 3. 
单元测试 +可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示: +``` +python test_ms_deformable_attn_op.py +``` +运行成功后,打印如下: +``` +*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07 +*tensor1 True check_gradient_numerical(D=30) +*tensor2 True check_gradient_numerical(D=30) +*tensor3 True check_gradient_numerical(D=30) +*tensor1 True check_gradient_numerical(D=32) +*tensor2 True check_gradient_numerical(D=32) +*tensor3 True check_gradient_numerical(D=32) +*tensor1 True check_gradient_numerical(D=64) +*tensor2 True check_gradient_numerical(D=64) +*tensor3 True check_gradient_numerical(D=64) +*tensor1 True check_gradient_numerical(D=71) +*tensor2 True check_gradient_numerical(D=71) +*tensor3 True check_gradient_numerical(D=71) +*tensor1 True check_gradient_numerical(D=128) +*tensor2 True check_gradient_numerical(D=128) +*tensor3 True check_gradient_numerical(D=128) +*tensor1 True check_gradient_numerical(D=1024) +*tensor2 True check_gradient_numerical(D=1024) +*tensor3 True check_gradient_numerical(D=1024) +*tensor1 True check_gradient_numerical(D=1025) +*tensor2 True check_gradient_numerical(D=1025) +*tensor3 True check_gradient_numerical(D=1025) +*tensor1 True check_gradient_numerical(D=2048) +*tensor2 True check_gradient_numerical(D=2048) +*tensor3 True check_gradient_numerical(D=2048) +*tensor1 True check_gradient_numerical(D=3096) +*tensor2 True check_gradient_numerical(D=3096) +*tensor3 True check_gradient_numerical(D=3096) +``` diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc new file mode 100644 index 0000000..d1758ad --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/extension.h" + +#include + +// declare GPU implementation +std::vector +MSDeformableAttnCUDAForward(const paddle::Tensor &value, + const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights); + +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out); + +//// CPU not implemented + +std::vector> +MSDeformableAttnInferShape(std::vector value_shape, + std::vector value_spatial_shapes_shape, + std::vector value_level_start_index_shape, + std::vector sampling_locations_shape, + std::vector attention_weights_shape) { + return {{value_shape[0], sampling_locations_shape[1], + value_shape[2] * value_shape[3]}}; +} + +std::vector +MSDeformableAttnInferDtype(paddle::DataType value_dtype, + paddle::DataType value_spatial_shapes_dtype, + paddle::DataType value_level_start_index_dtype, + paddle::DataType sampling_locations_dtype, + paddle::DataType attention_weights_dtype) { + return {value_dtype}; +} + +PD_BUILD_OP(ms_deformable_attn) + .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations", + "AttentionWeights"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape)) + 
.SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype)); + +PD_BUILD_GRAD_OP(ms_deformable_attn) + .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations", + "AttentionWeights", paddle::Grad("Out")}) + .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"), + paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"), + paddle::Grad("AttentionWeights")}) + .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward)); diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu new file mode 100644 index 0000000..d5a8d16 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu @@ -0,0 +1,1073 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
#include "paddle/extension.h"

// Grid-stride loop: each thread handles indices i, i + total_threads, ...
#define CUDA_KERNEL_LOOP(i, n)                                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                 \
       i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;

// Number of blocks needed to cover N items with num_threads per block
// (ceiling division).
inline int GET_BLOCKS(const int N, const int num_threads) {
  return (N + num_threads - 1) / num_threads;
}

// forward bilinear
// Bilinearly sample channel `c` of head `m` at fractional position (h, w)
// from one feature level. `bottom_data` points at the start of that level
// and is indexed as [row][col][head][channel]. Corner samples that fall
// outside the [0, height) x [0, width) grid contribute zero.
template <typename data_t>
__device__ data_t deformable_attn_bilinear_forward(
    const data_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &channels, const data_t &h, const data_t &w,
    const int &m, const int &c) {
  const int h_low = floor(h);
  const int w_low = floor(w);
  const int h_high = h_low + 1;
  const int w_high = w_low + 1;

  // Fractional offsets inside the cell and their complements.
  const data_t lh = h - h_low;
  const data_t lw = w - w_low;
  const data_t hh = 1 - lh, hw = 1 - lw;

  const int w_stride = nheads * channels;
  const int h_stride = width * w_stride;
  const int h_low_ptr_offset = h_low * h_stride;
  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
  const int w_low_ptr_offset = w_low * w_stride;
  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
  const int base_ptr = m * channels + c;

  data_t v1 = 0;
  if (h_low >= 0 && w_low >= 0) {
    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
    v1 = bottom_data[ptr1];
  }
  data_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1) {
    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
    v2 = bottom_data[ptr2];
  }
  data_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0) {
    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
    v3 = bottom_data[ptr3];
  }
  data_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1) {
    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
    v4 = bottom_data[ptr4];
  }

  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}

// forward kernel
// One loop iteration computes one output element. The flat index is
// decomposed (fastest to slowest) into channel, head, query and batch. The
// result is the attention-weighted sum of bilinear samples over all levels
// and points.
template <typename data_t>
__global__ void deformable_attn_cuda_kernel_forward(
    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,
    const int64_t *data_level_start_index, const data_t *data_sampling_loc,
    const data_t *data_attn_weight, const int batch_size,
    const int value_length, const int num_heads, const int channels,
    const int num_levels, const int query_length, const int num_points,
    data_t *output_data_ptr) {
  CUDA_KERNEL_LOOP(index, n) {
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    // Flat (batch, query, head) index; selects the weights/locations row.
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    const int q_col = _temp % query_length;
    _temp /= query_length;
    const int b_col = _temp;

    data_t *data_ptr = output_data_ptr + index;
    int data_weight_ptr = sampling_index * num_levels * num_points;
    // Locations store two values (w, h) per weight, hence the factor 2.
    int data_loc_w_ptr = data_weight_ptr << 1;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
    data_t col = 0;

    for (int l_col = 0; l_col < num_levels; ++l_col) {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +
                                                   level_start_id * qid_stride);
      for (int p_col = 0; p_col < num_points; ++p_col) {
        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const data_t weight = data_attn_weight[data_weight_ptr];

        // Scale the sampling location by the level size and shift by -0.5.
        const data_t h_im = loc_h * spatial_h - 0.5;
        const data_t w_im = loc_w * spatial_w - 0.5;

        // Skip samples that lie entirely outside the feature map.
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
          col += deformable_attn_bilinear_forward(
                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
                     h_im, w_im, m_col, c_col) *
                 weight;
        }

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
      }
    }
    *data_ptr = col;
  }
}

#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
// forward
// Host entry point of the forward op. All inputs must be GPU tensors.
// Returns a single tensor of shape
// [batch_size, query_length, num_heads * channels].
std::vector<paddle::Tensor>
MSDeformableAttnCUDAForward(const paddle::Tensor &value,
                            const paddle::Tensor &value_spatial_shapes,
                            const paddle::Tensor &value_level_start_index,
                            const paddle::Tensor &sampling_locations,
                            const paddle::Tensor &attention_weights) {

  CHECK_INPUT_GPU(value);
  CHECK_INPUT_GPU(value_spatial_shapes);
  CHECK_INPUT_GPU(value_level_start_index);
  CHECK_INPUT_GPU(sampling_locations);
  CHECK_INPUT_GPU(attention_weights);

  const int batch_size = value.shape()[0];
  const int value_length = value.shape()[1];
  const int num_heads = value.shape()[2];
  const int channels = value.shape()[3];

  const int num_levels = value_spatial_shapes.shape()[0];
  const int query_length = sampling_locations.shape()[1];
  const int num_points = sampling_locations.shape()[4];

  auto output = paddle::full({batch_size, query_length, num_heads * channels},
                             0, value.dtype(), paddle::GPUPlace());

  // One kernel work item per output element.
  const int num_kernels = batch_size * query_length * num_heads * channels;
  // NOTE(review): the template arguments and launch configuration were lost
  // in this copy of the file and are reconstructed here. The int64_t
  // accessors follow from the kernel signature; the float instantiation and
  // launching on value.stream() are assumptions -- confirm against upstream.
  deformable_attn_cuda_kernel_forward<float>
      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
         value.stream()>>>(num_kernels, value.data<float>(),
                           value_spatial_shapes.data<int64_t>(),
                           value_level_start_index.data<int64_t>(),
                           sampling_locations.data<float>(),
                           attention_weights.data<float>(), batch_size,
                           value_length, num_heads, channels, num_levels,
                           query_length, num_points, output.data<float>());
  return {output};
}
lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void deformable_attn_bilinear_backward_gm( + const data_t *&bottom_data, const int 
&height, const int &width, + const int &nheads, const int &channels, const data_t &h, const data_t &w, + const int &m, const int &c, const data_t &top_grad, + const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const data_t lh = h - h_low; + const data_t lw = w - w_low; + const data_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const data_t top_grad_value = top_grad * attn_weight; + data_t grad_h_weight = 0, grad_w_weight = 0; + + data_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + data_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + data_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + data_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * 
v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + +// backward kernels +// channels > 1024 +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = 
data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + 
atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_gm( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + 
level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// channels <= 1024 +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = 
sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockSize; ++tid) { + _grad_w += 
cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void +deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + __shared__ data_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ data_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = 
data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template 
+__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for 
(int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + data_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2( + const int n, const data_t *grad_col, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + 
CUDA_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + data_t *cache_grad_sampling_loc = (data_t *)_s; + data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % query_length; + _temp /= query_length; + const int b_col = _temp; + + const data_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_points; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * value_length * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const data_t *data_value_ptr = data_value + value_ptr_offset; + data_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_points; ++p_col) { + const data_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const data_t weight = data_attn_weight[data_weight_ptr]; + + const data_t h_im = loc_h * spatial_h - 0.5; + const data_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im 
< spatial_h && w_im < spatial_w) { + deformable_attn_bilinear_backward( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +// backward branch +template +void deformable_attn_cuda_backward( + cudaStream_t stream, const data_t *grad_out, const data_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const data_t *data_sampling_loc, const data_t *data_attn_weight, + const int batch_size, const int value_length, const int num_heads, + const int channels, const int num_levels, const int query_length, + const int num_points, data_t *grad_value, data_t *grad_sampling_loc, + data_t *grad_attn_weight) { + const int num_threads = + (channels > CUDA_NUM_THREADS) ? 
CUDA_NUM_THREADS : channels; + const int num_kernels = batch_size * query_length * num_heads * channels; + const int num_actual_kernels = + batch_size * query_length * num_heads * channels; + if (channels > 1024) { + if ((channels & 1023) == 0) { + deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_gm + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + } + } else { + switch (channels) { + case 1: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 2: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 4: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + 
break; + case 8: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 16: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 32: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 64: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 128: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 256: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, 
batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 512: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + case 1024: + deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, + grad_value, grad_sampling_loc, grad_attn_weight); + break; + default: + if (channels < 64) { + deformable_attn_cuda_kernel_backward_shm_reduce_v1 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + deformable_attn_cuda_kernel_backward_shm_reduce_v2 + <<>>( + num_kernels, grad_out, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, value_length, num_heads, channels, num_levels, + query_length, num_points, grad_value, grad_sampling_loc, + grad_attn_weight); + } + } + } +} + +// backward +std::vector MSDeformableAttnCUDABackward( + const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes, + const paddle::Tensor &value_level_start_index, + const paddle::Tensor &sampling_locations, + const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) { + + CHECK_INPUT_GPU(value); + CHECK_INPUT_GPU(value_spatial_shapes); + CHECK_INPUT_GPU(value_level_start_index); + 
CHECK_INPUT_GPU(sampling_locations); + CHECK_INPUT_GPU(attention_weights); + CHECK_INPUT_GPU(grad_out); + + const int batch_size = value.shape()[0]; + const int value_length = value.shape()[1]; + const int num_heads = value.shape()[2]; + const int channels = value.shape()[3]; + + const int num_levels = value_spatial_shapes.shape()[0]; + const int query_length = sampling_locations.shape()[1]; + const int num_points = sampling_locations.shape()[4]; + + auto grad_value = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_spatial_shapes = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_level_start_index = + paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace()); + auto grad_sampling_locations = + paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(), + paddle::GPUPlace()); + auto grad_attention_weights = + paddle::full(attention_weights.shape(), 0, attention_weights.dtype(), + paddle::GPUPlace()); + + deformable_attn_cuda_backward( + value.stream(), grad_out.data(), value.data(), + value_spatial_shapes.data(), + value_level_start_index.data(), sampling_locations.data(), + attention_weights.data(), batch_size, value_length, num_heads, + channels, num_levels, query_length, num_points, grad_value.data(), + grad_sampling_locations.data(), + grad_attention_weights.data()); + + return {grad_value, grad_spatial_shapes, grad_level_start_index, + grad_sampling_locations, grad_attention_weights}; +} diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py new file mode 100644 index 0000000..7c3c386 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py @@ -0,0 +1,7 @@ +from paddle.utils.cpp_extension import CUDAExtension, setup + +if __name__ == "__main__": + setup( + name='deformable_detr_ops', + ext_modules=CUDAExtension( + 
sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu'])) diff --git a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py new file mode 100644 index 0000000..94a0573 --- /dev/null +++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def get_test_tensors(channels):
    """Create the random inputs shared by the forward and gradient checks.

    Sizes come from the module-level test configuration (``bs``,
    ``value_length``, ``n_heads``, ``query_length``, ``n_levels``,
    ``n_points``).

    Args:
        channels (int): size of the last (per-head channel) axis of ``value``.

    Returns:
        list: ``[value, sampling_locations, attention_weights]`` where
            - value is ``[bs, value_length, n_heads, channels]``, uniform
              samples scaled by 0.01,
            - sampling_locations is
              ``[bs, query_length, n_heads, n_levels, n_points, 2]``,
            - attention_weights is
              ``[bs, query_length, n_heads, n_levels, n_points]`` and is
              normalised so that the sum over the last two axes is 1.
    """
    per_point_shape = [bs, query_length, n_heads, n_levels, n_points]

    # The three paddle.rand calls stay in this order so that a fixed seed
    # yields the same tensors.
    value = paddle.rand(
        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(
        per_point_shape + [2], dtype=paddle.float32)

    # The small offset keeps every weight strictly positive before the
    # normalisation below.
    attention_weights = paddle.rand(
        per_point_shape, dtype=paddle.float32) + 1e-5
    weight_total = attention_weights.sum(
        -1, keepdim=True).sum(
            -2, keepdim=True)
    attention_weights /= weight_total

    return [value, sampling_locations, attention_weights]
def check_gradient_numerical(channels=4):
    """Compare gradients of the custom CUDA op with the Paddle reference.

    Both implementations are run on identical inputs, ``sum()`` of each output
    is back-propagated, and the gradients w.r.t. value, sampling locations and
    attention weights are compared with ``paddle.allclose`` (rtol=1e-2,
    atol=1e-3). One result line is printed per input tensor.

    Args:
        channels (int): per-head channel count passed to ``get_test_tensors``.
    """
    reference_inputs = get_test_tensors(channels)
    for tensor in reference_inputs:
        tensor.stop_gradient = False

    # Independent leaf copies so each implementation accumulates its own grads.
    cuda_inputs = [tensor.detach().clone() for tensor in reference_inputs]
    for tensor in cuda_inputs:
        tensor.stop_gradient = False

    value_paddle, sampling_locations_paddle, attention_weights_paddle = \
        reference_inputs
    value_cuda, sampling_locations_cuda, attention_weights_cuda = cuda_inputs

    ms_deform_attn_core_paddle(
        value_paddle, spatial_shapes, level_start_index,
        sampling_locations_paddle, attention_weights_paddle).sum().backward()

    ms_deformable_attn(value_cuda, spatial_shapes, level_start_index,
                       sampling_locations_cuda,
                       attention_weights_cuda).sum().backward()

    def grads_match(expected, actual):
        # True when the two tensors' gradients agree within tolerance.
        return paddle.allclose(
            expected.grad, actual.grad, rtol=1e-2, atol=1e-3).item()

    res = grads_match(value_paddle, value_cuda)
    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')

    res = grads_match(sampling_locations_paddle, sampling_locations_cuda)
    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')

    res = grads_match(attention_weights_paddle, attention_weights_cuda)
    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
class CSPRepLayer(nn.Layer):
    """Two-branch block: a RepVgg stack in parallel with a 1x1 shortcut.

    ``forward`` computes ``conv3(bottlenecks(conv1(x)) + conv2(x))``.

    Args:
        in_channels (int): channels of the input feature map.
        out_channels (int): channels of the output feature map.
        num_blocks (int): number of ``RepVggBlock`` layers in the main branch.
        expansion (float): hidden width is ``int(out_channels * expansion)``.
        bias (bool): forwarded as ``bias`` to every ``BaseConv``.
        act: activation forwarded to every ``BaseConv`` and ``RepVggBlock``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=3,
                 expansion=1.0,
                 bias=False,
                 act="silu"):
        super(CSPRepLayer, self).__init__()
        width = int(out_channels * expansion)
        # Settings shared by every 1x1 convolution in this block.
        pointwise = dict(ksize=1, stride=1, bias=bias, act=act)

        # Layers are built in a fixed order (conv1, conv2, bottlenecks, conv3)
        # so parameter registration and initialisation order stay stable.
        self.conv1 = BaseConv(in_channels, width, **pointwise)
        self.conv2 = BaseConv(in_channels, width, **pointwise)
        rep_blocks = [
            RepVggBlock(
                width, width, act=act) for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*rep_blocks)
        # The output projection is only needed when the hidden width differs
        # from the requested output width.
        self.conv3 = (BaseConv(width, out_channels, **pointwise)
                      if width != out_channels else nn.Identity())

    def forward(self, x):
        main_branch = self.bottlenecks(self.conv1(x))
        shortcut = self.conv2(x)
        return self.conv3(main_branch + shortcut)
@register
@serializable
class HybridEncoder(nn.Layer):
    """Multi-scale encoder: per-level projection, transformer on selected
    levels, then a top-down (FPN) and a bottom-up (PAN) fusion path.

    Args:
        in_channels (list[int]): channels of each input feature level.
        feat_strides (list[int]): stride of each input feature level.
        hidden_dim (int): channel width used for every level after projection.
        use_encoder_idx (list[int]): indices of the levels that are passed
            through a transformer encoder.
        num_encoder_layers (int): layer count given to each
            ``TransformerEncoder``; ``0`` skips the transformer stage in
            ``forward``.
        encoder_layer: layer given to ``TransformerEncoder`` (injected, see
            ``__inject__``).
        pe_temperature (float): temperature of the sin-cos position embedding.
        expansion (float): ``expansion`` forwarded to each ``CSPRepLayer``.
        depth_mult (float): each ``CSPRepLayer`` gets ``round(3 * depth_mult)``
            blocks.
        act: activation; ``None``, ``str`` or ``dict`` values are resolved
            with ``get_act_fn``, anything else is used as given.
        trt (bool): forwarded to ``get_act_fn``.
        eval_size: if set, position embeddings are pre-computed for it at
            construction time. Indexed as ``eval_size[0]`` (used as height)
            and ``eval_size[1]`` (used as width).
    """
    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
    __inject__ = ['encoder_layer']

    def __init__(self,
                 in_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 hidden_dim=256,
                 use_encoder_idx=[2],
                 num_encoder_layers=1,
                 encoder_layer='TransformerLayer',
                 pe_temperature=10000,
                 expansion=1.0,
                 depth_mult=1.0,
                 act='silu',
                 trt=False,
                 eval_size=None):
        super(HybridEncoder, self).__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.eval_size = eval_size

        # channel projection: one bias-free 1x1 conv + BatchNorm per input
        # level, mapping every level to hidden_dim channels
        self.input_proj = nn.LayerList()
        for in_channel in in_channels:
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channel, hidden_dim, kernel_size=1, bias_attr=False),
                    nn.BatchNorm2D(
                        hidden_dim,
                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                        bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))
        # encoder transformer: one TransformerEncoder per selected level
        self.encoder = nn.LayerList([
            TransformerEncoder(encoder_layer, num_encoder_layers)
            for _ in range(len(use_encoder_idx))
        ])

        # resolve the activation spec once so all convs below share it
        act = get_act_fn(
            act, trt=trt) if act is None or isinstance(act,
                                                       (str, dict)) else act
        # top-down fpn: len(in_channels) - 1 lateral convs and fusion blocks;
        # each fusion block takes two concatenated hidden_dim maps
        self.lateral_convs = nn.LayerList()
        self.fpn_blocks = nn.LayerList()
        for idx in range(len(in_channels) - 1, 0, -1):
            self.lateral_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 1, 1, act=act))
            self.fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    round(3 * depth_mult),
                    act=act,
                    expansion=expansion))

        # bottom-up pan: stride-2 3x3 convs followed by fusion blocks
        self.downsample_convs = nn.LayerList()
        self.pan_blocks = nn.LayerList()
        for idx in range(len(in_channels) - 1):
            self.downsample_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 3, stride=2, act=act))
            self.pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    round(3 * depth_mult),
                    act=act,
                    expansion=expansion))

        self._reset_parameters()

    def _reset_parameters(self):
        """Pre-compute ``pos_embed{idx}`` for each encoder level when
        ``eval_size`` is set; otherwise do nothing."""
        if self.eval_size:
            for idx in self.use_encoder_idx:
                stride = self.feat_strides[idx]
                # arguments are (w, h): eval_size[1] is used as the width
                pos_embed = self.build_2d_sincos_position_embedding(
                    self.eval_size[1] // stride, self.eval_size[0] // stride,
                    self.hidden_dim, self.pe_temperature)
                setattr(self, f'pos_embed{idx}', pos_embed)

    @staticmethod
    def build_2d_sincos_position_embedding(w,
                                           h,
                                           embed_dim=256,
                                           temperature=10000.):
        """Build a fixed 2D sin-cos position embedding.

        Args:
            w: grid width (converted with ``int``).
            h: grid height (converted with ``int``).
            embed_dim (int): embedding size; must be divisible by 4.
            temperature (float): base of the frequency schedule.

        Returns:
            Tensor of shape ``[1, w * h, embed_dim]`` laid out along the last
            axis as ``[sin(w), cos(w), sin(h), cos(h)]``.
        """
        grid_w = paddle.arange(int(w), dtype=paddle.float32)
        grid_h = paddle.arange(int(h), dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
        assert embed_dim % 4 == 0, \
            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        # a quarter of the channels for each of sin/cos of each axis
        pos_dim = embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**omega)

        # outer product of flattened coordinates and frequencies
        out_w = grid_w.flatten()[..., None] @omega[None]
        out_h = grid_h.flatten()[..., None] @omega[None]

        return paddle.concat(
            [
                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
                paddle.cos(out_h)
            ],
            axis=1)[None, :, :]

    def forward(self, feats, for_mot=False):
        """Encode and fuse multi-level features.

        Args:
            feats (list[Tensor]): one ``[B, C, H, W]`` map per entry of
                ``in_channels``, in the same order.
            for_mot (bool): not used in this method.

        Returns:
            list[Tensor]: ``len(in_channels)`` fused feature maps, in the same
            level order as ``feats``.
        """
        assert len(feats) == len(self.in_channels)
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        # encoder
        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                h, w = proj_feats[enc_ind].shape[2:]
                # flatten [B, C, H, W] to [B, HxW, C]
                src_flatten = proj_feats[enc_ind].flatten(2).transpose(
                    [0, 2, 1])
                # build the embedding for the actual input size unless a
                # cached one for eval_size is to be used (inference only)
                if self.training or self.eval_size is None:
                    pos_embed = self.build_2d_sincos_position_embedding(
                        w, h, self.hidden_dim, self.pe_temperature)
                else:
                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
                # restore [B, HxW, C] back to [B, C, H, W]
                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(
                    [-1, self.hidden_dim, h, w])

        # top-down fpn: start from the last level and prepend each fused
        # result, so inner_outs ends up in the same order as feats
        inner_outs = [proj_feats[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_heigh = inner_outs[0]
            feat_low = proj_feats[idx - 1]
            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
                feat_heigh)
            # the lateral-conv output replaces the stored map for this level
            inner_outs[0] = feat_heigh

            upsample_feat = F.interpolate(
                feat_heigh, scale_factor=2., mode="nearest")
            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                paddle.concat(
                    [upsample_feat, feat_low], axis=1))
            inner_outs.insert(0, inner_out)

        # bottom-up pan: downsample the previous output and fuse it with the
        # next top-down map
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_height = inner_outs[idx + 1]
            downsample_feat = self.downsample_convs[idx](feat_low)
            out = self.pan_blocks[idx](paddle.concat(
                [downsample_feat, feat_height], axis=1))
            outs.append(out)

        return outs

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``feat_strides`` from ``input_shape``
        (items exposing ``channels`` and ``stride``)."""
        return {
            'in_channels': [i.channels for i in input_shape],
            'feat_strides': [i.stride for i in input_shape]
        }

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per level: ``hidden_dim`` channels at that
        level's input stride."""
        return [
            ShapeSpec(
                channels=self.hidden_dim, stride=self.feat_strides[idx])
            for idx in range(len(self.in_channels))
        ]
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Bipartite matcher between predicted queries and ground-truth objects.

    The matching cost is a weighted sum of a classification cost, an L1 box
    cost and a GIoU cost; when ``with_mask`` is set, a point-sampled binary
    cross-entropy mask cost and a dice cost are added. The assignment is
    solved per image with ``scipy.optimize.linear_sum_assignment``.
    """
    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']

    def __init__(self,
                 matcher_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 1,
                     'dice': 1
                 },
                 use_focal_loss=False,
                 with_mask=False,
                 num_sample_points=12544,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
                Keys 'class', 'bbox' and 'giou' are always read; 'mask' and
                'dice' are read only when `with_mask` is True.
            use_focal_loss (bool): Use sigmoid probabilities and a focal-style
                classification cost instead of softmax probabilities.
            with_mask (bool): Add the mask and dice costs.
            num_sample_points (int): Number of random points sampled from the
                masks for the mask/dice costs.
            alpha (float): Focal cost balancing factor.
            gamma (float): Focal cost focusing exponent.
        """
        super(HungarianMatcher, self).__init__()
        # Copy so that instances never share (or mutate) the default dict
        # object that lives on the function signature.
        self.matcher_coeff = dict(matcher_coeff)
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.num_sample_points = num_sample_points
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor|None): [b, query, h, w]
            gt_mask (List(Tensor)): list[[n, H, W]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)

        Raises:
            AssertionError: if `with_mask` is True and `masks` or `gt_mask`
                is None.
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = [len(a) for a in gt_class]
        # No ground truth in the whole batch: nothing to match.
        if sum(num_gts) == 0:
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        logits = logits.detach()
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.detach().flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost: keep, for every query, only the
        # probability of each target's class.
        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
        if self.use_focal_loss:
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class - neg_cost_class
        else:
            cost_class = -out_prob

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost between boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + \
            self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        # Compute the mask cost and dice cost
        if self.with_mask:
            # NOTE: this must not be written as `assert (cond, msg)`; that
            # asserts a non-empty tuple, which is always true.
            assert masks is not None and gt_mask is not None, \
                'Make sure the input has `mask` and `gt_mask`'
            # all masks share the same set of points for efficient matching
            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
            # map from [0, 1) to [-1, 1), the coordinate range of grid_sample
            sample_points = 2.0 * sample_points - 1.0

            out_mask = F.grid_sample(
                masks.detach(), sample_points, align_corners=False).squeeze(-2)
            out_mask = out_mask.flatten(0, 1)

            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
            # repeat each image's points once per ground-truth object so they
            # line up with the concatenated target masks
            sample_points = paddle.concat([
                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
                if b > 0
            ])
            tgt_mask = F.grid_sample(
                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])

            with paddle.amp.auto_cast(enable=False):
                # binary cross entropy cost
                pos_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.ones_like(out_mask), reduction='none')
                neg_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.zeros_like(out_mask), reduction='none')
                cost_mask = paddle.matmul(
                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
                cost_mask /= self.num_sample_points

                # dice cost
                out_mask = F.sigmoid(out_mask)
                numerator = 2 * paddle.matmul(
                    out_mask, tgt_mask, transpose_y=True)
                denominator = out_mask.sum(
                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
                cost_dice = 1 - (numerator + 1) / (denominator + 1)

                C = C + self.matcher_coeff['mask'] * cost_mask + \
                    self.matcher_coeff['dice'] * cost_dice

        # Split the batched cost matrix per image, then per image keep only
        # the columns that belong to that image's own targets.
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
@register
@serializable
class PositionEmbedding(nn.Layer):
    """2D position embedding, either fixed sinusoidal or learned.

    Args:
        num_pos_feats (int): features per axis; the output has
            ``2 * num_pos_feats`` channels.
        temperature (float): base of the sinusoid frequency schedule
            ('sine' only).
        normalize (bool): rescale cumulative positions to ``[0, scale]``
            ('sine' only).
        scale (float): range used when ``normalize`` is True ('sine' only).
        embed_type (str): ``'sine'`` or ``'learned'``.
        num_embeddings (int): table size of the row/column embeddings
            ('learned' only).
        offset (float): added to positions before normalisation.
        eps (float): added to the normalisation denominator.
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=2 * math.pi,
                 embed_type='sine',
                 num_embeddings=50,
                 offset=0.,
                 eps=1e-6):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        self.offset = offset
        self.eps = eps

        if self.embed_type == 'learned':
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
            return
        if self.embed_type != 'sine':
            # Only reachable when assertions are disabled.
            raise ValueError(f"{self.embed_type} is not supported.")

        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = scale

    def _sine_encoding(self, mask):
        # Positions are the running sums of the mask along each axis.
        rows = mask.cumsum(1)
        cols = mask.cumsum(2)
        if self.normalize:
            rows = (rows + self.offset) / (
                rows[:, -1:, :] + self.eps) * self.scale
            cols = (cols + self.offset) / (
                cols[:, :, -1:] + self.eps) * self.scale

        # Consecutive channel pairs share one frequency.
        freq = 2 * (paddle.arange(self.num_pos_feats) //
                    2).astype('float32')
        freq = self.temperature**(freq / self.num_pos_feats)

        def interleave_sin_cos(angles):
            # sin on even channels, cos on odd channels, re-interleaved.
            return paddle.stack(
                (angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()),
                axis=4).flatten(3)

        encoded_x = interleave_sin_cos(cols.unsqueeze(-1) / freq)
        encoded_y = interleave_sin_cos(rows.unsqueeze(-1) / freq)
        return paddle.concat((encoded_y, encoded_x), axis=3)

    def _learned_encoding(self, mask):
        # Only the spatial size of the mask is used here.
        h, w = mask.shape[-2:]
        col_features = self.col_embed(paddle.arange(w))
        row_features = self.row_embed(paddle.arange(h))
        grid = [
            col_features.unsqueeze(0).tile([h, 1, 1]),
            row_features.unsqueeze(1).tile([1, w, 1]),
        ]
        return paddle.concat(grid, axis=-1).unsqueeze(0)

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W]
        Returns:
            pos (Tensor): [B, H, W, C]
        """
        if self.embed_type == 'sine':
            return self._sine_encoding(mask)
        if self.embed_type == 'learned':
            return self._learned_encoding(mask)
        raise ValueError(f"not supported {self.embed_type}")
bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + + value = self.value_proj(value) + if value_mask is not None: + value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) + + sampling_offsets = self.sampling_offsets(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + attention_weights = self.attention_weights(query).reshape( + [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + attention_weights = F.softmax(attention_weights).reshape( + [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + + if reference_points.shape[-1] == 2: + offset_normalizer = paddle.to_tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape( + [1, 1, 1, self.num_levels, 1, 2]) + sampling_locations = reference_points.reshape([ + bs, Len_q, 1, self.num_levels, 1, 2 + ]) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * + 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
class TransformerDecoderLayer(nn.Layer):
    """Decoder layer: self-attention, deformable cross-attention, then FFN.

    Each stage is added residually to its input and followed by its own
    LayerNorm.

    Args:
        d_model (int): feature width of queries and memory.
        n_head (int): number of attention heads.
        dim_feedforward (int): hidden width of the FFN.
        dropout (float): dropout rate used by every dropout in the layer.
        activation (str): name of a function in ``paddle.nn.functional``.
        n_levels (int): feature levels passed to the cross-attention.
        n_points (int): sampling points passed to the cross-attention.
        weight_attr: weight attribute for the two FFN linears.
        bias_attr: bias attribute for the two FFN linears.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 weight_attr=None,
                 bias_attr=None):
        super(TransformerDecoderLayer, self).__init__()

        def make_norm():
            # LayerNorm whose scale and shift use an L2 decay coefficient of 0.
            return nn.LayerNorm(
                d_model,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        # Stage 1: self attention.
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = make_norm()

        # Stage 2: multi-scale deformable cross attention.
        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
                                                  n_points, 1.0)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = make_norm()

        # Stage 3: feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
                                 bias_attr)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
                                 bias_attr)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = make_norm()

        self._reset_parameters()

    def _reset_parameters(self):
        ffn_linears = (self.linear1, self.linear2)
        # Two separate passes keep the order of the initialiser calls (both
        # linear_init_ first, then both xavier_uniform_) fixed.
        for linear in ffn_linears:
            linear_init_(linear)
        for linear in ffn_linears:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        # --- self attention ---
        # Queries and keys carry the positional embedding; values do not.
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            # Turn the boolean-like mask into an additive one: 0 where the
            # mask is true, -inf elsewhere.
            keep = attn_mask.astype('bool')
            attn_mask = paddle.where(
                keep,
                paddle.zeros(attn_mask.shape, tgt.dtype),
                paddle.full(attn_mask.shape, float("-inf"), tgt.dtype))
        attended = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(attended))

        # --- cross attention ---
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        attended = self.cross_attn(cross_query, reference_points, memory,
                                   memory_spatial_shapes,
                                   memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attended))

        # --- ffn ---
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
query_pos_head(ref_points_detach) + + output = layer(output, ref_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( + ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append( + F.sigmoid(bbox_head[i](output) + inverse_sigmoid( + ref_points))) + elif i == self.eval_idx: + dec_out_logits.append(score_head[i](output)) + dec_out_bboxes.append(inter_ref_bbox) + break + + ref_points = inter_ref_bbox + ref_points_detach = inter_ref_bbox.detach( + ) if self.training else inter_ref_bbox + + return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) + + +@register +class RTDETRTransformer(nn.Layer): + __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] + + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=300, + position_embed_type='sine', + backbone_feat_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + num_levels=3, + num_decoder_points=4, + nhead=8, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=True, + eval_size=None, + eval_idx=-1, + eps=1e-2): + super(RTDETRTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' 
+ assert len(backbone_feat_channels) <= num_levels + assert len(feat_strides) == len(backbone_feat_channels) + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_decoder_layers = num_decoder_layers + self.eval_size = eval_size + + # backbone feature projection + self._build_input_proj_layer(backbone_feat_channels) + + # Transformer module + decoder_layer = TransformerDecoderLayer( + hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, + num_decoder_points) + self.decoder = TransformerDecoder(hidden_dim, decoder_layer, + num_decoder_layers, eval_idx) + + # denoising part + self.denoising_class_embed = nn.Embedding( + num_classes, + hidden_dim, + weight_attr=ParamAttr(initializer=nn.initializer.Normal())) + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) + + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm( + hidden_dim, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + + # decoder head + self.dec_score_head = nn.LayerList([ + nn.Linear(hidden_dim, num_classes) + for _ in range(num_decoder_layers) + ]) + self.dec_bbox_head = nn.LayerList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for _ in range(num_decoder_layers) + ]) + + self._reset_parameters() + + def 
def _build_input_proj_layer(self, backbone_feat_channels):
    """Create ``self.input_proj`` with one conv + batch-norm stage per level.

    The first ``len(backbone_feat_channels)`` stages are 1x1 convolutions
    taking the listed channel counts. Any further stages, up to
    ``self.num_levels``, are 3x3 stride-2 convolutions: the first reads
    ``backbone_feat_channels[-1]`` channels, later ones ``self.hidden_dim``.
    """
    width = self.hidden_dim

    def conv_bn(channels_in, **conv_kwargs):
        # Bias-free convolution followed by BatchNorm; both BatchNorm
        # parameters carry an L2Decay(0.0) regularizer.
        conv = nn.Conv2D(channels_in, width, bias_attr=False, **conv_kwargs)
        norm = nn.BatchNorm2D(
            width,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        return nn.Sequential(('conv', conv), ('norm', norm))

    self.input_proj = nn.LayerList()
    for channels_in in backbone_feat_channels:
        self.input_proj.append(conv_bn(channels_in, kernel_size=1))

    channels_in = backbone_feat_channels[-1]
    extra_levels = self.num_levels - len(backbone_feat_channels)
    for _ in range(extra_levels):
        self.input_proj.append(
            conv_bn(channels_in, kernel_size=3, stride=2, padding=1))
        channels_in = width
def forward(self, feats, pad_mask=None, gt_meta=None):
    """Run projection, query selection and the decoder.

    Args:
        feats: Backbone feature maps, passed to ``_get_encoder_input``.
        pad_mask: Accepted but not read by this method.
        gt_meta: Ground-truth dict used to build denoising queries; only
            read while ``self.training`` is true.

    Returns:
        Tuple ``(out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
        dn_meta)``; ``dn_meta`` is ``None`` outside training.
    """
    # input projection and flattening
    memory, spatial_shapes, level_start_index = \
        self._get_encoder_input(feats)

    # denoising queries exist only in training mode
    dn_class = dn_bbox_unact = attn_mask = dn_meta = None
    if self.training:
        dn_class, dn_bbox_unact, attn_mask, dn_meta = \
            get_contrastive_denoising_training_group(
                gt_meta,
                self.num_classes,
                self.num_queries,
                self.denoising_class_embed.weight,
                self.num_denoising,
                self.label_noise_ratio,
                self.box_noise_scale)

    decoder_inputs = self._get_decoder_input(
        memory, spatial_shapes, dn_class, dn_bbox_unact)
    target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
        decoder_inputs

    # decoder
    out_bboxes, out_logits = self.decoder(
        target,
        init_ref_points_unact,
        memory,
        spatial_shapes,
        level_start_index,
        self.dec_bbox_head,
        self.dec_score_head,
        self.query_pos_head,
        attn_mask=attn_mask)
    return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
            dn_meta)
def _get_decoder_input(self,
                       memory,
                       spatial_shapes,
                       denoising_class=None,
                       denoising_bbox_unact=None):
    """Pick the top-scoring encoder positions and build the decoder inputs.

    Args:
        memory: Flattened encoder features; unpacked as three axes with the
            batch size first.
        spatial_shapes: Per-level shapes, used to generate anchors when
            training or when ``self.eval_size`` is None.
        denoising_class: Optional denoising query embeddings, prepended to
            the query features along axis 1.
        denoising_bbox_unact: Optional denoising boxes (pre-sigmoid),
            prepended to the reference points along axis 1.

    Returns:
        ``(target, reference_points_unact, enc_topk_bboxes,
        enc_topk_logits)``.
    """
    bs, _, _ = memory.shape
    # prepare input for decoder
    # Cached anchors are reused only in eval mode with a fixed eval_size.
    if self.training or self.eval_size is None:
        anchors, valid_mask = self._generate_anchors(spatial_shapes)
    else:
        anchors, valid_mask = self.anchors, self.valid_mask
    # Zero the features at positions whose anchor is flagged invalid.
    memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
    output_memory = self.enc_output(memory)

    enc_outputs_class = self.enc_score_head(output_memory)
    # Box head output is an offset added to the (pre-sigmoid) anchors.
    enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

    # Rank positions by their best class score and keep num_queries of them.
    _, topk_ind = paddle.topk(
        enc_outputs_class.max(-1), self.num_queries, axis=1)
    # extract region proposal boxes
    # Pair each selected index with its batch index for gather_nd.
    batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
    batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
    topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

    reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                              topk_ind)  # unsigmoided.
    # Computed before the denoising boxes are prepended and before the
    # detach below, so it covers only the selected proposals.
    enc_topk_bboxes = F.sigmoid(reference_points_unact)
    if denoising_bbox_unact is not None:
        reference_points_unact = paddle.concat(
            [denoising_bbox_unact, reference_points_unact], 1)
    if self.training:
        reference_points_unact = reference_points_unact.detach()
    enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

    # extract region features
    if self.learnt_init_query:
        # Same learned query embedding repeated for every batch item.
        target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
    else:
        target = paddle.gather_nd(output_memory, topk_ind)
        if self.training:
            target = target.detach()
    if denoising_class is not None:
        target = paddle.concat([denoising_class, target], 1)

    return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
def bbox_overlaps(boxes1, boxes2):
    """
    Compute intersection-over-union for every pair of boxes.

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]

    Return:
        overlaps (Tensor): [M, N] tensor; entries with no positive
            intersection are 0. If either input has no boxes, a float32
            zero tensor of shape [M, N] is returned.
    """
    num1, num2 = boxes1.shape[0], boxes2.shape[0]
    if num1 * num2 == 0:
        return paddle.zeros([num1, num2], dtype='float32')

    # [M, 1, 4] so that it broadcasts against boxes2's [N, 4]
    expanded1 = paddle.unsqueeze(boxes1, 1)
    lower = paddle.maximum(expanded1[:, :, :2], boxes2[:, :2])
    upper = paddle.minimum(expanded1[:, :, 2:], boxes2[:, 2:])
    inter = (upper - lower).clip(min=0).prod(axis=2)

    union = paddle.unsqueeze(bbox_area(boxes1), 1) + bbox_area(boxes2) - inter
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
def inverse_sigmoid(x, eps=1e-5):
    """Return ``log(x / (1 - x))`` with numerically guarded inputs.

    ``x`` is first limited to [0, 1]; numerator and denominator are then
    each floored at ``eps`` before the division.
    """
    bounded = x.clip(min=0., max=1.)
    numerator = bounded.clip(min=eps)
    denominator = (1 - bounded).clip(min=eps)
    return paddle.log(numerator / denominator)
def get_valid_ratio(mask):
    """Return per-sample (width, height) ratios of a 3-axis mask.

    The height ratio sums the first column of ``mask`` and divides by the
    size of axis 1; the width ratio sums the first row and divides by the
    size of axis 2. The result stacks ``[ratio_w, ratio_h]`` on a new last
    axis, i.e. shape [b, 2].
    """
    _, height, width = paddle.shape(mask)
    ratio_w = paddle.sum(mask[:, 0, :], 1) / width
    ratio_h = paddle.sum(mask[:, :, 0], 1) / height
    return paddle.stack([ratio_w, ratio_h], -1)
num_classes, dtype=input_query_class.dtype) + input_query_class.scatter_(chosen_idx, new_label) + input_query_class.reshape_([bs, num_denoising]) + pad_gt_mask.reshape_([bs, num_denoising]) + + if box_noise_scale > 0: + diff = paddle.concat( + [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], + axis=-1) * box_noise_scale + diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0) + input_query_bbox += diff + input_query_bbox = inverse_sigmoid(input_query_bbox) + + class_embed = paddle.concat( + [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) + input_query_class = paddle.gather( + class_embed, input_query_class.flatten(), + axis=0).reshape([bs, num_denoising, -1]) + + tgt_size = num_denoising + num_queries + attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): + num_denoising] = True + if i == num_group - 1: + attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * + i] = True + else: + attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): + num_denoising] = True + attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * + i] = True + attn_mask = ~attn_mask + dn_meta = { + "dn_positive_idx": dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries] + } + + return input_query_class, input_query_bbox, attn_mask, dn_meta + + +def get_contrastive_denoising_training_group(targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0): + if num_denoising <= 0: + return None, None, None, None + num_gts = [len(t) for t in targets["gt_class"]] + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group 
== 0 else num_group + # pad gt to max_num of a batch + bs = len(targets["gt_class"]) + input_query_class = paddle.full( + [bs, max_gt_num], num_classes, dtype='int32') + input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) + pad_gt_mask = paddle.zeros([bs, max_gt_num]) + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) + input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. + input_query_class = input_query_class.tile([1, 2 * num_group]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) + # positive and negative mask + negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] + dn_positive_idx = paddle.split(dn_positive_idx, + [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 2 * num_group) + + if label_noise_ratio > 0: + input_query_class = input_query_class.flatten() + pad_gt_mask = pad_gt_mask.flatten() + + # Convert pad_gt_mask to bool if it's not already + pad_gt_mask = pad_gt_mask.astype('bool') + + # half of bbox prob + mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) + chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) + + # randomly put a new one here + new_label = paddle.randint_like( + chosen_idx, 0, num_classes, dtype=input_query_class.dtype) + + input_query_class.scatter_(chosen_idx, new_label) + input_query_class.reshape_([bs, num_denoising]) + pad_gt_mask.reshape_([bs, num_denoising]) + + if box_noise_scale > 0: + known_bbox = 
def get_sine_pos_embed(pos_tensor,
                       num_pos_feats=128,
                       temperature=10000,
                       exchange_xy=True):
    """Generate a sine/cosine position embedding from a position tensor.

    Args:
        pos_tensor (Tensor): Positions; the last axis holds ``n``
            coordinates. NOTE(review): the original docstring gave the shape
            as `(None, n)`, but the code indexes three axes
            (``x[:, :, 0::2]``) and concatenates on axis 2, so a 3-axis
            input appears to be expected. Confirm against callers.
        num_pos_feats (int): Number of embedding features produced for each
            coordinate. Default: 128.
        temperature (int): The temperature used for scaling
            the position embedding. Default: 10000.
        exchange_xy (bool, optional): Swap the embeddings of the first two
            coordinates. For example, if the input is `[x, y]`, the result
            is `[pos(y), pos(x)]`. Default: True.

    Returns:
        Tensor: Position embedding whose last axis has size
        `n * num_pos_feats`.
    """
    scale = 2. * math.pi
    # Pairs of consecutive feature indices share one exponent: 0,0,2,2,4,4,...
    dim_t = 2. * paddle.floor_divide(
        paddle.arange(num_pos_feats), paddle.to_tensor(2))
    dim_t = scale / temperature**(dim_t / num_pos_feats)

    def sine_func(x):
        # x is one coordinate slice produced by the split below.
        x *= dim_t
        # sin on even feature indices, cos on odd ones, interleaved by the
        # stack + flatten.
        return paddle.stack(
            (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2)

    # Split the last axis into its individual coordinates.
    pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)]
    if exchange_xy:
        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
    pos_res = paddle.concat(pos_res, axis=2)
    return pos_res
class MLP(nn.Layer):
    """Stack of linear layers with ReLU between them (none after the last).

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths: input, (num_layers - 1) hidden widths, output.
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.LayerList(
            [nn.Linear(fan_in, fan_out)
             for fan_in, fan_out in zip(widths[:-1], widths[1:])])

        self._reset_parameters()

    def _reset_parameters(self):
        """Apply ``linear_init_`` to every linear layer."""
        for fc in self.layers:
            linear_init_(fc)

    def forward(self, x):
        """Apply the layers in order; ReLU follows all but the last one."""
        last_index = self.num_layers - 1
        for index, fc in enumerate(self.layers):
            x = fc(x)
            if index < last_index:
                x = F.relu(x)
        return x
b/rtdetr_paddle/ppdet/optimizer/__init__.py new file mode 100644 index 0000000..aa690dc --- /dev/null +++ b/rtdetr_paddle/ppdet/optimizer/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer +from . import ema + +from .optimizer import * +from .ema import * diff --git a/rtdetr_paddle/ppdet/optimizer/ema.py b/rtdetr_paddle/ppdet/optimizer/ema.py new file mode 100644 index 0000000..70d006b --- /dev/null +++ b/rtdetr_paddle/ppdet/optimizer/ema.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ModelEMA(object):
    """
    Exponential moving average (EMA) of a model's state dict.

    Args:
        model (nn.Layer): Detector of model.
        decay (float): The decay used for updating ema parameter.
            Ema's parameter are updated with the formula:
           `ema_param = decay * ema_param + (1 - decay) * cur_param`.
            Defaults is 0.9998.
        ema_decay_type (str): type in ['threshold', 'normal', 'exponential'],
            'threshold' as default. Any value other than 'threshold' or
            'exponential' uses the constant ``decay``.
        cycle_epoch (int): The epoch of interval to reset ema_param and
            step. Defaults is -1, which means not reset. Its function is to
            add a regular effect to ema, which is set according to experience
            and is effective when the total training epoch is large.
        ema_black_list (set|list|tuple, optional): The custom EMA black_list.
            Blacklist of weight names that will not participate in EMA
            calculation. Default: None.
        ema_filter_no_grad (bool): If True, parameters with
            ``stop_gradient`` set that are not batch-norm running states are
            added to the black list as well. Default: False.
    """

    def __init__(self,
                 model,
                 decay=0.9998,
                 ema_decay_type='threshold',
                 cycle_epoch=-1,
                 ema_black_list=None,
                 ema_filter_no_grad=False):
        self.step = 0
        self.epoch = 0
        self.decay = decay
        self.ema_decay_type = ema_decay_type
        self.cycle_epoch = cycle_epoch
        # Decay used by the most recent update(); None until one has run.
        self._decay = None
        self.ema_black_list = self._match_ema_black_list(
            model.state_dict().keys(), ema_black_list)
        bn_states_names = get_bn_running_state_names(model)
        if ema_filter_no_grad:
            for n, p in model.named_parameters():
                if p.stop_gradient and n not in bn_states_names:
                    self.ema_black_list.add(n)

        # Black-listed entries keep the model's own tensor; the others
        # start from zero.
        self.state_dict = dict()
        for k, v in model.state_dict().items():
            if k in self.ema_black_list:
                self.state_dict[k] = v
            else:
                self.state_dict[k] = paddle.zeros_like(v)

        # Weak references let update() be called without passing the model.
        self._model_state = {
            k: weakref.ref(p)
            for k, p in model.state_dict().items()
        }

    def _effective_decay(self, step):
        """Return the decay that ``update()`` uses at the given step count."""
        if self.ema_decay_type == 'threshold':
            return min(self.decay, (1 + step) / (10 + step))
        if self.ema_decay_type == 'exponential':
            return self.decay * (1 - math.exp(-(step + 1) / 2000))
        return self.decay

    def reset(self):
        """Zero the step/epoch counters and every non-black-listed average."""
        self.step = 0
        self.epoch = 0
        for k, v in self.state_dict.items():
            if k in self.ema_black_list:
                self.state_dict[k] = v
            else:
                self.state_dict[k] = paddle.zeros_like(v)

    def resume(self, state_dict, step=0):
        """Load saved averages for known keys (cast to the stored dtype)."""
        for k, v in state_dict.items():
            if k in self.state_dict:
                if self.state_dict[k].dtype == v.dtype:
                    self.state_dict[k] = v
                else:
                    self.state_dict[k] = v.astype(self.state_dict[k].dtype)
        self.step = step

    def update(self, model=None):
        """Blend the current weights into the averages and advance ``step``.

        Args:
            model: Source of the current weights. If None, the weakly
                referenced tensors captured at construction are used.
        """
        decay = self._effective_decay(self.step)
        self._decay = decay

        if model is not None:
            model_dict = model.state_dict()
        else:
            model_dict = {k: p() for k, p in self._model_state.items()}
            # A dead weak reference means a tensor was garbage-collected.
            assert all(
                [v is not None for _, v in model_dict.items()]), 'python gc.'

        for k, v in self.state_dict.items():
            if k not in self.ema_black_list:
                v = decay * v + (1 - decay) * model_dict[k]
                v.stop_gradient = True
                self.state_dict[k] = v
        self.step += 1

    def apply(self):
        """Return the averaged state dict and count one epoch.

        Unless the decay type is 'exponential', non-black-listed entries are
        divided by ``1 - decay**step``. When ``step`` is 0 the stored dict
        is returned unchanged. After ``cycle_epoch`` calls (if positive) the
        averages are reset.
        """
        if self.step == 0:
            return self.state_dict

        # update() normally records the decay it used. After resume() with
        # step > 0 and no update() yet, none is recorded, so fall back to the
        # decay of the step that preceded the resumed counter instead of
        # raising AttributeError.
        last_decay = getattr(self, '_decay', None)
        if last_decay is None:
            last_decay = self._effective_decay(self.step - 1)

        state_dict = dict()
        for k, v in self.state_dict.items():
            if k in self.ema_black_list:
                v.stop_gradient = True
                state_dict[k] = v
            else:
                if self.ema_decay_type != 'exponential':
                    v = v / (1 - last_decay**self.step)
                v.stop_gradient = True
                state_dict[k] = v
        self.epoch += 1
        if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch:
            self.reset()

        return state_dict

    def _match_ema_black_list(self, weight_name, ema_black_list=None):
        """Return the weight names that contain any black-list key."""
        out_list = set()
        if ema_black_list:
            for name in weight_name:
                if any(key in name for key in ema_black_list):
                    out_list.add(name)
        return out_list
def update(self, model, decay=None):
    """Blend ``model``'s floating-point state into the EMA copy in place.

    Args:
        model: Model whose ``state_dict()`` supplies the current values.
        decay: Decay for this call; ``self.decay`` is used when None.

    Floating-point entries become ``decay * ema + (1 - decay) * current``;
    other entries are written back unchanged.
    """
    rate = self.decay if decay is None else decay

    with paddle.no_grad():
        current = model.state_dict()
        blended = {}
        for name, ema_value in self.model.state_dict().items():
            if paddle.is_floating_point(ema_value):
                ema_value *= rate
                ema_value += (1.0 - rate) * current[name].detach()
            blended[name] = ema_value
        self.model.set_state_dict(blended)
@serializable
class CosineDecay(object):
    """
    Cosine learning rate decay

    Args:
        max_epochs (int): max epochs for the training process.
            If cosine decay is combined with warmup, the total number of
            iterations should be much larger than the warmup iterations.
        use_warmup (bool): whether to use warmup. Default: True.
        min_lr_ratio (float): minimum learning rate ratio. Default: 0.
        last_plateau_epochs (int): use minimum learning rate in
            the last few epochs. Default: 0.
    """

    def __init__(self,
                 max_epochs=1000,
                 use_warmup=True,
                 min_lr_ratio=0.,
                 last_plateau_epochs=0):
        self.max_epochs = max_epochs
        self.use_warmup = use_warmup
        self.min_lr_ratio = min_lr_ratio
        self.last_plateau_epochs = last_plateau_epochs

    def __call__(self,
                 base_lr=None,
                 boundary=None,
                 value=None,
                 step_per_epoch=None):
        """
        Build the learning rate scheduler.

        Args:
            base_lr (float): peak learning rate.
            boundary (list|None): warmup step boundaries, if warmup is used.
            value (list|None): warmup learning rates matching `boundary`.
            step_per_epoch (int): number of iterations in one epoch.

        Returns:
            A `paddle.optimizer.lr` scheduler: `PiecewiseDecay` when warmup
            or a final plateau is requested, otherwise `CosineAnnealingDecay`.
        """
        assert base_lr is not None, "either base LR or values should be provided"

        steps = int(step_per_epoch)
        max_iters = self.max_epochs * steps
        last_plateau_iters = self.last_plateau_epochs * steps
        plateau_start = max_iters - last_plateau_iters
        min_lr = base_lr * self.min_lr_ratio

        def cosine_lr(progress, span):
            # Half-cosine from base_lr (progress == 0) to min_lr (progress == span).
            return min_lr + (base_lr - min_lr) * 0.5 * (
                math.cos(progress * math.pi / span) + 1)

        if boundary is not None and value is not None and self.use_warmup:
            # use warmup; work on copies so the caller's lists are untouched
            boundary = list(boundary)
            value = list(value)
            warmup_iters = len(boundary)
            span = plateau_start - warmup_iters
            for i in range(int(boundary[-1]), max_iters):
                boundary.append(i)
                if i < plateau_start:
                    value.append(cosine_lr(i - warmup_iters, span))
                else:
                    value.append(min_lr)
            return optimizer.lr.PiecewiseDecay(boundary, value)
        elif last_plateau_iters > 0:
            # not use warmup, but set `last_plateau_epochs` > 0
            boundary = []
            value = []
            for i in range(max_iters):
                if i < plateau_start:
                    value.append(cosine_lr(i, plateau_start))
                else:
                    value.append(min_lr)
                if i > 0:
                    boundary.append(i)
            return optimizer.lr.PiecewiseDecay(boundary, value)

        return optimizer.lr.CosineAnnealingDecay(
            base_lr, T_max=max_iters, eta_min=min_lr)


@serializable
class PiecewiseDecay(object):
    """
    Multi step learning rate decay

    Args:
        gamma (float | list): decay factor(s). A scalar is expanded to one
            factor per milestone, each 10x smaller than the previous one.
        milestones (list): epochs at which to decay learning rate
        values (list|None): explicit learning rates; when given they are used
            instead of `gamma` and must hold len(milestones) + 1 entries.
        use_warmup (bool): whether to use warmup. Default: True.
    """

    def __init__(self,
                 gamma=[0.1, 0.01],
                 milestones=[8, 11],
                 values=None,
                 use_warmup=True):
        super(PiecewiseDecay, self).__init__()
        # Copy the sequences so instances never alias the shared default lists.
        milestones = list(milestones)
        if isinstance(gamma, (list, tuple)):
            self.gamma = list(gamma)
        else:
            self.gamma = [gamma / 10**i for i in range(len(milestones))]
        self.milestones = milestones
        self.values = values
        self.use_warmup = use_warmup

    def __call__(self,
                 base_lr=None,
                 boundary=None,
                 value=None,
                 step_per_epoch=None):
        """
        Build a `paddle.optimizer.lr.PiecewiseDecay` scheduler.

        Args:
            base_lr (float): base learning rate.
            boundary (list|None): warmup step boundaries, if warmup is used.
            value (list|None): warmup learning rates matching `boundary`.
            step_per_epoch (int): number of iterations in one epoch.
        """
        milestone_steps = [int(step_per_epoch) * i for i in self.milestones]
        if boundary is not None and self.use_warmup:
            # Append to a copy so the caller's warmup list is left untouched.
            boundary = list(boundary) + milestone_steps
        else:
            # do not use LinearWarmup
            boundary = milestone_steps
            value = [base_lr]  # during step[0, boundary[0]] is base_lr

        # self.values is set directly in config
        if self.values is not None:
            assert len(self.milestones) + 1 == len(self.values)
            return optimizer.lr.PiecewiseDecay(boundary, self.values)

        # value is computed by self.gamma
        value = list(value) if value is not None else [base_lr]
        value.extend(base_lr * factor for factor in self.gamma)

        return optimizer.lr.PiecewiseDecay(boundary, value)
@serializable
class LinearWarmup(object):
    """
    Warm up learning rate linearly

    Args:
        steps (int): warm up steps
        start_factor (float): initial learning rate factor
        epochs (int|None): use epochs as warm up steps, the priority
            of `epochs` is higher than `steps`. Default: None.
    """

    def __init__(self, steps=500, start_factor=1. / 3, epochs=None):
        super(LinearWarmup, self).__init__()
        self.steps = steps
        self.start_factor = start_factor
        self.epochs = epochs

    def __call__(self, base_lr, step_per_epoch):
        """
        Return `(boundary, value)` lists describing the warmup phase.

        `value` holds warmup_steps + 1 learning rates rising linearly from
        `base_lr * start_factor` to `base_lr`; `boundary` holds the step
        indices 1..warmup_steps.
        """
        warmup_steps = self.epochs * step_per_epoch \
            if self.epochs is not None else self.steps
        # range() needs an int; epochs * step_per_epoch may be a float.
        warmup_steps = max(int(warmup_steps), 1)

        boundary = []
        value = []
        for i in range(warmup_steps + 1):
            alpha = i / warmup_steps
            factor = self.start_factor * (1 - alpha) + alpha
            value.append(base_lr * factor)
            if i > 0:
                boundary.append(i)
        return boundary, value


@serializable
class ExpWarmup(object):
    """
    Warm up learning rate in exponential mode
    Args:
        steps (int): warm up steps.
        epochs (int|None): use epochs as warm up steps, the priority
            of `epochs` is higher than `steps`. Default: None.
        power (int): Exponential coefficient. Default: 2.
    """

    def __init__(self, steps=1000, epochs=None, power=2):
        super(ExpWarmup, self).__init__()
        self.steps = steps
        self.epochs = epochs
        self.power = power

    def __call__(self, base_lr, step_per_epoch):
        """
        Return `(boundary, value)` lists describing the warmup phase.

        The learning rate at step i is `base_lr * (i / warmup_steps) ** power`.
        """
        warmup_steps = self.epochs * step_per_epoch \
            if self.epochs is not None else self.steps
        # range() needs an int; epochs * step_per_epoch may be a float.
        warmup_steps = max(int(warmup_steps), 1)

        boundary = []
        value = []
        for i in range(warmup_steps + 1):
            factor = (i / float(warmup_steps))**self.power
            value.append(base_lr * factor)
            if i > 0:
                boundary.append(i)
        return boundary, value


@register
class LearningRate(object):
    """
    Learning Rate configuration

    Args:
        base_lr (float): base learning rate
        schedulers (list): learning rate schedulers. The first entry is the
            decay scheduler and the second the warmup scheduler. Entries may
            be scheduler instances or dicts with a `name` key naming a
            scheduler class of this module plus its constructor arguments.
    """
    __category__ = 'optim'

    def __init__(self,
                 base_lr=0.01,
                 schedulers=[PiecewiseDecay(), LinearWarmup()]):
        super(LearningRate, self).__init__()
        self.base_lr = base_lr
        self.schedulers = []

        # Deep copy so popping `name` never alters the caller's config
        # (or the shared default list).
        for sched in copy.deepcopy(schedulers):
            if isinstance(sched, dict):
                # support dict sched instantiate
                module = sys.modules[__name__]
                sched_name = sched.pop("name")
                sched = getattr(module, sched_name)(**sched)
            self.schedulers.append(sched)

    def __call__(self, step_per_epoch):
        """Build the final scheduler for `step_per_epoch` iterations per epoch."""
        assert len(self.schedulers) >= 1
        decay = self.schedulers[0]
        if not decay.use_warmup:
            return decay(base_lr=self.base_lr, step_per_epoch=step_per_epoch)

        # TODO: split warmup & decay
        # warmup (a second scheduler is required when use_warmup is on)
        boundary, value = self.schedulers[1](self.base_lr, step_per_epoch)
        # decay
        return decay(self.base_lr, boundary, value, step_per_epoch)
@register
class OptimizerBuilder():
    """
    Build optimizer handles
    Args:
        clip_grad_by_norm (float|None): global-norm gradient clipping
            threshold. Takes priority over `clip_grad_by_value`.
        clip_grad_by_value (float|None): clip gradients to [-v, v].
        regularizer (dict|None): `type` (e.g. 'L2') and `factor` of the
            weight regularizer; a falsy value or 'None' disables it.
        optimizer (dict): `type` names a class in `paddle.optimizer`; the
            remaining keys are forwarded to it. An optional `param_groups`
            list assigns per-group options to parameters matched by name.
    """
    __category__ = 'optim'

    def __init__(self,
                 clip_grad_by_norm=None,
                 clip_grad_by_value=None,
                 regularizer={'type': 'L2',
                              'factor': .0001},
                 optimizer={'type': 'Momentum',
                            'momentum': .9}):
        self.clip_grad_by_norm = clip_grad_by_norm
        self.clip_grad_by_value = clip_grad_by_value
        self.regularizer = regularizer
        self.optimizer = optimizer

    def __call__(self, learning_rate, model=None):
        """
        Instantiate the configured optimizer for `model`.

        Args:
            learning_rate: float or lr scheduler passed to the optimizer.
            model: the model whose trainable parameters are optimized.

        Returns:
            The constructed `paddle.optimizer` instance.
        """
        if model is None:
            raise ValueError(
                'OptimizerBuilder requires a model to collect parameters from')

        if self.clip_grad_by_norm is not None:
            grad_clip = nn.ClipGradByGlobalNorm(
                clip_norm=self.clip_grad_by_norm)
        elif self.clip_grad_by_value is not None:
            var = abs(self.clip_grad_by_value)
            grad_clip = nn.ClipGradByValue(min=-var, max=var)
        else:
            grad_clip = None

        if self.regularizer and self.regularizer != 'None':
            reg_type = self.regularizer['type'] + 'Decay'
            reg_factor = self.regularizer['factor']
            regularization = getattr(regularizer, reg_type)(reg_factor)
        else:
            regularization = None

        # Work on a copy: keys are popped below and self.optimizer may be
        # the shared default dict.
        optim_args = self.optimizer.copy()
        optim_type = optim_args.pop('type')

        if optim_type != 'AdamW':
            optim_args['weight_decay'] = regularization

        op = getattr(optimizer, optim_type)

        if 'param_groups' in optim_args:
            param_groups = optim_args.pop('param_groups')
            assert isinstance(param_groups, list), \
                '`param_groups` must be a list of dicts'
            params = self._build_param_groups(model, param_groups)
        else:
            params = [
                param for param in model.parameters()
                if param.trainable is True
            ]

        return op(learning_rate=learning_rate,
                  parameters=params,
                  grad_clip=grad_clip,
                  **optim_args)

    @staticmethod
    def _build_param_groups(model, param_groups):
        """Resolve name patterns in `param_groups` to parameter groups."""
        params = []
        visited = set()  # names already assigned to an explicit group
        for group in param_groups:
            assert isinstance(group, dict) and 'params' in group \
                and isinstance(group['params'], list), \
                'each param group must be a dict with a list under `params`'
            matched = {
                n: p
                for n, p in model.named_parameters()
                if any(k in n for k in group['params']) and p.trainable is True
            }
            _group = group.copy()
            _group.update({'params': list(matched.values())})

            params.append(_group)
            visited.update(matched)

        ext_params = [
            p for n, p in model.named_parameters()
            if n not in visited and p.trainable is True
        ]

        total = len(model.parameters())
        # NOTE(review): the remainder group is added only when it is strictly
        # smaller than the full parameter list, as in the original code.
        if len(ext_params) < total:
            params.append({'params': ext_params})
        elif len(ext_params) > total:
            raise RuntimeError(
                'found more ungrouped parameters ({}) than the model has ({})'
                .format(len(ext_params), total))
        return params


def get_bn_running_state_names(model: nn.Layer) -> List[str]:
    """Get all bn state full names including running mean and variance

    Args:
        model (nn.Layer): model whose sublayers are scanned.

    Returns:
        List[str]: '<sublayer name>._mean' and '<sublayer name>._variance'
            for every `BatchNorm2D` / `SyncBatchNorm` sublayer, in traversal
            order.
    """
    names = []
    for layer_name, layer in model.named_sublayers():
        if not isinstance(layer, (nn.BatchNorm2D, nn.SyncBatchNorm)):
            continue
        assert hasattr(layer, '_mean'), f'assert {layer} has _mean'
        assert hasattr(layer, '_variance'), f'assert {layer} has _variance'
        names.extend([f'{layer_name}._mean', f'{layer_name}._variance'])

    return names
def get_test_images(infer_dir, infer_img):
    """
    Collect the image paths to run inference on.

    Args:
        infer_dir (str|None): directory that is searched (non-recursively)
            for jpg/jpeg/png/bmp files, in lower or upper case extension.
        infer_img (str|None): a single image file; wins over `infer_dir`.

    Returns:
        list[str]: `[infer_img]` when it is given, otherwise every image
            found directly inside `infer_dir`.
    """
    assert infer_img is not None or infer_dir is not None, \
        "--infer_img or --infer_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # a single image takes priority over the directory
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)

    lower_exts = ['jpg', 'jpeg', 'png', 'bmp']
    found = set()  # a set, so a file matched by two patterns is kept once
    for suffix in lower_exts + [e.upper() for e in lower_exts]:
        pattern = '{}/*.{}'.format(infer_dir, suffix)
        found.update(glob.glob(pattern))
    images = list(found)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    logger.info("Found {} inference images in total.".format(len(images)))

    return images


def compute_ious(boxes1, boxes2):
    """Compute the pairwise IoU matrix of two box sets.

    Args:
        boxes1 (numpy ndarray, shape (N, 4)): boxes as (xmin, ymin, xmax, ymax)
        boxes2 (numpy ndarray, shape (M, 4)): boxes as (xmin, ymin, xmax, ymax)
    Returns:
        numpy ndarray of shape (N, M); entry [i, j] is the IoU between
        boxes1[i] and boxes2[j], clipped to [0, 1].
    """

    def box_areas(boxes):
        # negative extents (degenerate boxes) count as zero area
        extent = np.maximum(0.0, boxes[:, 2:] - boxes[:, :2])
        return extent[:, 0] * extent[:, 1]

    # broadcast (N, 1, 2) against (M, 2) to get every pair: shape (N, M, 2)
    top_left = np.maximum(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])
    overlap_wh = np.maximum(0.0, bottom_right - top_left)
    overlap_area = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]  # (N, M)

    area1 = box_areas(boxes1)  # (N,)
    area2 = box_areas(boxes2)  # (M,)
    # lower bound keeps the division finite for zero-area pairs
    union_area = np.maximum(area1[:, None] + area2 - overlap_area, 1e-8)

    return np.clip(overlap_area / union_area, 0.0, 1.0)


def grad_cam(feat, grad):
    """
    Weight each feature channel by its mean gradient and reduce to a map.

    Args:
        feat: CxHxW
        grad: CxHxW

    Returns:
        cam: HxW, the negated channel-averaged weighted feature map with
            negative values clamped to zero
    """
    channel_weights = grad.mean((1, 2), keepdims=True)  # Cx1x1
    weighted = feat * channel_weights
    cam = weighted.mean(axis=0)
    return np.maximum(-cam, 0)
def resize_cam(explanation, resize_shape) -> np.ndarray:
    """
    Turn a 2D activation map into a resized RGB heat-map image.

    Args:
        explanation: 2D array holding the raw cam values
        resize_shape: (width, height) passed to `cv2.resize`

    Returns:
        uint8 RGB image (JET color map) with the requested size
    """
    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
        f"Currently support 2D explanation results for visualization. " \
        "Reduce higher dimensions to 2D for visualization."

    low = explanation.min()
    value_range = explanation.max() - low
    if value_range > 0:
        explanation = (explanation - low) / value_range
    else:
        # A constant map has no contrast; normalising it would divide by
        # zero and feed NaN into the uint8 conversion below.
        explanation = np.zeros(explanation.shape, dtype=np.float32)

    explanation = cv2.resize(explanation, resize_shape)
    explanation = np.uint8(255 * explanation)
    explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET)
    explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB)

    return explanation


class BBoxCAM:
    """
    Grad-CAM visualisation for the boxes predicted by a detector.

    Builds a test-mode `Trainer`, hooks the layer named by
    `cfg.target_feature_layer_name`, and writes one heat-map overlay per
    predicted box into `FLAGS.cam_out`.
    """

    def __init__(self, FLAGS, cfg):
        self.FLAGS = FLAGS
        self.cfg = cfg
        # build model
        self.trainer = self.build_trainer(cfg)
        # num_class
        self.num_class = cfg.num_classes
        # set hook for extraction of featuremaps and grads
        self.set_hook(cfg)
        # In these networks, the bbox array shape before nms contains
        # num_class, so the nms_keep_idx of a bbox has to be divided by
        # num_class.
        self.nms_idx_need_divid_numclass_arch = [
            'FasterRCNN', 'MaskRCNN', 'CascadeRCNN'
        ]

        # cam image output_dir; an existing directory is fine, any other
        # failure (e.g. permissions) is reported instead of being hidden
        os.makedirs(FLAGS.cam_out, exist_ok=True)

    def build_trainer(self, cfg):
        """Create the test-mode trainer and make its NMS return kept indices."""
        # build trainer
        trainer = Trainer(cfg, mode='test')
        # load weights
        trainer.load_weights(cfg.weights)

        # set for get extra_data before nms
        trainer.model.use_extra_data = True
        # set for record the bbox index before nms
        arch = cfg.architecture
        if arch in ['FasterRCNN', 'MaskRCNN']:
            trainer.model.bbox_post_process.nms.return_index = True
        elif arch in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']:
            if trainer.model.post_process is not None:
                # anchor based YOLOs: YOLOv3,PP-YOLO
                trainer.model.post_process.nms.return_index = True
            else:
                # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
                trainer.model.yolo_head.nms.return_index = True
        elif arch == 'BlazeFace' or arch == 'SSD':
            trainer.model.post_process.nms.return_index = True
        elif arch == 'RetinaNet':
            trainer.model.head.nms.return_index = True
        else:
            print(
                cfg.architecture+' is not supported for cam temporarily!'
            )
            sys.exit()
        # Todo: Unify the head/post_process name in each model

        return trainer

    def set_hook(self, cfg):
        """Register a forward hook that stores the target layer's output."""
        # set hook for extraction of featuremaps and grads
        self.target_feats = {}
        self.target_layer_name = cfg.target_feature_layer_name
        # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor

        def hook(layer, input, output):
            self.target_feats[layer._layer_name_for_hook] = output

        try:
            # NOTE(review): exec() evaluates the configured layer path as
            # Python code; only use configs from trusted sources.
            exec('self.trainer.'+self.target_layer_name+'._layer_name_for_hook = self.target_layer_name')
            # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name
            exec('self.trainer.'+self.target_layer_name+'.register_forward_post_hook(hook)')
            # self.trainer.target_layer_name.register_forward_post_hook(hook)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still work.
            print("Error! "
                  "The target_layer_name--"+self.target_layer_name+" is not in model! "
                  "Please check the spelling and "
                  "the network's architecture!")
            sys.exit()

    def get_bboxes(self):
        """Run inference and return the prediction of the first image."""
        # get inference images
        images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img)

        # inference
        result = self.trainer.predict(
            images,
            draw_threshold=self.FLAGS.draw_threshold,
            output_dir=self.FLAGS.output_dir,
            save_results=self.FLAGS.save_results,
            visualize=False)[0]
        return result

    def _uses_image_level_feature(self):
        """True when the hooked layer is a backbone/neck level feature."""
        return 'backbone' in self.target_layer_name or \
            'neck' in self.target_layer_name

    def _feature_and_grad(self, bbox_index_before_nms):
        """Return (feature, gradient) numpy arrays for one predicted box."""
        feats = self.target_feats[self.target_layer_name]
        if self._uses_image_level_feature():
            if isinstance(feats, list):
                # when the featuremap contains multiple scales, take the
                # last scale, or the second last one if the last is 1x1
                # Todo: fuse the cam result from multiscale featuremaps
                feats = feats[-2] if feats[-1].shape[-1] == 1 else feats[-1]
            cam_grad = feats.grad.squeeze().cpu().numpy()
            cam_feat = feats.squeeze().cpu().numpy()
        else:
            # roi level feature: one entry per box before nms
            cam_grad = feats.grad.squeeze().cpu().numpy()[bbox_index_before_nms]
            cam_feat = feats.squeeze().cpu().numpy()[bbox_index_before_nms]
        return cam_feat, cam_grad

    def get_bboxes_cams(self):
        """Compute and save a Grad-CAM overlay for every confident box."""
        # Get the bboxes prediction(after nms result) of the input
        inference_result = self.get_bboxes()

        # read input image
        # Todo: Support folder multi-images process
        # NOTE(review): the image is read from cfg.infer_img while inference
        # uses FLAGS.infer_img / FLAGS.infer_dir - confirm they always agree.
        from PIL import Image
        img = np.array(Image.open(self.cfg.infer_img))

        # data for calculating bbox grad_cam
        # Faster_RCNN based architectures:
        #   'scores': [num_of_bboxes_before_nms, num_classes], e.g. [1000, 80]
        #   'nms_keep_idx': [num_of_bboxes_after_nms, 1], e.g. [300, 1]
        # YOLOv3 based architectures:
        #   'scores': [1, num_classes, num_of_yolo_bboxes_before_nms],
        #       e.g. [1, 80, 8400]
        #   'nms_keep_idx': [num_of_yolo_bboxes_after_nms, 1], e.g. [300, 1]
        extra_data = inference_result['extra_data']

        # array index of the predicted bbox before nms
        before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()
        if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch:
            # some network's bbox array shape before nms may be like
            # [num_of_bboxes_before_nms, num_classes, 4], so the index is
            # divided by num_classes (rcnn architectures only)
            before_nms_indexes = before_nms_indexes // self.num_class

        # Calculate and visualize the heatmap of per predict bbox
        for index, target_bbox in enumerate(inference_result['bbox']):
            # target_bbox: [cls, score, x1, y1, x2, y2]
            # filter bboxes with low predicted scores
            if target_bbox[1] < self.FLAGS.draw_threshold:
                continue

            target_bbox_before_nms = int(before_nms_indexes[index])

            # score output is either [num_boxes, num_classes] or
            # [num_images, num_classes, num_boxes]
            if len(extra_data['scores'].shape) == 2:
                score_out = extra_data['scores'][target_bbox_before_nms]
            else:
                score_out = extra_data['scores'][0, :, target_bbox_before_nms]

            # construct one_hot label and do backward to get the gradients
            predicted_label = paddle.argmax(score_out)
            label_onehot = paddle.nn.functional.one_hot(
                predicted_label, num_classes=len(score_out))
            label_onehot = label_onehot.squeeze()
            target = paddle.sum(score_out * label_onehot)
            target.backward(retain_graph=True)

            cam_feat, cam_grad = self._feature_and_grad(target_bbox_before_nms)

            # grad_cam:
            exp = grad_cam(cam_feat, cam_grad)

            x1, y1 = int(target_bbox[2]), int(target_bbox[3])
            x2, y2 = int(target_bbox[4]), int(target_bbox[5])

            if self._uses_image_level_feature():
                # with backbone/neck featuremaps the cam is computed on the
                # whole image and zeroed outside the predicted bbox
                resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))
                mask = np.zeros((img.shape[0], img.shape[1], 3))
                mask[y1:y2, x1:x2, :] = 1
                resized_exp = resized_exp * mask
                # add the bbox cam back to the input image
                overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)
            elif 'roi' in self.target_layer_name:
                # get the bbox part of the image
                bbox_img = copy.deepcopy(img[y1:y2, x1:x2, :])
                # reshape the cam image to the bbox size
                resized_exp = resize_cam(
                    exp, (bbox_img.shape[1], bbox_img.shape[0]))
                # add the bbox cam back to the bbox image
                bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6)
                # put the bbox_cam image to the original image
                overlay_vis = copy.deepcopy(img)
                overlay_vis[y1:y2, x1:x2, :] = bbox_overlay_vis
            else:
                print(
                    'Only supported cam for backbone/neck feature and roi feature, the others are not supported temporarily!'
                )
                sys.exit()

            # put the bbox rectangle on image
            cv2.rectangle(overlay_vis, (x1, y1), (x2, y2), (0, 0, 255), 2)

            # save visualization result
            cam_image = Image.fromarray(overlay_vis)
            cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg')

            # clear gradients after each bbox grad_cam
            target.clear_gradient()
            for n, v in self.trainer.model.named_sublayers():
                v.clear_gradients()
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle +import six +import paddle.version as paddle_version + +from .logger import setup_logger +logger = setup_logger(__name__) + +__all__ = [ + 'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version', + 'check_config' +] + + +def check_mlu(use_mlu): + """ + Log error and exit when set use_mlu=true in paddlepaddle + cpu/gpu/xpu/npu version. + """ + err = "Config use_mlu cannot be set as true while you are " \ + "using paddlepaddle cpu/gpu/xpu/npu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-mlu to run model on MLU \n" \ + "\t2. Set use_mlu as false in config file to run " \ + "model on CPU/GPU/XPU/NPU" + + try: + if use_mlu and not paddle.is_compiled_with_mlu(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_npu(use_npu): + """ + Log error and exit when set use_npu=true in paddlepaddle + version without paddle-custom-npu installed. + """ + err = "Config use_npu cannot be set as true while you are " \ + "using paddlepaddle version without paddle-custom-npu " \ + "installed! \nPlease try: \n" \ + "\t1. Install paddle-custom-npu to run model on NPU \n" \ + "\t2. Set use_npu as false in config file to run " \ + "model on other devices supported." + + try: + if use_npu and not 'npu' in paddle.device.get_all_custom_device_type(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_xpu(use_xpu): + """ + Log error and exit when set use_xpu=true in paddlepaddle + cpu/gpu/npu version. + """ + err = "Config use_xpu cannot be set as true while you are " \ + "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ + "\t2. 
Set use_xpu as false in config file to run " \ + "model on CPU/GPU/NPU" + + try: + if use_xpu and not paddle.is_compiled_with_xpu(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_gpu(use_gpu): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config use_gpu cannot be set as true while you are " \ + "using paddlepaddle cpu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ + "\t2. Set use_gpu as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and not paddle.is_compiled_with_cuda(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(version='2.2'): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version {} or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code.".format(version) + + version_installed = [ + paddle_version.major, paddle_version.minor, paddle_version.patch, + paddle_version.rc + ] + + if version_installed == ['0', '0', '0', '0']: + return + + version_split = version.split('.') + + length = min(len(version_installed), len(version_split)) + for i in six.moves.range(length): + if version_installed[i] > version_split[i]: + return + if version_installed[i] < version_split[i]: + raise Exception(err) + + +def check_config(cfg): + """ + Check the correctness of the configuration file. Log error and exit + when Config is not compliant. + """ + err = "'{}' not specified in config file. Please set it in config file." 
+ check_list = ['architecture', 'num_classes'] + try: + for var in check_list: + if not var in cfg: + logger.error(err.format(var)) + sys.exit(1) + except Exception as e: + pass + + if 'log_iter' not in cfg: + cfg.log_iter = 20 + + return cfg diff --git a/rtdetr_paddle/ppdet/utils/checkpoint.py b/rtdetr_paddle/ppdet/utils/checkpoint.py new file mode 100644 index 0000000..f3dafd4 --- /dev/null +++ b/rtdetr_paddle/ppdet/utils/checkpoint.py @@ -0,0 +1,325 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import numpy as np +import paddle +import paddle.nn as nn +from .download import get_weights_path + +from .logger import setup_logger +logger = setup_logger(__name__) + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. 
+ """ + return path.startswith('http://') \ + or path.startswith('https://') \ + or path.startswith('ppdet://') + + +def _strip_postfix(path): + path, ext = os.path.splitext(path) + assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ + "Unknown postfix {} from weights".format(ext) + return path + + +def load_weight(model, weight, optimizer=None, ema=None, exchange=True): + if is_url(weight): + weight = get_weights_path(weight) + + path = _strip_postfix(weight) + pdparam_path = path + '.pdparams' + if not os.path.exists(pdparam_path): + raise ValueError("Model pretrain path {} does not " + "exists.".format(pdparam_path)) + + if ema is not None and os.path.exists(path + '.pdema'): + if exchange: + # Exchange model and ema_model to load + logger.info('Exchange model and ema_model to load:') + ema_state_dict = paddle.load(pdparam_path) + logger.info('Loading ema_model weights from {}'.format(path + + '.pdparams')) + param_state_dict = paddle.load(path + '.pdema') + logger.info('Loading model weights from {}'.format(path + '.pdema')) + else: + ema_state_dict = paddle.load(path + '.pdema') + logger.info('Loading ema_model weights from {}'.format(path + + '.pdema')) + param_state_dict = paddle.load(pdparam_path) + logger.info('Loading model weights from {}'.format(path + + '.pdparams')) + else: + ema_state_dict = None + param_state_dict = paddle.load(pdparam_path) + + if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): + print('Loading pretrain weights for Teacher-Student framework.') + print('Loading pretrain weights for Student model.') + student_model_dict = model.modelStudent.state_dict() + student_param_state_dict = match_state_dict( + student_model_dict, param_state_dict, mode='student') + model.modelStudent.set_dict(student_param_state_dict) + print('Loading pretrain weights for Teacher model.') + teacher_model_dict = model.modelTeacher.state_dict() + + teacher_param_state_dict = match_state_dict( + teacher_model_dict, param_state_dict, 
def load_weight(model, weight, optimizer=None, ema=None, exchange=True):
    """
    Resume model (and optionally optimizer / EMA) state from a checkpoint.

    Args:
        model: the model to load parameters into.
        weight (str): checkpoint path or URL, with or without extension.
        optimizer: optimizer to restore from `<path>.pdopt`, if it exists.
        ema: EMA wrapper to restore from `<path>.pdema`, if it exists.
        exchange (bool): when an EMA checkpoint exists, load `.pdema` into
            the model and `.pdparams` into the EMA model.

    Returns:
        int: the epoch stored in the optimizer checkpoint, or 0.

    Raises:
        ValueError: if `<path>.pdparams` does not exist.
    """
    if is_url(weight):
        weight = get_weights_path(weight)

    path = _strip_postfix(weight)
    pdparam_path = path + '.pdparams'
    pdema_path = path + '.pdema'
    pdopt_path = path + '.pdopt'
    if not os.path.exists(pdparam_path):
        raise ValueError("Model pretrain path {} does not "
                         "exists.".format(pdparam_path))

    if ema is not None and os.path.exists(pdema_path):
        if exchange:
            # Exchange model and ema_model to load
            logger.info('Exchange model and ema_model to load:')
            ema_state_dict = paddle.load(pdparam_path)
            logger.info('Loading ema_model weights from {}'.format(
                pdparam_path))
            param_state_dict = paddle.load(pdema_path)
            logger.info('Loading model weights from {}'.format(pdema_path))
        else:
            ema_state_dict = paddle.load(pdema_path)
            logger.info('Loading ema_model weights from {}'.format(pdema_path))
            param_state_dict = paddle.load(pdparam_path)
            logger.info('Loading model weights from {}'.format(pdparam_path))
    else:
        ema_state_dict = None
        param_state_dict = paddle.load(pdparam_path)

    if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'):
        print('Loading pretrain weights for Teacher-Student framework.')
        print('Loading pretrain weights for Student model.')
        student_model_dict = model.modelStudent.state_dict()
        student_param_state_dict = match_state_dict(
            student_model_dict, param_state_dict, mode='student')
        model.modelStudent.set_dict(student_param_state_dict)
        print('Loading pretrain weights for Teacher model.')
        teacher_model_dict = model.modelTeacher.state_dict()

        teacher_param_state_dict = match_state_dict(
            teacher_model_dict, param_state_dict, mode='teacher')
        model.modelTeacher.set_dict(teacher_param_state_dict)

    else:
        model_weight = {}
        incorrect_keys = 0
        for key in model.state_dict():
            if key in param_state_dict:
                model_weight[key] = param_state_dict[key]
            else:
                logger.info('Unmatched key: {}'.format(key))
                incorrect_keys += 1
        assert incorrect_keys == 0, \
            "Load weight {} incorrectly, {} keys unmatched, " \
            "please check again.".format(weight, incorrect_keys)
        model.set_dict(model_weight)
        # report success only after the weights are actually in place
        logger.info('Finish resuming model weights: {}'.format(pdparam_path))

    last_epoch = 0
    if optimizer is not None and os.path.exists(pdopt_path):
        optim_state_dict = paddle.load(pdopt_path)
        # to solve resume bug, will it be fixed in paddle 2.0
        # (fetch the current state once instead of once per key)
        current_state = optimizer.state_dict()
        for key in current_state:
            if key not in optim_state_dict:
                optim_state_dict[key] = current_state[key]
        if 'last_epoch' in optim_state_dict:
            last_epoch = optim_state_dict.pop('last_epoch')
        optimizer.set_state_dict(optim_state_dict)

        if ema_state_dict is not None:
            ema.resume(ema_state_dict,
                       optim_state_dict['LR_Scheduler']['last_epoch'])
    elif ema_state_dict is not None:
        ema.resume(ema_state_dict)
    return last_epoch


def match_state_dict(model_state_dict, weight_state_dict, mode='default'):
    """
    Match between the model state dict and pretrained weight state dict.
    Return the matched state dict.

    A weight key matches a model key when they are equal or when the model
    key ends with '.' + weight key (in 'default' mode a leading 'backbone.'
    is first stripped from 'backbone.res5...' weight keys). When several
    weight keys match one model key, the longest one wins. For example, for
    the model key 'backbone.res2.res2a.branch2a.conv.weight' and the weight
    keys 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight', the
    former is selected.

    Args:
        model_state_dict (dict): parameter name -> value exposing `.shape`.
        weight_state_dict (dict): pretrained name -> value exposing `.shape`.
        mode (str): 'default', 'student' (ignore 'modelTeacher*' weights) or
            'teacher' (ignore 'modelStudent*' weights). In the latter two
            modes a weight key ending with '.' + model key also matches.

    Returns:
        dict: model key -> matched weight value; shape mismatches are skipped.

    Raises:
        ValueError: if `weight_state_dict` is empty, or if one weight is the
            best match of more than one model key.
    """
    model_keys = sorted(model_state_dict.keys())
    weight_keys = sorted(weight_state_dict.keys())
    if not weight_keys:
        # argmax over an empty axis would fail with an unhelpful numpy error
        raise ValueError('The pretrained weight state dict has no parameters '
                         'to match against the model.')

    def teacher_match(a, b):
        # skip student params
        if b.startswith('modelStudent'):
            return False
        return a == b or a.endswith("." + b) or b.endswith("." + a)

    def student_match(a, b):
        # skip teacher params
        if b.startswith('modelTeacher'):
            return False
        return a == b or a.endswith("." + b) or b.endswith("." + a)

    def match(a, b):
        if b.startswith('backbone.res5'):
            b = b[9:]  # drop the leading 'backbone.'
        return a == b or a.endswith("." + b)

    match_op = {'student': student_match,
                'teacher': teacher_match}.get(mode, match)

    # match_matrix[i, j] = length of weight key j if it matches model key i
    match_matrix = np.zeros([len(model_keys), len(weight_keys)])
    for i, m_k in enumerate(model_keys):
        for j, w_k in enumerate(weight_keys):
            if match_op(m_k, w_k):
                match_matrix[i, j] = len(w_k)
    max_id = match_matrix.argmax(1)
    max_len = match_matrix.max(1)
    max_id[max_len == 0] = -1  # -1 marks model keys without any match
    load_id = set(max_id)
    load_id.discard(-1)

    if weight_keys[0].startswith(('modelStudent', 'modelTeacher')):
        # teacher/student checkpoint: report model keys left unmatched
        not_load_weight_name = [
            model_keys[idx] for idx, w_id in enumerate(max_id) if w_id == -1
        ]
        if len(not_load_weight_name) > 0:
            logger.info('{} in model is not matched with pretrained weights, '
                        'and its will be trained from scratch'.format(
                            not_load_weight_name))
    else:
        # plain checkpoint: report weights that no model key selected
        not_load_weight_name = [
            w_k for idx, w_k in enumerate(weight_keys) if idx not in load_id
        ]
        if len(not_load_weight_name) > 0:
            logger.info('{} in pretrained weight is not used in the model, '
                        'and its will not be loaded'.format(
                            not_load_weight_name))

    matched_keys = {}
    result_state_dict = {}
    for model_id, weight_id in enumerate(max_id):
        if weight_id == -1:
            continue
        model_key = model_keys[model_id]
        weight_key = weight_keys[weight_id]
        weight_value = weight_state_dict[weight_key]
        model_value_shape = list(model_state_dict[model_key].shape)

        if list(weight_value.shape) != model_value_shape:
            logger.info(
                'The shape {} in pretrained weight {} is unmatched with '
                'the shape {} in model {}. And the weight {} will not be '
                'loaded'.format(weight_value.shape, weight_key,
                                model_value_shape, model_key, weight_key))
            continue

        assert model_key not in result_state_dict
        result_state_dict[model_key] = weight_value
        if weight_key in matched_keys:
            raise ValueError('Ambiguity weight {} loaded, it matches at least '
                             '{} and {} in the model'.format(
                                 weight_key, model_key, matched_keys[
                                     weight_key]))
        matched_keys[weight_key] = model_key
    return result_state_dict
def save_model(model,
               optimizer,
               save_dir,
               save_name,
               last_epoch,
               ema_model=None):
    """
    Save model weights and optimizer state to disk.

    Only the process whose distributed rank is 0 writes anything; every
    other rank returns immediately.

    Args:
        model (nn.Layer|dict): the model, or its state_dict, to save.
        optimizer (paddle.optimizer.Optimizer): the Optimizer instance to
            save optimizer states.
        save_dir (str): the directory to be saved.
        save_name (str): file name prefix; ".pdparams", ".pdema" and
            ".pdopt" are appended to it.
        last_epoch (int): the epoch index, stored in the optimizer state
            under the key 'last_epoch'.
        ema_model (dict|None): the ema_model state_dict to save parameters.
            Only used when `model` is a dict.
    """
    if paddle.distributed.get_rank() != 0:
        return
    # exist_ok avoids the race between an existence check and the creation
    # when another process creates the directory in between.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, save_name)
    # save model
    if isinstance(model, nn.Layer):
        paddle.save(model.state_dict(), save_path + ".pdparams")
    else:
        assert isinstance(model,
                          dict), 'model is not a instance of nn.layer or dict'
        if ema_model is None:
            paddle.save(model, save_path + ".pdparams")
        else:
            assert isinstance(ema_model,
                              dict), ("ema_model is not a instance of dict, "
                                      "please call model.state_dict() to get.")
            # Exchange model and ema_model to save: the EMA weights go to
            # ".pdparams" and the raw weights go to ".pdema".
            paddle.save(ema_model, save_path + ".pdparams")
            paddle.save(model, save_path + ".pdema")
    # save optimizer
    state_dict = optimizer.state_dict()
    state_dict['last_epoch'] = last_epoch
    paddle.save(state_dict, save_path + ".pdopt")
    logger.info("Save checkpoint: {}".format(save_dir))
class ColorTTY(object):
    """Wrap text in ANSI SGR escape sequences for colored terminal output.

    Each name in `self.colors` is available as a dynamic method, e.g.
    ``ColorTTY().red("text")``. The color code is the index of the name in
    `self.colors` plus 31.
    """

    def __init__(self):
        super(ColorTTY, self).__init__()
        self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan']

    def __getattr__(self, attr):
        # __getattr__ only runs when normal lookup failed. Read `colors`
        # straight from __dict__ so an instance that has not run __init__
        # yet (copy / unpickle) cannot recurse back into this method.
        colors = self.__dict__.get('colors', ())
        if attr not in colors:
            # Returning None here would make hasattr() true for every name.
            raise AttributeError("'{}' object has no attribute '{}'".format(
                type(self).__name__, attr))
        color = colors.index(attr) + 31

        def color_message(message):
            return "\033[{}m{}\033[0m".format(color, message)

        # Cache on the instance so later lookups bypass __getattr__.
        setattr(self, attr, color_message)
        return color_message

    def bold(self, message):
        """Return `message` wrapped in the bold SGR code."""
        return self.with_code('01', message)

    def with_code(self, code, message):
        """Return `message` wrapped in SGR `code`, followed by a reset."""
        return "\033[{}m{}\033[0m".format(code, message)
def merge_args(config, args, exclude_args=('config', 'opt', 'slim_config')):
    """Copy parsed command-line attributes into `config`.

    Args:
        config (dict-like): mapping updated in place.
        args (argparse.Namespace): parsed arguments; every attribute whose
            name is not in `exclude_args` is written to `config`.
        exclude_args (container of str): attribute names to leave out.
            The default is an immutable tuple so it cannot be altered
            between calls.

    Returns:
        The same `config` object, after the update.
    """
    for k, v in vars(args).items():
        if k not in exclude_args:
            config[k] = v
    return config
dump_value(value) + " " + elif name in dep_missing: + value = dump_value(value) + " " + elif name in override and value != '': + mark = green + new_name = mark + name + replacement[new_name] = value + styled[key] = replacement + buffer = yaml.dump(styled, default_flow_style=False, default_style='') + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", r"[33m[0m", buffer)) + buffer = (re.sub(r"", r"[31m[0m", buffer)) + buffer = (re.sub(r"", + r"[31m[0m", buffer)) + buffer = re.sub(r"___(\d+)___(.*?):", r"[\1m\2[0m:", buffer) + print(buffer) diff --git a/rtdetr_paddle/ppdet/utils/colormap.py b/rtdetr_paddle/ppdet/utils/colormap.py new file mode 100644 index 0000000..67c68dc --- /dev/null +++ b/rtdetr_paddle/ppdet/utils/colormap.py @@ -0,0 +1,58 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def colormap(rgb=False):
    """
    Build the fixed color palette as an integer array.

    The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py

    Args:
        rgb (bool): when False (default) the channel order of every row is
            reversed; when True the rows are returned in the order listed
            in the table below.

    Returns:
        np.ndarray: int32 array with 3 columns, one row per color, with
        values scaled from the [0, 1] table to [0, 255] and truncated.
    """
    palette = np.asarray(
        [
            0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494,
            0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078,
            0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000,
            1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000,
            0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667,
            0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000,
            0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000,
            1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000,
            0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500,
            0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667,
            0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333,
            0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000,
            0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333,
            0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000,
            1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000,
            1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167,
            0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000,
            0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000,
            0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000,
            0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000,
            0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833,
            0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286,
            0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714,
            0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000
        ],
        dtype=np.float32)
    # One row per color, scaled to the 0-255 range (still float32 here).
    scaled = palette.reshape((-1, 3)) * 255
    ordered = scaled if rgb else scaled[:, ::-1]
    return ordered.astype('int32')
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import sys +import yaml +import time +import shutil +import requests +import tqdm +import hashlib +import base64 +import binascii +import tarfile +import zipfile +import errno + +from paddle.utils.download import _get_unique_endpoints +from ppdet.core.workspace import BASE_KEY +from .logger import setup_logger +from .voc_utils import create_list + +logger = setup_logger(__name__) + +__all__ = [ + 'get_weights_path', 'get_dataset_path', 'get_config_path', + 'download_dataset', 'create_voc_list' +] + +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") +DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") +CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") + +# dict of {dataset_name: (download_info, sub_dirs)} +# download info: [(url, md5sum)] +DATASETS = { + 'coco': ([ + ( + 'http://images.cocodataset.org/zips/train2017.zip', + 'cced6f7f71b7629ddf16f17bbcfab6b2', ), + ( + 'http://images.cocodataset.org/zips/val2017.zip', + '442b8da7639aecaf257c1dceb8ba8c80', ), + ( + 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', + 'f4bbac642086de4f52a3fdda2de5fa2c', ), + ], ["annotations", "train2017", "val2017"]), + 'voc': ([ + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', + '6cd6e144f989b92b3379bac3b3de84fd', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', + 'c52e279531787c972589f7e41ab4ae64', ), + ( + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', + 'b6e924de25625d8de591ea690078ad9f', ), + ( + 'https://paddledet.bj.bcebos.com/data/label_list.txt', + '5ae5d62183cfb6f6d3ac109359d06a1b', ), + ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), + 'wider_face': ([ + ( + 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', + '3fedf70df600953d25982bcd13d91ba2', ), + ( + 
# When running unit tests, several processes may try to create the same
# directory at once, so an "if not exists: create" check is not safe.
# The filesystem is used as the synchronization mechanism instead: the
# directory is created unconditionally and "already exists" is tolerated.
def must_mkdirs(path):
    """Create directory `path` (with parents), tolerating concurrent creation.

    Args:
        path (str): directory to create.

    Raises:
        OSError: if creation fails for any reason other than the directory
            already existing. This includes the case where `path` exists
            but is not a directory, which was previously ignored and only
            surfaced later as a confusing write failure.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # EEXIST is only harmless when what exists really is a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def get_dataset_path(path, annotation, image_dir):
    """
    If path exists, return path.
    Otherwise, get dataset path from DATASET_HOME, if not exists,
    download it.

    Args:
        path (str): configured dataset directory. Its last component
            (lower-cased) is used as the dataset name when the directory
            is not usable as-is.
        annotation (str): annotation file, relative to `path`.
        image_dir (str): image directory, relative to `path`.

    Returns:
        str: `path` itself when it is valid, otherwise the dataset
        directory under DATASET_HOME.

    Raises:
        ValueError: if `path` is invalid and its name is not one of the
            downloadable datasets.
    """
    if _dataset_exists(path, annotation, image_dir):
        return path

    # normpath drops a trailing separator, so "dataset/coco/" still
    # resolves to the name "coco" instead of an empty string.
    data_name = osp.basename(osp.normpath(path.strip().lower()))
    if data_name not in DOWNLOAD_DATASETS_LIST:
        raise ValueError(
            "Dataset {} is not valid for reason above, please check again.".
            format(osp.realpath(path)))
    logger.warning(
        "Dataset {} is not valid for reason above, try searching {} or "
        "downloading dataset...".format(osp.realpath(path), DATASET_HOME))

    dataset = DATASETS.get(data_name)
    if dataset is None:
        raise ValueError("Dataset automaticly downloading Error.")
    name = data_name
    download_info, sub_dirs = dataset

    logger.debug("Parse dataset_dir {} as dataset "
                 "{}".format(path, name))
    data_dir = osp.join(DATASET_HOME, name)

    if name == "spine_coco":
        if _dataset_exists(data_dir, annotation, image_dir):
            return data_dir

    # These datasets are considered present as soon as all of their
    # expected sub-directories exist (for voc: VOCdevkit/VOC2012 and
    # VOCdevkit/VOC2007).
    dir_checked = ('voc', 'fruit', 'roadsign_voc')
    if name in dir_checked:
        exists = True
        for sub_dir in sub_dirs:
            check_dir = osp.join(data_dir, sub_dir)
            if osp.exists(check_dir):
                logger.info("Found {}".format(check_dir))
            else:
                exists = False
        if exists:
            return data_dir

    # For the directory-checked datasets existence was already ruled out
    # above, so get_path is told not to check again.
    check_exist = name not in dir_checked
    for url, md5sum in download_info:
        get_path(url, data_dir, md5sum, check_exist)

    # voc should create list after download
    if name == 'voc':
        create_voc_list(data_dir)
    return data_dir
def map_path(url, root_dir, path_depth=1):
    """Map a download url to the local path it occupies under `root_dir`.

    The last `path_depth` components of `url` are kept, trailing archive
    suffixes (.zip, .tar, .gz, possibly stacked as in ".tar.gz") are
    removed, and the result is joined onto `root_dir`.

    Args:
        url (str): download url.
        root_dir (str): local root directory.
        path_depth (int): number of trailing url components to keep;
            must be positive.

    Returns:
        str: the local path.
    """
    # parse path after download to decompress under root_dir
    assert path_depth > 0, "path_depth should be a positive integer"
    dirname = url
    for _ in range(path_depth):
        dirname = osp.dirname(dirname)
    fpath = osp.relpath(url, dirname)

    # Strip archive extensions only where they are real trailing suffixes.
    # A plain str.replace would also eat ".zip"/".tar"/".gz" in the middle
    # of a name and produce a path that never matches the downloaded file.
    zip_formats = ('.zip', '.tar', '.gz')
    while fpath.endswith(zip_formats):
        fpath = fpath[:fpath.rfind('.')]
    return osp.join(root_dir, fpath)
def _download(url, path, md5sum=None):
    """
    Download from url, save to path.

    The loop repeats until the target file exists and passes the md5
    check, giving up after DOWNLOAD_RETRY_LIMIT attempts.

    Args:
        url (str): download url
        path (str): download to given path (a directory)
        md5sum (str|None): expected md5 of the downloaded file

    Returns:
        str: full path of the downloaded file.

    Raises:
        RuntimeError: if the server answers with a status other than 200,
            or if the retry limit is reached.
    """
    must_mkdirs(path)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    while not (osp.exists(fullname) and
               _check_exist_file_md5(fullname, md5sum, url)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))

        logger.info("Downloading {} from {}".format(fname, url))

        # NOTE: windows path join may incur \, which is invalid in url
        if sys.platform == "win32":
            url = url.replace('\\', '/')

        req = requests.get(url, stream=True)
        try:
            if req.status_code != 200:
                raise RuntimeError("Downloading from {} failed with code "
                                   "{}!".format(url, req.status_code))

            # For protecting download interupted, download to
            # tmp_fullname firstly, move tmp_fullname to fullname
            # after download finished
            tmp_fullname = fullname + "_tmp"
            total_size = req.headers.get('content-length')
            chunks = req.iter_content(chunk_size=1024)
            if total_size:
                # Progress bar only when the server reports the size.
                chunks = tqdm.tqdm(
                    chunks, total=(int(total_size) + 1023) // 1024, unit='KB')
            with open(tmp_fullname, 'wb') as f:
                for chunk in chunks:
                    if chunk:
                        f.write(chunk)
        finally:
            # A streamed response keeps its connection open until it is
            # closed explicitly; release it on success and on error.
            req.close()
        shutil.move(tmp_fullname, fullname)
    return fullname
def _decompress(fname):
    """
    Decompress for zip and tar file

    The archive is extracted into a scratch directory "tmp" next to it,
    the extracted entries are merged into the archive's own directory,
    then the scratch directory and the archive are removed. A ".txt" file
    is not an archive and is left untouched.

    Args:
        fname (str): path of the downloaded file.

    Raises:
        TypeError: if the file name matches no supported type.
    """
    logger.info("Decompressing {}...".format(fname))

    # Decide the type from the file name only. Matching on the whole path
    # would misroute e.g. a zip stored under a directory whose name
    # happens to contain "tar".
    base = osp.basename(fname)
    if base.find('tar') >= 0:
        open_archive = tarfile.open
    elif base.find('zip') >= 0:
        open_archive = zipfile.ZipFile
    elif base.find('.txt') >= 0:
        # Nothing to extract; return before touching the scratch dir.
        return
    else:
        raise TypeError("Unsupport compress file type {}".format(fname))

    # For protecting decompressing interupted,
    # decompress to fpath_tmp directory firstly, if decompress
    # successed, move decompress files to fpath and delete
    # fpath_tmp and remove download compress file.
    fpath = osp.split(fname)[0]
    fpath_tmp = osp.join(fpath, 'tmp')
    if osp.isdir(fpath_tmp):
        shutil.rmtree(fpath_tmp)
    os.makedirs(fpath_tmp)

    # NOTE(review): extractall trusts member paths inside the archive;
    # archives come from the urls configured in this module. Confirm that
    # is acceptable before pointing this at arbitrary sources.
    with open_archive(fname) as archive:
        archive.extractall(path=fpath_tmp)

    for f in os.listdir(fpath_tmp):
        src_dir = osp.join(fpath_tmp, f)
        dst_dir = osp.join(fpath, f)
        _move_and_merge_tree(src_dir, dst_dir)

    shutil.rmtree(fpath_tmp)
    os.remove(fname)
unique_endpoints: + with open(lock_path, 'w'): # touch + os.utime(lock_path, None) + _decompress(fname) + os.remove(lock_path) + else: + time.sleep(1) + while os.path.exists(lock_path): + time.sleep(0.5) + else: + _decompress(fname) + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not osp.exists(dst): + shutil.move(src, dst) + elif osp.isfile(src): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = osp.join(src, fp) + dst_fp = osp.join(dst, fp) + if osp.isdir(src_fp): + if osp.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif osp.isfile(src_fp) and \ + not osp.isfile(dst_fp): + shutil.move(src_fp, dst_fp) diff --git a/rtdetr_paddle/ppdet/utils/fuse_utils.py b/rtdetr_paddle/ppdet/utils/fuse_utils.py new file mode 100644 index 0000000..647fa99 --- /dev/null +++ b/rtdetr_paddle/ppdet/utils/fuse_utils.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def fuse_conv_bn(model):
    """Fuse Conv2D + BatchNorm2D pairs of `model` and return the result.

    The sublayers are walked in `named_sublayers()` order. Each
    BatchNorm2D is paired with the most recent Conv2D seen before it that
    has not been paired yet, and the pairs are handed to `fuse_layers`.

    If `model` is in training mode it is switched to eval mode first, and
    the returned model is switched back to training mode.

    Args:
        model (nn.Layer): the model to fuse.

    Returns:
        nn.Layer: the model returned by `fuse_layers`.
    """
    is_train = False
    if model.training:
        model.eval()
        is_train = True
    fuse_list = []
    pending_conv = None
    for name, layer in model.named_sublayers():
        if isinstance(layer, nn.Conv2D):
            pending_conv = name
        elif isinstance(layer, nn.BatchNorm2D) and pending_conv is not None:
            # Only pair a BN with a conv that came before it. A BN with no
            # pending conv is skipped instead of being remembered and
            # attached to some later, unrelated conv.
            fuse_list.append([pending_conv, name])
            pending_conv = None
    # NOTE(review): pairing relies on traversal order only; it assumes a
    # BN that follows a conv in this order consumes that conv's output.
    model = fuse_layers(model, fuse_list)
    if is_train:
        model.train()
    return model
def _fuse_func(layer_list):
    '''
    Choose the fuser method for `layer_list` and fuse the layers.

    The fused layer replaces the first layer; every following layer is
    replaced by an `Identity`. Forward pre-hooks of the first layer and
    forward post-hooks of the last layer are moved onto the fused layer.

    Args:
        layer_list(list): the layers to fuse, in execution order.

    Returns:
        list: replacement layers, same length as `layer_list`.

    Raises:
        TypeError: if no fusion method is registered for the layer types.
    '''
    types = tuple(type(m) for m in layer_list)
    fusion_method = types_to_fusion_method.get(types, None)
    if fusion_method is None:
        # Previously this surfaced as "'NoneType' object is not callable".
        raise TypeError(
            "No fusion method registered for layer types: {}".format(types))

    new_layers = [None] * len(layer_list)
    fused_layer = fusion_method(*layer_list)

    # Iterate over snapshots: the hook dicts are modified inside the loops,
    # and mutating a dict while iterating it raises RuntimeError.
    first_layer, last_layer = layer_list[0], layer_list[-1]
    for handle_id, pre_hook_fn in list(first_layer._forward_pre_hooks.items()):
        fused_layer.register_forward_pre_hook(pre_hook_fn)
        del first_layer._forward_pre_hooks[handle_id]
    for handle_id, hook_fn in list(last_layer._forward_post_hooks.items()):
        fused_layer.register_forward_post_hook(hook_fn)
        del last_layer._forward_post_hooks[handle_id]

    new_layers[0] = fused_layer
    for i in range(1, len(layer_list)):
        identity = Identity()
        identity.training = first_layer.training
        new_layers[i] = identity
    return new_layers
def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
    '''
    Fold batch-norm statistics and affine parameters into conv weights.

    Missing parameters fall back to neutral values shaped like `bn_rm`:
    zeros for `conv_b` and `bn_b`, ones for `bn_w`.

    Returns:
        (fused_weight, fused_bias)
    '''
    bias = paddle.zeros_like(bn_rm) if conv_b is None else conv_b
    gamma = paddle.ones_like(bn_rm) if bn_w is None else bn_w
    beta = paddle.zeros_like(bn_rm) if bn_b is None else bn_b

    inv_std = paddle.rsqrt(bn_rv + bn_eps)

    # Reshape the per-channel scale to [-1, 1, ..., 1] so it broadcasts
    # over every remaining dimension of the conv weight.
    broadcast_shape = [-1] + [1] * (len(conv_w.shape) - 1)
    fused_w = conv_w * (gamma * inv_std).reshape(broadcast_shape)
    fused_b = (bias - bn_rm) * inv_std * gamma + beta
    return fused_w, fused_b
def setup_logger(name="ppdet", output=None):
    """
    Initialize logger and set its verbosity level to INFO.

    A logger is configured only once per `name`; later calls with the same
    name return the already configured logger unchanged.

    Args:
        output (str): a file name or a directory to save log. If None, will not save log file.
            If ends with ".txt" or ".log", assumed to be a file name.
            Otherwise, logs will be saved to `output/log.txt`.
        name (str): the root module name of this logger

    Returns:
        logging.Logger: a logger
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger

    logger.setLevel(logging.INFO)
    logger.propagate = False

    formatter = logging.Formatter(
        "[%(asctime)s] %(name)s %(levelname)s: %(message)s",
        datefmt="%m/%d %H:%M:%S")
    # stdout logging: master only
    local_rank = dist.get_rank()
    if local_rank == 0:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    # file logging: all workers
    if output is not None:
        if output.endswith((".txt", ".log")):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")
        if local_rank > 0:
            filename = filename + ".rank{}".format(local_rank)

        # The directory part is empty for a bare file name such as
        # "train.log", and the directory usually already exists on a
        # second run; neither case may fail.
        log_dir = os.path.dirname(filename)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        fh = logging.FileHandler(filename, mode='a')
        fh.setLevel(logging.DEBUG)
        # NOTE: the file handler uses the default formatter (message only),
        # unlike the stdout handler above.
        fh.setFormatter(logging.Formatter())
        logger.addHandler(fh)
    logger_initialized.append(name)
    return logger
class ProfilerOptions(object):
    '''
    Use a string to initialize a ProfilerOptions.
    The string should be in the format: "key1=value1;key2=value;key3=value3".
    For example:
      "profile_path=model.profile"
      "batch_range=[50, 60]; profile_path=model.profile"
      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"

    All spaces are removed before parsing. Empty segments (an empty
    string, a trailing ';' or ';;') are skipped, unknown keys are ignored
    and options that are not given keep their defaults.

    ProfilerOptions supports following key-value pair:
      batch_range      - a integer list, e.g. [100, 110].
      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
      sorted_key       - a string, the optional values are 'calls', 'total',
                         'max', 'min' or 'ave'.
      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
                         'AllOpDetail'.
      profile_path     - a string, the path to save the serialized profile data,
                         which can be used to generate a timeline.
      exit_on_finished - a boolean.
    '''

    def __init__(self, options_str):
        assert isinstance(options_str, str)

        self._options = {
            'batch_range': [10, 20],
            'state': 'All',
            'sorted_key': 'total',
            'tracer_option': 'Default',
            'profile_path': '/tmp/profile',
            'exit_on_finished': True
        }
        self._parse_from_string(options_str)

    def _parse_from_string(self, options_str):
        '''
        Update the defaults from `options_str`.

        Raises:
            ValueError: if a non-empty segment has no '=' or a batch_range
                entry is not an integer.
        '''
        for kv in options_str.replace(' ', '').split(';'):
            if not kv:
                # Tolerate "", a trailing ';' and ';;'.
                continue
            # Split on the first '=' only so values may contain '='.
            key, sep, value = kv.partition('=')
            if not sep:
                raise ValueError(
                    "Invalid profiler option '%s', expected 'key=value'." % kv)
            if key == 'batch_range':
                value_list = value.replace('[', '').replace(']', '').split(',')
                value_list = list(map(int, value_list))
                # An invalid range is ignored and the default is kept.
                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
                        1] > value_list[0]:
                    self._options[key] = value_list
            elif key == 'exit_on_finished':
                self._options[key] = value.lower() in ("yes", "true", "t", "1")
            elif key in [
                    'state', 'sorted_key', 'tracer_option', 'profile_path'
            ]:
                self._options[key] = value

    def __getitem__(self, name):
        if self._options.get(name, None) is None:
            raise ValueError(
                "ProfilerOptions does not have an option named %s." % name)
        return self._options[name]
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.

    Args:
        window_size (int): number of most recent values kept for the
            windowed statistics (median, avg, max, value).
        fmt (str): format string used by ``str()``. It may reference
            ``median``, ``avg``, ``max``, ``value`` and ``global_avg``.
            Defaults to "{median:.4f} ({avg:.4f})".
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({avg:.4f})"
        self.deque = collections.deque(maxlen=window_size)
        self.fmt = fmt
        self.total = 0.
        self.count = 0

    def update(self, value, n=1):
        """Record `value`; it counts `n` times towards the global average."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    @property
    def median(self):
        return np.median(self.deque)

    @property
    def avg(self):
        return np.mean(self.deque)

    @property
    def max(self):
        return np.max(self.deque)

    @property
    def value(self):
        # Most recently recorded value.
        return self.deque[-1]

    @property
    def global_avg(self):
        return self.total / self.count

    def __str__(self):
        # global_avg is supplied so a custom fmt can reference it. It is
        # guarded because count can be zero (e.g. update(v, n=0)), and
        # formatting must not fail for a fmt that does not use it.
        global_avg = self.global_avg if self.count else float('nan')
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            max=self.max,
            value=self.value,
            global_avg=global_avg)
def visualize_results(image,
                      bbox_res,
                      mask_res,
                      segm_res,
                      keypoint_res,
                      pose3d_res,
                      im_id,
                      catid2name,
                      threshold=0.5):
    """
    Draw every available result type on `image` and return the result.

    Result arguments that are None are skipped. The drawing order is
    bbox, mask, segmentation, keypoints, 3D pose; each stage receives the
    image produced by the previous one.
    """

    def _render_pose3d(img):
        # Only the first entry is visualized; coordinates are scaled by 1000.
        joints = np.array(pose3d_res[0]['pose3d']) * 1000
        return draw_pose3d(img, joints, visual_thread=threshold)

    stages = (
        (bbox_res,
         lambda img: draw_bbox(img, im_id, catid2name, bbox_res, threshold)),
        (mask_res, lambda img: draw_mask(img, im_id, mask_res, threshold)),
        (segm_res,
         lambda img: draw_segm(img, im_id, catid2name, segm_res, threshold)),
        (keypoint_res, lambda img: draw_pose(img, keypoint_res, threshold)),
        (pose3d_res, _render_pose3d),
    )
    for result, render in stages:
        if result is not None:
            image = render(image)
    return image
def draw_bbox(image, im_id, catid2name, bboxes, threshold):
    """
    Draw bounding boxes and their labels on `image`.

    Only detections whose `image_id` equals `im_id` and whose score is at
    least `threshold` are drawn. A 4-value bbox is read as
    (xmin, ymin, w, h); an 8-value bbox is read as four corner points
    (x1, y1, ..., x4, y4). A detection with any other bbox length is
    logged and skipped.

    Args:
        image: image to draw on; it is modified in place through
            `ImageDraw.Draw` and also returned.
        im_id: id of the image being drawn.
        catid2name (dict): category id -> display name.
        bboxes: iterable of dicts with the keys 'image_id', 'category_id',
            'bbox' and 'score'.
        threshold (float): minimum score for a detection to be drawn.

    Returns:
        The input `image`.
    """
    draw = ImageDraw.Draw(image)

    catid2color = {}
    color_list = colormap(rgb=True)[:40]
    for dt in np.array(bboxes):
        if im_id != dt['image_id']:
            continue
        catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']
        if score < threshold:
            continue

        # One randomly chosen colour per category, fixed for this call.
        if catid not in catid2color:
            idx = np.random.randint(len(color_list))
            catid2color[catid] = color_list[idx]
        color = tuple(catid2color[catid])

        # draw bbox
        if len(bbox) == 4:
            xmin, ymin, w, h = bbox
            xmax = xmin + w
            ymax = ymin + h
            draw.line(
                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
                 (xmin, ymin)],
                width=2,
                fill=color)
        elif len(bbox) == 8:
            x1, y1, x2, y2, x3, y3, x4, y4 = bbox
            draw.line(
                [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
                width=2,
                fill=color)
            # The label is anchored at the top-left of the enclosing box.
            xmin = min(x1, x2, x3, x4)
            ymin = min(y1, y2, y3, y4)
        else:
            logger.error('the shape of bbox must be [M, 4] or [M, 8]!')
            # No anchor is available for the label; falling through would
            # use undefined (or stale) xmin/ymin.
            continue

        # draw label
        text = "{} {:.2f}".format(catid2name[catid], score)
        left, top, right, bottom = draw.textbbox((0, 0), text)
        tw, th = right - left, bottom - top

        draw.rectangle(
            [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color)
        draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))

    return image
def draw_segm(image,
              im_id,
              catid2name,
              segms,
              threshold,
              alpha=0.7,
              draw_box=True):
    """
    Draw segmentation masks on `image`.

    Every detection whose `image_id` equals `im_id` and whose score is at
    least `threshold` is blended onto the image with weight `alpha`. With
    `draw_box=True` the extent of the mask is outlined and labelled with
    the category name and score; otherwise only the category name is
    written near the mask's centre of mass. A mask without any foreground
    pixel gets neither a box nor a label.

    Args:
        image: input image, converted with `np.array(image)`.
        im_id: id of the image being drawn.
        catid2name (dict): category id -> display name.
        segms: iterable of dicts with the keys 'image_id', 'segmentation'
            (decoded with `pycocotools.mask.decode`), 'score' and
            'category_id'.
        threshold (float): minimum score for a detection to be drawn.
        alpha (float): blending weight of the mask colour.
        draw_box (bool): draw box plus "name score" label instead of the
            name only.

    Returns:
        PIL.Image.Image: the rendered image.
    """
    mask_color_id = 0
    w_ratio = .4
    color_list = colormap(rgb=True)
    img_array = np.array(image).astype('float32')
    for dt in np.array(segms):
        if im_id != dt['image_id']:
            continue
        segm, score, catid = dt['segmentation'], dt['score'], dt['category_id']
        if score < threshold:
            continue
        # Imported lazily so pycocotools is only needed when a mask is drawn.
        import pycocotools.mask as mask_util
        mask = mask_util.decode(segm) * 255

        # Work on a copy: lightening the slice in place would also modify
        # color_list and lighten the colour again when the palette wraps.
        color_mask = color_list[mask_color_id % len(color_list), 0:3].copy()
        mask_color_id += 1
        for c in range(3):
            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
        idx = np.nonzero(mask)
        img_array[idx[0], idx[1], :] *= 1.0 - alpha
        img_array[idx[0], idx[1], :] += alpha * color_mask

        if idx[0].size == 0:
            # Empty mask: there is no centre of mass and no extent to draw.
            continue

        if not draw_box:
            # `ndimage` was never imported in the original code path.
            from scipy import ndimage
            center_y, center_x = ndimage.center_of_mass(mask)
            label_text = "{}".format(catid2name[catid])
            vis_pos = (max(int(center_x) - 10, 0), int(center_y))
            cv2.putText(img_array, label_text, vis_pos,
                        cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255))
        else:
            # Columns / rows that contain at least one mask pixel.
            sum_x = np.sum(mask, axis=0)
            x = np.where(sum_x > 0.5)[0]
            sum_y = np.sum(mask, axis=1)
            y = np.where(sum_y > 0.5)[0]
            x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]
            box_color = tuple(color_mask.astype('int32').tolist())
            cv2.rectangle(img_array, (x0, y0), (x1, y1), box_color, 1)
            bbox_text = '%s %.2f' % (catid2name[catid], score)
            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
            # Filled background for the label, placed above the box.
            cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0],
                                                y0 - t_size[1] - 3),
                          box_color, -1)
            cv2.putText(
                img_array,
                bbox_text, (x0, y0 - 2),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.3, (0, 0, 0),
                1,
                lineType=cv2.LINE_AA)

    return Image.fromarray(img_array.astype('uint8'))
cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) + + canvas = img.copy() + for i in range(kpt_nums): + for j in range(len(skeletons)): + if skeletons[j][i, 2] < visual_thread: + continue + if ids is None: + color = colors[i] if color_set is None else colors[color_set[j] + % + len(colors)] + else: + color = get_color(ids[j]) + + cv2.circle( + canvas, + tuple(skeletons[j][i, 0:2].astype('int32')), + 2, + color, + thickness=-1) + + to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0) + fig = matplotlib.pyplot.gcf() + + stickwidth = 2 + + for i in range(NUM_EDGES): + for j in range(len(skeletons)): + edge = EDGES[i] + if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[ + 1], 2] < visual_thread: + continue + + cur_canvas = canvas.copy() + X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] + Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] + mX = np.mean(X) + mY = np.mean(Y) + length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), + (int(length / 2), stickwidth), + int(angle), 0, 360, 1) + if ids is None: + color = colors[i] if color_set is None else colors[color_set[j] + % + len(colors)] + else: + color = get_color(ids[j]) + cv2.fillConvexPoly(cur_canvas, polygon, color) + canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) + image = Image.fromarray(canvas.astype('uint8')) + plt.close() + return image + + +def draw_pose3d(image, + pose3d, + pose2d=None, + visual_thread=0.6, + save_name='pose3d.jpg', + returnimg=True): + try: + import matplotlib.pyplot as plt + import matplotlib + plt.switch_backend('agg') + except Exception as e: + logger.error('Matplotlib not found, please install matplotlib.' 
def draw_pose3d(image,
                pose3d,
                pose2d=None,
                visual_thread=0.6,
                save_name='pose3d.jpg',
                returnimg=True):
    """
    Render a 3D pose next to the input image with matplotlib.

    A 2x2 figure is built: the input image (plus the optional 2D pose) and
    three views of the 3D skeleton. The figure is rasterized into a PIL
    image and closed afterwards.

    Args:
        image: frame shown in the first subplot.
        pose3d: joint array indexed as `pose3d[joint, axis]`;
            `pose3d.shape[0]` must be 24 or 14.
        pose2d: optional 2D pose indexed as `pose2d[joint, k]`, where
            column 2 is used as a visibility flag.
        visual_thread: accepted for interface compatibility; not used here.
        save_name (str): output path used when `returnimg` is False.
        returnimg (bool): return the rendered image instead of saving it.

    Returns:
        PIL.Image.Image when `returnimg` is not False; None when the image
        is saved instead or when the joint count is not supported.
    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib
        plt.switch_backend('agg')
    except Exception as e:
        logger.error('Matplotlib not found, please install matplotlib.'
                     'for example: `pip install matplotlib`.')
        raise e

    # Each entry is [joint_a, joint_b, side]; `side` selects the line colour.
    if pose3d.shape[0] == 24:
        joints_connectivity_dict = [
            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1],
            [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0],
            [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1],
            [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0],
            [23, 21, 1]
        ]
    elif pose3d.shape[0] == 14:
        joints_connectivity_dict = [
            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0],
            [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1],
            [8, 12, 0], [9, 12, 1], [12, 13, 1]
        ]
    else:
        # The original referenced an undefined name (`pose`) here.
        print(
            "not defined joints number :{}, cannot visualize because unknown of joint connectivity".
            format(pose3d.shape[0]))
        return

    def draw3Dpose(pose3d,
                   ax,
                   lcolor="#3498db",
                   rcolor="#e74c3c",
                   add_labels=False):
        # pose3d = orthographic_projection(pose3d, cam)
        for i in joints_connectivity_dict:
            x, y, z = [
                np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3)
            ]
            ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor)

        RADIUS = 1000
        # Joint used as the centre of the plotted volume.
        center_xy = 2 if pose3d.shape[0] == 14 else 14
        x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy,
                                                                     2]
        ax.set_xlim3d([-RADIUS + x, RADIUS + x])
        ax.set_ylim3d([-RADIUS + y, RADIUS + y])
        ax.set_zlim3d([-RADIUS + z, RADIUS + z])

        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.set_zlabel("z")

    def draw2Dpose(pose2d,
                   ax,
                   lcolor="#3498db",
                   rcolor="#e74c3c",
                   add_labels=False):
        for i in joints_connectivity_dict:
            # Only connect joints that are both flagged in column 2.
            if pose2d[i[0], 2] and pose2d[i[1], 2]:
                x, y = [
                    np.array([pose2d[i[0], j], pose2d[i[1], j]])
                    for j in range(2)
                ]
                ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor)

    def draw_img_pose(pose3d,
                      pose2d=None,
                      frame=None,
                      figsize=(12, 12),
                      savepath=None):
        fig = plt.figure(figsize=figsize, dpi=80)
        # fig.clear()
        fig.tight_layout()

        ax = fig.add_subplot(221)
        if frame is not None:
            ax.imshow(frame, interpolation='nearest')
        if pose2d is not None:
            draw2Dpose(pose2d, ax)

        ax = fig.add_subplot(222, projection='3d')
        ax.view_init(45, 45)
        draw3Dpose(pose3d, ax)
        ax = fig.add_subplot(223, projection='3d')
        ax.view_init(0, 0)
        draw3Dpose(pose3d, ax)
        ax = fig.add_subplot(224, projection='3d')
        ax.view_init(0, 90)
        draw3Dpose(pose3d, ax)

        if savepath is not None:
            plt.savefig(savepath)
            plt.close()
        else:
            return fig

    def fig2data(fig):
        """
        Convert a Matplotlib figure to an RGB PIL image.

        @param fig a matplotlib figure
        @return a PIL image in RGB mode
        """
        # draw the renderer
        fig.canvas.draw()

        # Get the ARGB buffer from the figure. The buffer is laid out row
        # by row, i.e. (height, width, 4).
        w, h = fig.canvas.get_width_height()
        buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
        buf = buf.reshape(h, w, 4)

        # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode
        buf = np.roll(buf, 3, axis=2)
        image = Image.frombytes("RGBA", (w, h), buf.tobytes())
        return image.convert("RGB")

    fig = draw_img_pose(pose3d, pose2d, frame=image)
    try:
        data = fig2data(fig)
    finally:
        # Figures stay registered with pyplot until they are closed.
        plt.close(fig)
    if returnimg is False:
        data.save(save_name)
    else:
        return data
def _walk_voc_dir(devkit_dir, year, output_dir):
    """
    Collect (image path, annotation path) pairs for one VOC year.

    Every `<name>_trainval.txt` / `<name>_test.txt` file found under
    `ImageSets/Main` is read; the first whitespace-separated token of each
    non-blank line is taken as the image name. An image name is recorded
    only once for the whole year, in the list of the first file that
    mentions it. Paths are expressed relative to `output_dir`.

    Returns:
        (trainval_list, test_list): two lists of (img_path, ann_path).
    """
    filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main')
    annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations')
    img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages')
    trainval_list = []
    test_list = []
    added = set()

    # Loop-invariant: the relative prefixes do not depend on the file read.
    rel_annotation_dir = osp.relpath(annotation_dir, output_dir)
    rel_img_dir = osp.relpath(img_dir, output_dir)

    for root, _, files in os.walk(filelist_dir):
        for fname in files:
            if re.match(r'[a-z]+_trainval\.txt', fname):
                img_ann_list = trainval_list
            elif re.match(r'[a-z]+_test\.txt', fname):
                img_ann_list = test_list
            else:
                continue
            # Join with the directory actually being walked so files in
            # sub-directories resolve to an existing path.
            fpath = osp.join(root, fname)
            with open(fpath) as flist:
                for line in flist:
                    fields = line.split()
                    if not fields:
                        # Blank line (e.g. at the end of the file).
                        continue
                    name_prefix = fields[0]
                    if name_prefix in added:
                        continue
                    added.add(name_prefix)
                    ann_path = osp.join(rel_annotation_dir,
                                        name_prefix + '.xml')
                    img_path = osp.join(rel_img_dir, name_prefix + '.jpg')
                    img_ann_list.append((img_path, ann_path))

    return trainval_list, test_list
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import create, load_config, merge_config +from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config +from ppdet.utils.cli import ArgsParser, merge_args +from ppdet.engine import Trainer, init_parallel_env +from ppdet.metrics.coco_utils import json_eval_results + +from ppdet.utils.logger import setup_logger +logger = setup_logger('eval') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_eval", + default=None, + type=str, + help="Evaluation directory, default is current directory.") + + parser.add_argument( + '--json_eval', + action='store_true', + default=False, + help='Whether to re eval with already exists bbox.json or mask.json') + + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + + # TODO: bias should be unified + parser.add_argument( + "--bias", + action="store_true", + help="whether add bias or not while getting w and h") + + parser.add_argument( + "--classwise", + action="store_true", + help="whether per-category AP and draw P-R Curve or not.") + + parser.add_argument( + '--save_prediction_only', + action='store_true', + default=False, + help='Whether to save the evaluation results only') + + parser.add_argument( + "--amp", + action='store_true', + default=False, + help="Enable auto mixed precision eval.") + + # for smalldet slice_infer + parser.add_argument( + "--slice_infer", + action='store_true', + help="Whether to slice the image and merge the inference results for small object detection." 
def main():
    """Entry point: load the config, select the device and run evaluation."""
    FLAGS = parse_args()
    cfg = load_config(FLAGS.config)
    merge_args(cfg, FLAGS)
    merge_config(FLAGS.opt)

    # Every accelerator switch is disabled unless the config sets it.
    for switch in ('use_npu', 'use_xpu', 'use_gpu', 'use_mlu'):
        if switch not in cfg:
            setattr(cfg, switch, False)

    # The first enabled accelerator wins, in this priority order; fall back
    # to the CPU otherwise.
    device = 'cpu'
    for switch, device_name in (('use_gpu', 'gpu'), ('use_npu', 'npu'),
                                ('use_xpu', 'xpu'), ('use_mlu', 'mlu')):
        if getattr(cfg, switch):
            device = device_name
            break
    paddle.set_device(device)

    check_config(cfg)
    check_gpu(cfg.use_gpu)
    check_npu(cfg.use_npu)
    check_xpu(cfg.use_xpu)
    check_mlu(cfg.use_mlu)
    check_version()

    run(FLAGS, cfg)
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.utils.check import check_gpu, check_version, check_config +from ppdet.utils.cli import ArgsParser +from ppdet.engine import Trainer + +from ppdet.utils.logger import setup_logger +logger = setup_logger('export_model') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--output_dir", + type=str, + default="output_inference", + help="Directory for storing the output model files.") + parser.add_argument( + "--export_serving_model", + type=bool, + default=False, + help="Whether to export serving model or not.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + trainer = Trainer(cfg, mode='test') + # load weights + trainer.load_weights(cfg.weights) + + # export model + trainer.export(FLAGS.output_dir) + + if FLAGS.export_serving_model: + from paddle_serving_client.io import inference_model_to_serving + model_name = os.path.splitext(os.path.split(cfg.filename)[-1])[0] + + inference_model_to_serving( + dirname="{}/{}".format(FLAGS.output_dir, model_name), + serving_server="{}/{}/serving_server".format(FLAGS.output_dir, + model_name), + serving_client="{}/{}/serving_client".format(FLAGS.output_dir, + model_name), + model_filename="model.pdmodel", + params_filename="model.pdiparams") + + +def main(): + paddle.set_device("cpu") + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + merge_config(FLAGS.opt) + + # FIXME: Temporarily solve the priority problem of 
FLAGS.opt + merge_config(FLAGS.opt) + check_config(cfg) + if 'use_gpu' not in cfg: + cfg.use_gpu = False + check_gpu(cfg.use_gpu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/rtdetr_paddle/tools/infer.py b/rtdetr_paddle/tools/infer.py new file mode 100755 index 0000000..485c6fa --- /dev/null +++ b/rtdetr_paddle/tools/infer.py @@ -0,0 +1,228 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') +import glob +import ast + +import paddle +from ppdet.core.workspace import load_config, merge_config +from ppdet.engine import Trainer +from ppdet.utils.check import check_gpu, check_npu, check_xpu, check_mlu, check_version, check_config +from ppdet.utils.cli import ArgsParser, merge_args + +from ppdet.utils.logger import setup_logger +logger = setup_logger('train') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--infer_dir", + type=str, + default=None, + help="Directory for images to perform inference on.") + parser.add_argument( + "--infer_img", + type=str, + default=None, + help="Image path, has higher priority over --infer_dir") + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Directory for storing the output visualization files.") + parser.add_argument( + "--draw_threshold", + type=float, + default=0.5, + help="Threshold to reserve the result for visualization.") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + parser.add_argument( + "--use_vdl", + type=bool, + default=False, + help="Whether to record the data to VisualDL.") + parser.add_argument( + '--vdl_log_dir', + type=str, + default="vdl_log_dir/image", + help='VisualDL logging directory for image.') + parser.add_argument( + "--save_results", + type=bool, + default=False, + help="Whether to save inference results to output_dir.") + parser.add_argument( + "--slice_infer", + action='store_true', + help="Whether to slice the image and merge the inference results for small object detection." 
+ ) + parser.add_argument( + '--slice_size', + nargs='+', + type=int, + default=[640, 640], + help="Height of the sliced image.") + parser.add_argument( + "--overlap_ratio", + nargs='+', + type=float, + default=[0.25, 0.25], + help="Overlap height ratio of the sliced image.") + parser.add_argument( + "--combine_method", + type=str, + default='nms', + help="Combine method of the sliced images' detection results, choose in ['nms', 'nmm', 'concat']." + ) + parser.add_argument( + "--match_threshold", + type=float, + default=0.6, + help="Combine method matching threshold.") + parser.add_argument( + "--match_metric", + type=str, + default='ios', + help="Combine method matching metric, choose in ['iou', 'ios'].") + parser.add_argument( + "--visualize", + type=ast.literal_eval, + default=True, + help="Whether to save visualize results to output_dir.") + args = parser.parse_args() + return args + + +def get_test_images(infer_dir, infer_img): + """ + Get image path list in TEST mode + """ + assert infer_img is not None or infer_dir is not None, \ + "--infer_img or --infer_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), \ + "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), \ + "{} is not a directory".format(infer_dir) + + # infer_img has a higher priority + if infer_img and os.path.isfile(infer_img): + return [infer_img] + + images = set() + infer_dir = os.path.abspath(infer_dir) + assert os.path.isdir(infer_dir), \ + "infer_dir {} is not a directory".format(infer_dir) + exts = ['jpg', 'jpeg', 'png', 'bmp'] + exts += [ext.upper() for ext in exts] + for ext in exts: + images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) + images = list(images) + + assert len(images) > 0, "no image found in {}".format(infer_dir) + logger.info("Found {} inference images in total.".format(len(images))) + + return images + + +def run(FLAGS, cfg): + trainer = Trainer(cfg, mode='test') + trainer.load_weights(cfg.weights) + 
# get inference images + images = get_test_images(FLAGS.infer_dir, FLAGS.infer_img) + + # inference + if FLAGS.slice_infer: + trainer.slice_predict( + images, + slice_size=FLAGS.slice_size, + overlap_ratio=FLAGS.overlap_ratio, + combine_method=FLAGS.combine_method, + match_threshold=FLAGS.match_threshold, + match_metric=FLAGS.match_metric, + draw_threshold=FLAGS.draw_threshold, + output_dir=FLAGS.output_dir, + save_results=FLAGS.save_results, + visualize=FLAGS.visualize) + else: + trainer.predict( + images, + draw_threshold=FLAGS.draw_threshold, + output_dir=FLAGS.output_dir, + save_results=FLAGS.save_results, + visualize=FLAGS.visualize) + + +def main(): + FLAGS = parse_args() + cfg = load_config(FLAGS.config) + merge_args(cfg, FLAGS) + merge_config(FLAGS.opt) + + # disable npu in config by default + if 'use_npu' not in cfg: + cfg.use_npu = False + + # disable xpu in config by default + if 'use_xpu' not in cfg: + cfg.use_xpu = False + + if 'use_gpu' not in cfg: + cfg.use_gpu = False + + # disable mlu in config by default + if 'use_mlu' not in cfg: + cfg.use_mlu = False + + if cfg.use_gpu: + place = paddle.set_device('gpu') + elif cfg.use_npu: + place = paddle.set_device('npu') + elif cfg.use_xpu: + place = paddle.set_device('xpu') + elif cfg.use_mlu: + place = paddle.set_device('mlu') + else: + place = paddle.set_device('cpu') + + check_config(cfg) + check_gpu(cfg.use_gpu) + check_npu(cfg.use_npu) + check_xpu(cfg.use_xpu) + check_mlu(cfg.use_mlu) + check_version() + + run(FLAGS, cfg) + + +if __name__ == '__main__': + main() diff --git a/rtdetr_paddle/tools/slice_image.py b/rtdetr_paddle/tools/slice_image.py new file mode 100644 index 0000000..f739d74 --- /dev/null +++ b/rtdetr_paddle/tools/slice_image.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from tqdm import tqdm


def slice_data(image_dir, dataset_json_path, output_dir, slice_size,
               overlap_ratio):
    """Slice a COCO dataset into tiles using sahi.

    The arguments are forwarded unchanged to ``sahi.scripts.slice_coco.slice``.

    Raises:
        RuntimeError: if sahi cannot be imported; the underlying error is
            chained as ``__cause__`` so the real reason stays visible.
    """
    try:
        # aliased so the builtin ``slice`` is not shadowed
        from sahi.scripts.slice_coco import slice as sahi_slice
    except Exception as e:
        raise RuntimeError(
            'Unable to use sahi to slice images, please install sahi, for example: `pip install sahi`, see https://github.com/obss/sahi'
        ) from e
    tqdm.write(
        f" slicing for slice_size={slice_size}, overlap_ratio={overlap_ratio}")
    sahi_slice(
        image_dir=image_dir,
        dataset_json_path=dataset_json_path,
        output_dir=output_dir,
        slice_size=slice_size,
        overlap_ratio=overlap_ratio, )


def main():
    """Script entry point: parse the options and slice the dataset."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_dir', type=str, default=None, help="The image folder path.")
    parser.add_argument(
        '--json_path', type=str, default=None, help="Dataset json path.")
    parser.add_argument(
        '--output_dir', type=str, default=None, help="Output dir.")
    parser.add_argument(
        '--slice_size', type=int, default=500, help="slice_size")
    parser.add_argument(
        '--overlap_ratio', type=float, default=0.25, help="overlap_ratio")
    args = parser.parse_args()

    slice_data(args.image_dir, args.json_path, args.output_dir, args.slice_size,
               args.overlap_ratio)


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Patch metadata and start of the license header of the next file in this
# dump, preserved verbatim:
# diff --git a/rtdetr_paddle/tools/train.py b/rtdetr_paddle/tools/train.py
# new file mode 100755
# index 0000000..954b4ec
# --- /dev/null
# +++ b/rtdetr_paddle/tools/train.py
# @@ -0,0 +1,183 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +# add python path of PaddleDetection to sys.path +parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 2))) +sys.path.insert(0, parent_path) + +# ignore warning log +import warnings +warnings.filterwarnings('ignore') + +import paddle + +from ppdet.core.workspace import load_config, merge_config +from ppdet.engine import Trainer, init_parallel_env, set_random_seed, init_fleet_env +from ppdet.utils.cli import ArgsParser, merge_args +import ppdet.utils.check as check +from ppdet.utils.logger import setup_logger +logger = setup_logger('train') + + +def parse_args(): + parser = ArgsParser() + parser.add_argument( + "--eval", + action='store_true', + default=False, + help="Whether to perform evaluation in train") + parser.add_argument( + "-r", "--resume", default=None, help="weights path for resume") + parser.add_argument( + "--slim_config", + default=None, + type=str, + help="Configuration file of slim method.") + parser.add_argument( + "--enable_ce", + type=bool, + default=False, + help="If set True, enable continuous evaluation job." 
+ "This flag is only used for internal test.") + parser.add_argument( + "--amp", + action='store_true', + default=False, + help="Enable auto mixed precision training.") + parser.add_argument( + "--fleet", action='store_true', default=False, help="Use fleet or not") + parser.add_argument( + "--use_vdl", + type=bool, + default=False, + help="whether to record the data to VisualDL.") + parser.add_argument( + '--vdl_log_dir', + type=str, + default="vdl_log_dir/scalar", + help='VisualDL logging directory for scalar.') + parser.add_argument( + "--use_wandb", + type=bool, + default=False, + help="whether to record the data to wandb.") + parser.add_argument( + '--save_prediction_only', + action='store_true', + default=False, + help='Whether to save the evaluation results only') + parser.add_argument( + '--profiler_options', + type=str, + default=None, + help="The option of profiler, which should be in " + "format \"key1=value1;key2=value2;key3=value3\"." + "please see ppdet/utils/profiler.py for detail.") + parser.add_argument( + '--save_proposals', + action='store_true', + default=False, + help='Whether to save the train proposals') + parser.add_argument( + '--proposals_path', + type=str, + default="sniper/proposals.json", + help='Train proposals directory') + parser.add_argument( + "--to_static", + action='store_true', + default=False, + help="Enable dy2st to train.") + + args = parser.parse_args() + return args + + +def run(FLAGS, cfg): + # init fleet environment + if cfg.fleet: + init_fleet_env(cfg.get('find_unused_parameters', False)) + else: + # init parallel environment if nranks > 1 + init_parallel_env() + + if FLAGS.enable_ce: + set_random_seed(0) + + # build trainer + trainer = Trainer(cfg, mode='train') + + # load weights + if FLAGS.resume is not None: + trainer.resume_weights(FLAGS.resume) + elif 'pretrain_weights' in cfg and cfg.pretrain_weights: + trainer.load_weights(cfg.pretrain_weights) + + # training + trainer.train(FLAGS.eval) + + +def main(): + FLAGS = 
parse_args() + cfg = load_config(FLAGS.config) + merge_args(cfg, FLAGS) + merge_config(FLAGS.opt) + + # disable npu in config by default + if 'use_npu' not in cfg: + cfg.use_npu = False + + # disable xpu in config by default + if 'use_xpu' not in cfg: + cfg.use_xpu = False + + if 'use_gpu' not in cfg: + cfg.use_gpu = False + + # disable mlu in config by default + if 'use_mlu' not in cfg: + cfg.use_mlu = False + + if cfg.use_gpu: + place = paddle.set_device('gpu') + elif cfg.use_npu: + place = paddle.set_device('npu') + elif cfg.use_xpu: + place = paddle.set_device('xpu') + elif cfg.use_mlu: + place = paddle.set_device('mlu') + else: + place = paddle.set_device('cpu') + + # FIXME: Temporarily solve the priority problem of FLAGS.opt + merge_config(FLAGS.opt) + check.check_config(cfg) + check.check_gpu(cfg.use_gpu) + check.check_npu(cfg.use_npu) + check.check_xpu(cfg.use_xpu) + check.check_mlu(cfg.use_mlu) + check.check_version() + + run(FLAGS, cfg) + + +if __name__ == "__main__": + main() diff --git a/rtdetr_paddle/tools/x2coco.py b/rtdetr_paddle/tools/x2coco.py new file mode 100644 index 0000000..78e8619 --- /dev/null +++ b/rtdetr_paddle/tools/x2coco.py @@ -0,0 +1,542 @@ +#!/usr/bin/env python +# coding: utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import argparse
import glob
import json
import os
import os.path as osp
import shutil
import xml.etree.ElementTree as ET

import numpy as np

# Pillow, tqdm and OpenCV are each used by a single conversion path
# (polygon boxes, VOC progress bar, WIDER FACE image sizes). They are
# optional at import time so that a missing package does not block the
# conversions that never touch it.
try:
    # get_bbox uses both PIL.Image and PIL.ImageDraw; import both explicitly
    # instead of relying on ImageDraw importing Image as a side effect.
    import PIL.Image
    import PIL.ImageDraw
except ImportError:
    PIL = None
try:
    from tqdm import tqdm
except ImportError:
    tqdm = None
try:
    import cv2
except ImportError:
    cv2 = None

# Category registry shared by every deal_json call of one run, so that the
# train/val/test splits get the same category ids.
label_to_num = {}
categories_list = []
labels_list = []

# Image file extensions picked up by deal_json (matched case-sensitively).
_IMAGE_EXTS = ('bmp', 'jpg', 'jpeg', 'png', 'JPEG', 'JPG', 'PNG')
_DATASET_TYPES = ('voc', 'labelme', 'cityscape', 'widerface')


class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes numpy scalars and arrays."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)


def images_labelme(data, num):
    """Build the COCO image record for a labelme annotation dict.

    ``num`` is the zero-based image index; the COCO id is ``num + 1``. Only
    the base name of ``imagePath`` is kept, for both path separators.
    """
    image = {}
    image['height'] = data['imageHeight']
    image['width'] = data['imageWidth']
    image['id'] = num + 1
    if '\\' in data['imagePath']:
        image['file_name'] = data['imagePath'].split('\\')[-1]
    else:
        image['file_name'] = data['imagePath'].split('/')[-1]
    return image


def images_cityscape(data, num, img_file):
    """Build the COCO image record for a cityscape annotation dict."""
    image = {}
    image['height'] = data['imgHeight']
    image['width'] = data['imgWidth']
    image['id'] = num + 1
    image['file_name'] = img_file
    return image


def categories(label, labels_list):
    """Build the COCO category record for a label not yet in ``labels_list``.

    The id is one more than the number of labels registered so far.
    """
    category = {}
    category['supercategory'] = 'component'
    category['id'] = len(labels_list) + 1
    category['name'] = label
    return category


def annotations_rectangle(points, label, image_num, object_num, label_to_num):
    """Build the COCO annotation for a rectangle.

    ``points`` is ``[[x1, y1], [x2, y2], [x1, y2], [x2, y1]]`` with
    ``x1 <= x2`` and ``y1 <= y2``, as produced by deal_json.
    """
    annotation = {}
    # swap rows 1 and 2 so the four corners are listed in contour order
    seg_points = np.asarray(points).copy()
    seg_points[1, :] = np.asarray(points)[2, :]
    seg_points[2, :] = np.asarray(points)[1, :]
    annotation['segmentation'] = [list(seg_points.flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(
        map(float, [
            points[0][0], points[0][1], points[1][0] - points[0][0],
            points[1][1] - points[0][1]
        ]))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def annotations_polygon(height, width, points, label, image_num, object_num,
                        label_to_num):
    """Build the COCO annotation for a polygon inside a height x width image."""
    annotation = {}
    annotation['segmentation'] = [list(np.asarray(points).flatten())]
    annotation['iscrowd'] = 0
    annotation['image_id'] = image_num + 1
    annotation['bbox'] = list(map(float, get_bbox(height, width, points)))
    annotation['area'] = annotation['bbox'][2] * annotation['bbox'][3]
    annotation['category_id'] = label_to_num[label]
    annotation['id'] = object_num + 1
    return annotation


def get_bbox(height, width, points):
    """Return ``[x, y, w, h]`` of the pixels covered by a polygon.

    The polygon is rasterized onto a height x width mask; the box spans the
    minimum and maximum covered column/row.

    Raises:
        RuntimeError: if Pillow is not installed.
        ValueError: if the polygon covers no pixel of the image.
    """
    if PIL is None:
        raise RuntimeError(
            'Pillow is required to convert polygon annotations, '
            'please install it, for example: `pip install Pillow`')
    mask = PIL.Image.fromarray(np.zeros([height, width], dtype=np.uint8))
    xy = list(map(tuple, points))
    PIL.ImageDraw.Draw(mask).polygon(xy=xy, outline=1, fill=1)
    rows, cols = np.nonzero(np.array(mask, dtype=bool))
    if rows.size == 0:
        raise ValueError(
            'polygon {} covers no pixel of a {}x{} image'.format(
                points, width, height))
    left_top_r = np.min(rows)
    left_top_c = np.min(cols)
    right_bottom_r = np.max(rows)
    right_bottom_c = np.max(cols)
    return [
        left_top_c, left_top_r, right_bottom_c - left_top_c,
        right_bottom_r - left_top_r
    ]


def _register_label(label):
    """Add ``label`` to the shared category registry if it is new."""
    if label not in labels_list:
        categories_list.append(categories(label, labels_list))
        labels_list.append(label)
        label_to_num[label] = len(labels_list)


def deal_json(ds_type, img_path, json_path):
    """Convert the labelme/cityscape annotations of one split to a COCO dict.

    For every image in ``img_path`` (visited in sorted order so ids are
    reproducible) the json file with the same stem is read from
    ``json_path``. Categories accumulate in the module-level registry.
    """
    images_list = []
    annotations_list = []
    image_num = -1
    object_num = -1
    for img_file in sorted(os.listdir(img_path)):
        if img_file.split('.')[-1] not in _IMAGE_EXTS:
            continue
        label_file = osp.join(json_path, osp.splitext(img_file)[0] + '.json')
        print('Generating dataset from:', label_file)
        image_num = image_num + 1
        with open(label_file) as f:
            data = json.load(f)
        if ds_type == 'labelme':
            images_list.append(images_labelme(data, image_num))
            for shapes in data['shapes']:
                # ids advance for every shape, including skipped shape types
                object_num = object_num + 1
                label = shapes['label']
                _register_label(label)
                p_type = shapes['shape_type']
                if p_type == 'polygon':
                    annotations_list.append(
                        annotations_polygon(data['imageHeight'], data[
                            'imageWidth'], shapes['points'], label, image_num,
                                            object_num, label_to_num))
                elif p_type == 'rectangle':
                    # normalize the two corners, whatever order they came in
                    (x1, y1), (x2, y2) = shapes['points']
                    x1, x2 = sorted([x1, x2])
                    y1, y2 = sorted([y1, y2])
                    points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]]
                    annotations_list.append(
                        annotations_rectangle(points, label, image_num,
                                              object_num, label_to_num))
        elif ds_type == 'cityscape':
            images_list.append(images_cityscape(data, image_num, img_file))
            for shapes in data['objects']:
                object_num = object_num + 1
                label = shapes['label']
                _register_label(label)
                annotations_list.append(
                    annotations_polygon(data['imgHeight'], data[
                        'imgWidth'], shapes['polygon'], label, image_num,
                                        object_num, label_to_num))
    data_coco = {}
    data_coco['images'] = images_list
    data_coco['categories'] = categories_list
    data_coco['annotations'] = annotations_list
    return data_coco


def voc_get_label_anno(ann_dir_path, ann_ids_path, labels_path):
    """Read the VOC label list and the annotation id list.

    Returns:
        (label2id, ann_paths): labels mapped to 1-based ids in file order,
        and the xml path of every non-blank line of ``ann_ids_path`` (the
        last space-separated field of the line is the annotation id).
    """
    with open(labels_path, 'r') as f:
        labels_str = f.read().split()
    labels_ids = list(range(1, len(labels_str) + 1))

    with open(ann_ids_path, 'r') as f:
        # blank lines used to become the bogus path '<ann_dir>/.xml'
        ann_ids = [lin.strip().split(' ')[-1] for lin in f if lin.strip()]

    ann_paths = []
    for aid in ann_ids:
        if aid.endswith('xml'):
            ann_path = os.path.join(ann_dir_path, aid)
        else:
            ann_path = os.path.join(ann_dir_path, aid + '.xml')
        ann_paths.append(ann_path)

    return dict(zip(labels_str, labels_ids)), ann_paths


def voc_get_image_info(annotation_root, im_id):
    """Build the COCO image record from the root of a VOC annotation xml."""
    filename = annotation_root.findtext('filename')
    assert filename is not None

    size = annotation_root.find('size')
    width = float(size.findtext('width'))
    height = float(size.findtext('height'))

    image_info = {
        'file_name': filename,
        'height': height,
        'width': width,
        'id': im_id
    }
    return image_info


def voc_get_coco_annotation(obj, label2id):
    """Build the COCO annotation (without ids) for one VOC ``object`` element.

    Raises:
        AssertionError: if the label is unknown or the box is degenerate.
    """
    label = obj.findtext('name')
    assert label in label2id, "label is not in label2id."
    category_id = label2id[label]
    bndbox = obj.find('bndbox')
    xmin = float(bndbox.findtext('xmin'))
    ymin = float(bndbox.findtext('ymin'))
    xmax = float(bndbox.findtext('xmax'))
    ymax = float(bndbox.findtext('ymax'))
    assert xmax > xmin and ymax > ymin, "Box size error."
    o_width = xmax - xmin
    o_height = ymax - ymin
    anno = {
        'area': o_width * o_height,
        'iscrowd': 0,
        'bbox': [xmin, ymin, o_width, o_height],
        'category_id': category_id,
        'ignore': 0,
    }
    return anno


def voc_xmls_to_cocojson(annotation_paths, label2id, output_dir, output_file):
    """Convert VOC xml files to one COCO json at ``output_dir/output_file``.

    Image ids start at 0 and annotation ids at 1.
    """
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": []
    }
    bnd_id = 1  # bounding box start id
    im_id = 0
    print('Start converting !')
    # the progress bar is cosmetic; iterate plainly when tqdm is missing
    paths = tqdm(annotation_paths) if tqdm is not None else annotation_paths
    for a_path in paths:
        # Read annotation xml
        ann_tree = ET.parse(a_path)
        ann_root = ann_tree.getroot()

        img_info = voc_get_image_info(ann_root, im_id)
        output_json_dict['images'].append(img_info)

        for obj in ann_root.findall('object'):
            ann = voc_get_coco_annotation(obj=obj, label2id=label2id)
            ann.update({'image_id': im_id, 'id': bnd_id})
            output_json_dict['annotations'].append(ann)
            bnd_id = bnd_id + 1
        im_id += 1

    for label, label_id in label2id.items():
        category_info = {'supercategory': 'none', 'id': label_id, 'name': label}
        output_json_dict['categories'].append(category_info)
    output_file = os.path.join(output_dir, output_file)
    with open(output_file, 'w') as f:
        json.dump(output_json_dict, f)


def widerface_to_cocojson(root_path):
    """Convert the WIDER FACE train and val splits under ``root_path``.

    Writes ``widerface_train.json`` and ``widerface_val.json`` into
    ``root_path``.

    Raises:
        FileNotFoundError: if an annotation file or image directory is
            missing. The original asserted on the path strings themselves,
            which could never fail.
    """
    train_gt_txt = os.path.join(root_path, "wider_face_split", "wider_face_train_bbx_gt.txt")
    val_gt_txt = os.path.join(root_path, "wider_face_split", "wider_face_val_bbx_gt.txt")
    train_img_dir = os.path.join(root_path, "WIDER_train", "images")
    val_img_dir = os.path.join(root_path, "WIDER_val", "images")
    for path in (train_gt_txt, val_gt_txt, train_img_dir, val_img_dir):
        if not os.path.exists(path):
            raise FileNotFoundError(
                'Wider Face dataset path does not exist: {}'.format(path))
    save_path = os.path.join(root_path, "widerface_train.json")
    widerface_convert(train_gt_txt, train_img_dir, save_path)
    print("Wider Face train dataset converts success, the json path: {}".format(save_path))
    save_path = os.path.join(root_path, "widerface_val.json")
    widerface_convert(val_gt_txt, val_img_dir, save_path)
    print("Wider Face val dataset converts success, the json path: {}".format(save_path))


def widerface_convert(gt_txt, img_dir, save_path):
    """Convert one WIDER FACE ground-truth txt file to a COCO json file.

    The txt file is a sequence of records: image path, number of boxes,
    then one line per box. Images that are missing or unreadable are
    reported and skipped, but still consume an image id.
    """
    output_json_dict = {
        "images": [],
        "type": "instances",
        "annotations": [],
        "categories": [{'supercategory': 'none', 'id': 0, 'name': "human_face"}]
    }
    bnd_id = 1  # bounding box start id
    im_id = 0
    print('Start converting !')
    with open(gt_txt) as fd:
        lines = fd.readlines()

    i = 0
    while i < len(lines):
        image_name = lines[i].strip()
        if not image_name:
            # a stray blank line (typically at the end of the file) used to
            # raise IndexError when reading the box count after it
            i += 1
            continue
        bbox_num = int(lines[i + 1].strip())
        i += 2
        img_info = get_widerface_image_info(img_dir, image_name, im_id)
        if img_info:
            output_json_dict["images"].append(img_info)
            for j in range(i, i + bbox_num):
                anno = get_widerface_ann_info(lines[j])
                anno.update({'image_id': im_id, 'id': bnd_id})
                output_json_dict['annotations'].append(anno)
                bnd_id += 1
        else:
            print("The image does not exist or cannot be read: {}".format(
                os.path.join(img_dir, image_name)))
        # an image without faces is still followed by one placeholder line
        bbox_num = 1 if bbox_num == 0 else bbox_num
        i += bbox_num
        im_id += 1
    with open(save_path, 'w') as f:
        json.dump(output_json_dict, f)


def get_widerface_image_info(img_root, img_relative_path, img_id):
    """Build the COCO image record for one WIDER FACE image.

    Returns an empty dict when the file is missing or cannot be decoded.
    ``file_name`` is relative to the dataset root, e.g.
    ``WIDER_train/images/<event>/<name>.jpg``.

    Raises:
        RuntimeError: if the file exists but OpenCV is not installed.
    """
    image_info = {}
    save_path = os.path.join(img_root, img_relative_path)
    if not os.path.exists(save_path):
        return image_info
    if cv2 is None:
        raise RuntimeError(
            'OpenCV is required to convert the widerface dataset, please '
            'install it, for example: `pip install opencv-python`')
    img = cv2.imread(save_path)
    if img is None:
        # imread signals an undecodable file with None instead of raising
        return image_info
    image_info["file_name"] = os.path.join(
        os.path.basename(os.path.dirname(img_root)),
        os.path.basename(img_root), img_relative_path)
    image_info["height"] = img.shape[0]
    image_info["width"] = img.shape[1]
    image_info["id"] = img_id
    return image_info


def get_widerface_ann_info(info):
    """Parse one WIDER FACE box line (ten integers) into a COCO annotation."""
    info = [int(x) for x in info.strip().split()]
    anno = {
        'area': info[2] * info[3],
        'iscrowd': 0,
        'bbox': [info[0], info[1], info[2], info[3]],
        'category_id': 0,
        'ignore': 0,
        'blur': info[4],
        'expression': info[5],
        'illumination': info[6],
        'invalid': info[7],
        'occlusion': info[8],
        'pose': info[9]
    }
    return anno


def _convert_json_dataset(parser, args):
    """Split a labelme/cityscape dataset and write one COCO json per split.

    Images are assigned to train/val/test by their position in the sorted
    listing of ``--image_input_dir``; the split sizes derive from the number
    of json files in ``--json_input_dir``.
    """
    if not (args.json_input_dir and os.path.exists(args.json_input_dir)):
        parser.error('The json folder does not exist!')
    if not (args.image_input_dir and os.path.exists(args.image_input_dir)):
        parser.error('The image folder does not exist!')
    total_proportion = (args.train_proportion + args.val_proportion +
                        args.test_proportion)
    if abs(total_proportion - 1.0) >= 1e-5:
        parser.error(
            'The sum of proportion of training, validation and test dataset must be 1!'
        )

    split_dirs = {
        name: osp.join(args.output_dir, name)
        for name in ('train', 'val', 'test')
    }
    proportions = {
        'train': args.train_proportion,
        'val': args.val_proportion,
        'test': args.test_proportion,
    }

    # Allocate the dataset.
    total_num = len(glob.glob(osp.join(args.json_input_dir, '*.json')))
    train_num = int(total_num * args.train_proportion)
    val_num = int(total_num * args.val_proportion)
    for name, proportion in proportions.items():
        if proportion != 0:
            os.makedirs(split_dirs[name], exist_ok=True)

    images = sorted(os.listdir(args.image_input_dir))
    for count, img_name in enumerate(images, start=1):
        if count <= train_num:
            dst_dir = split_dirs['train']
        elif count <= train_num + val_num:
            dst_dir = split_dirs['val']
        else:
            dst_dir = split_dirs['test']
        # a split directory only exists if its proportion is non-zero (or
        # it was already there); otherwise the image is left out
        if osp.exists(dst_dir):
            shutil.copyfile(
                osp.join(args.image_input_dir, img_name),
                osp.join(dst_dir, img_name))

    # Deal with the json files.
    anno_dir = osp.join(args.output_dir, 'annotations')
    os.makedirs(anno_dir, exist_ok=True)
    for name in ('train', 'val', 'test'):
        if proportions[name] == 0:
            continue
        data_coco = deal_json(args.dataset_type, split_dirs[name],
                              args.json_input_dir)
        json_path = osp.join(anno_dir, 'instance_{}.json'.format(name))
        with open(json_path, 'w') as f:
            json.dump(data_coco, f, indent=4, cls=MyEncoder)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.

    Invalid arguments are reported through ``parser.error``, which prints
    the message to stderr and exits with status 2. The original called
    ``os._exit(0)`` and so reported success to the calling shell.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset_type',
        help='the type of dataset, can be `voc`, `widerface`, `labelme` or `cityscape`')
    parser.add_argument('--json_input_dir', help='input annotated directory')
    parser.add_argument('--image_input_dir', help='image directory')
    parser.add_argument(
        '--output_dir', help='output dataset directory', default='./')
    parser.add_argument(
        '--train_proportion',
        help='the proportion of train dataset',
        type=float,
        default=1.0)
    parser.add_argument(
        '--val_proportion',
        help='the proportion of validation dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--test_proportion',
        help='the proportion of test dataset',
        type=float,
        default=0.0)
    parser.add_argument(
        '--voc_anno_dir',
        help='In Voc format dataset, path to annotation files directory.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_anno_list',
        help='In Voc format dataset, path to annotation files ids list.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_label_list',
        help='In Voc format dataset, path to label list. The content of each line is a category.',
        type=str,
        default=None)
    parser.add_argument(
        '--voc_out_name',
        type=str,
        default='voc.json',
        help='In Voc format dataset, path to output json file')
    parser.add_argument(
        '--widerface_root_dir',
        help='The root_path for wider face dataset, which contains `wider_face_split`, `WIDER_train` and `WIDER_val`.And the json file will save in this path',
        type=str,
        default=None)
    args = parser.parse_args(argv)
    if args.dataset_type not in _DATASET_TYPES:
        parser.error(
            'Now only support the voc, widerface, labelme and cityscape datasets!')

    if args.dataset_type == 'voc':
        if not (args.voc_anno_dir and args.voc_anno_list and
                args.voc_label_list):
            parser.error(
                '--voc_anno_dir, --voc_anno_list and --voc_label_list are '
                'required for the voc dataset type')
        label2id, ann_paths = voc_get_label_anno(
            args.voc_anno_dir, args.voc_anno_list, args.voc_label_list)
        voc_xmls_to_cocojson(
            annotation_paths=ann_paths,
            label2id=label2id,
            output_dir=args.output_dir,
            output_file=args.voc_out_name)
    elif args.dataset_type == "widerface":
        if not args.widerface_root_dir:
            parser.error(
                '--widerface_root_dir is required for the widerface dataset type')
        widerface_to_cocojson(args.widerface_root_dir)
    else:
        _convert_json_dataset(parser, args)


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Patch metadata and first line of the next file in this dump, preserved
# verbatim:
# diff --git a/rtdetr_pytorch/README.md b/rtdetr_pytorch/README.md
# new file mode 100644
# index 0000000..e16c898
# --- /dev/null
# +++ b/rtdetr_pytorch/README.md
# @@ -0,0 +1,111 @@
# ## TODO
+ see details + +- [x] Training +- [x] Evaluation +- [x] Export ONNX +- [x] Upload source code +- [x] Upload weights converted from Paddle, see [*links*](https://github.com/lyuwenyu/RT-DETR/issues/42) +- [x] Align training details with the [*paddle version*](../rtdetr_paddle/) +- [x] Fine-tune RT-DETR based on [*pretrained weights*](https://github.com/lyuwenyu/RT-DETR/issues/42) + +
+ + +## Model Zoo + +| Model | Dataset | Input Size | APval | AP50val | #Params(M) | FPS | checkpoint | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | +rtdetr_r18vd | COCO | 640 | 46.4 | 63.7 | 20 | 217 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth) +rtdetr_r34vd | COCO | 640 | 48.9 | 66.8 | 31 | 161 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth) +rtdetr_r50vd_m | COCO | 640 | 51.3 | 69.5 | 36 | 145 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth) +rtdetr_r50vd | COCO | 640 | 53.1 | 71.2| 42 | 108 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth) +rtdetr_r101vd | COCO | 640 | 54.3 | 72.8 | 76 | 74 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth) +rtdetr_18vd | COCO+Objects365 | 640 | 49.0 | 66.5 | 20 | 217 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth) +rtdetr_r50vd | COCO+Objects365 | 640 | 55.2 | 73.4 | 42 | 108 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_2x_coco_objects365_from_paddle.pth) +rtdetr_r101vd | COCO+Objects365 | 640 | 56.2 | 74.5 | 76 | 74 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_2x_coco_objects365_from_paddle.pth) +rtdetr_regnet | COCO | 640 | 51.6 | 69.6 | 38 | 67 | [url*](https://drive.google.com/file/d/1K2EXJgnaEUJcZCLULHrZ492EF4PdgVp9/view?usp=sharing) +rtdetr_dla34 | COCO | 640 | 49.6 | 67.4 | 34 | 83 | [url*](https://drive.google.com/file/d/1_rVpl-jIelwy2LDT3E4vdM4KCLBcOtzZ/view?usp=sharing) + +Notes +- `COCO + Objects365` in the table means finetuned model on `COCO` using pretrained weights trained on `Objects365`. 
+- `url``*` is the URL of pretrained weights converted from the Paddle model to save energy. *There may be slight differences between this table and the paper.* + + +## Quick start + +
+Install + +```bash +pip install -r requirements.txt +``` + +
+ + +
+Data + +- Download and extract COCO 2017 train and val images. +``` +path/to/coco/ + annotations/ # annotation json files + train2017/ # train images + val2017/ # val images +``` +- Modify config [`img_folder`, `ann_file`](configs/dataset/coco_detection.yml) +
+ + + +
+Training & Evaluation + +- Training on a Single GPU: + +```shell +# training on single-gpu +export CUDA_VISIBLE_DEVICES=0 +python tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml +``` + +- Training on Multiple GPUs: + +```shell +# train on multi-gpu +export CUDA_VISIBLE_DEVICES=0,1,2,3 +torchrun --nproc_per_node=4 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml +``` + +- Evaluation on Multiple GPUs: + +```shell +# val on multi-gpu +export CUDA_VISIBLE_DEVICES=0,1,2,3 +torchrun --nproc_per_node=4 tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml -r path/to/checkpoint --test-only +``` + +
+ + + +
+Export + +```shell +python tools/export_onnx.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -r path/to/checkpoint --check +``` +
+ + + + +
+Train custom data + +1. Set `remap_mscoco_category: False`. This variable only applies to the MS-COCO dataset. If you want to use the `remap_mscoco_category` logic on your own dataset, please modify the variable [`mscoco_category2name`](https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetr_pytorch/src/data/coco/coco_dataset.py#L154) to match your dataset. + +2. Add `-t path/to/checkpoint` (optional) to fine-tune RT-DETR from a pretrained checkpoint. See [training script details](./tools/README.md). +
diff --git a/rtdetr_pytorch/configs/dataset/coco_detection.yml b/rtdetr_pytorch/configs/dataset/coco_detection.yml new file mode 100644 index 0000000..f71a4ef --- /dev/null +++ b/rtdetr_pytorch/configs/dataset/coco_detection.yml @@ -0,0 +1,34 @@ +task: detection + +num_classes: 80 +remap_mscoco_category: True + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: ./dataset/coco/train2017/ + ann_file: ./dataset/coco/annotations/instances_train2017.json + transforms: + type: Compose + ops: ~ + shuffle: True + batch_size: 8 + num_workers: 4 + drop_last: True + + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: ./dataset/coco/val2017/ + ann_file: ./dataset/coco/annotations/instances_val2017.json + transforms: + type: Compose + ops: ~ + + shuffle: False + batch_size: 8 + num_workers: 4 + drop_last: False \ No newline at end of file diff --git a/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml new file mode 100644 index 0000000..e3e6bc1 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml @@ -0,0 +1,39 @@ +# num_classes: 91 +# remap_mscoco_category: True + +train_dataloader: + dataset: + return_masks: False + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBox, min_size: 1} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + # - {type: Resize, size: 639, max_size: 640} + # - {type: PadToSize, spatial_size: 640} + - {type: ToImageTensor} + - {type: ConvertDtype} + - {type: SanitizeBoundingBox, min_size: 1} + - {type: ConvertBox, out_fmt: 'cxcywh', normalize: True} + shuffle: True + batch_size: 4 + num_workers: 4 + collate_fn: default_collate_fn + + +val_dataloader: + dataset: + transforms: + ops: + # - {type: Resize, size: 639, max_size: 640} + # - {type: PadToSize, spatial_size: 640} + - {type: 
Resize, size: [640, 640]} + - {type: ToImageTensor} + - {type: ConvertDtype} + shuffle: False + batch_size: 8 + num_workers: 4 + collate_fn: default_collate_fn diff --git a/rtdetr_pytorch/configs/rtdetr/include/dataloader_regnet.yml b/rtdetr_pytorch/configs/rtdetr/include/dataloader_regnet.yml new file mode 100644 index 0000000..ba0607b --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/dataloader_regnet.yml @@ -0,0 +1,39 @@ +# num_classes: 91 +# remap_mscoco_category: True + +train_dataloader: + dataset: + return_masks: False + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBox, min_size: 1} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + # - {type: Resize, size: 639, max_size: 640} + # - {type: PadToSize, spatial_size: 640} + - {type: ToImageTensor} + - {type: ConvertDtype} + - {type: SanitizeBoundingBox, min_size: 1} + - {type: ConvertBox, out_fmt: 'cxcywh', normalize: True} + shuffle: True + batch_size: 8 + num_workers: 2 + collate_fn: default_collate_fn + + +val_dataloader: + dataset: + transforms: + ops: + # - {type: Resize, size: 639, max_size: 640} + # - {type: PadToSize, spatial_size: 640} + - {type: Resize, size: [640, 640]} + - {type: ToImageTensor} + - {type: ConvertDtype} + shuffle: False + batch_size: 8 + num_workers: 2 + collate_fn: default_collate_fn diff --git a/rtdetr_pytorch/configs/rtdetr/include/optimizer.yml b/rtdetr_pytorch/configs/rtdetr/include/optimizer.yml new file mode 100644 index 0000000..af2ad65 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/optimizer.yml @@ -0,0 +1,36 @@ + +use_ema: True +ema: + type: ModelEMA + decay: 0.9999 + warmups: 2000 + + +find_unused_parameters: True + +epoches: 72 +clip_max_norm: 0.1 + +optimizer: + type: AdamW + params: + - + params: 'backbone' + lr: 0.00001 + - + params: '^(?=.*encoder(?=.*bias|.*norm.*weight)).*$' + weight_decay: 0. 
+ - + params: '^(?=.*decoder(?=.*bias|.*norm.*weight)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +lr_scheduler: + type: MultiStepLR + milestones: [1000] + gamma: 0.1 + diff --git a/rtdetr_pytorch/configs/rtdetr/include/optimizer_regnet.yml b/rtdetr_pytorch/configs/rtdetr/include/optimizer_regnet.yml new file mode 100644 index 0000000..52bd7a3 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/optimizer_regnet.yml @@ -0,0 +1,33 @@ + +use_ema: True +ema: + type: ModelEMA + decay: 0.9999 + warmups: 2000 + + +find_unused_parameters: True + +epoches: 72 +clip_max_norm: 0.1 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*encoder(?=.*bias|.*norm.*weight)).*$' + weight_decay: 0. + - + params: '^(?=.*decoder(?=.*bias|.*norm.*weight)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +lr_scheduler: + type: MultiStepLR + milestones: [1000] + gamma: 0.1 + diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_dla34.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_dla34.yml new file mode 100644 index 0000000..209d344 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_dla34.yml @@ -0,0 +1,78 @@ +task: detection + +model: RTDETR +criterion: SetCriterion +postprocessor: RTDETRPostProcessor + + +RTDETR: + backbone: DLANet + encoder: HybridEncoder + decoder: RTDETRTransformer + multi_scale: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] + +DLANet: + dla: dla34 + pretrained: True + return_idx: [1, 2, 3] + + +HybridEncoder: + in_channels: [128, 256, 512] + feat_strides: [8, 16, 32] + + # intra + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ enc_act: 'gelu' + pe_temperature: 10000 + + # cross + expansion: 1.0 + depth_mult: 1 + act: 'silu' + + # eval + eval_spatial_size: [640, 640] + + +RTDETRTransformer: + feat_channels: [256, 256, 256] + feat_strides: [8, 16, 32] + hidden_dim: 256 + num_levels: 3 + + num_queries: 300 + + num_decoder_layers: 6 + num_denoising: 100 + + eval_idx: -1 + eval_spatial_size: [640, 640] + + +use_focal_loss: True + +RTDETRPostProcessor: + num_top_queries: 300 + + +SetCriterion: + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} + losses: ['vfl', 'boxes', ] + alpha: 0.75 + gamma: 2.0 + + matcher: + type: HungarianMatcher + weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} + # use_focal_loss: True + alpha: 0.25 + gamma: 2.0 + + + diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml new file mode 100644 index 0000000..7f2e1f3 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml @@ -0,0 +1,81 @@ +task: detection + +model: RTDETR +criterion: SetCriterion +postprocessor: RTDETRPostProcessor + + +RTDETR: + backbone: PResNet + encoder: HybridEncoder + decoder: RTDETRTransformer + multi_scale: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] + +PResNet: + depth: 50 + variant: d + freeze_at: 0 + return_idx: [1, 2, 3] + num_stages: 4 + freeze_norm: True + pretrained: True + +HybridEncoder: + in_channels: [512, 1024, 2048] + feat_strides: [8, 16, 32] + + # intra + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ enc_act: 'gelu' + pe_temperature: 10000 + + # cross + expansion: 1.0 + depth_mult: 1 + act: 'silu' + + # eval + eval_spatial_size: [640, 640] + + +RTDETRTransformer: + feat_channels: [256, 256, 256] + feat_strides: [8, 16, 32] + hidden_dim: 256 + num_levels: 3 + + num_queries: 300 + + num_decoder_layers: 6 + num_denoising: 100 + + eval_idx: -1 + eval_spatial_size: [640, 640] + + +use_focal_loss: True + +RTDETRPostProcessor: + num_top_queries: 300 + + +SetCriterion: + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} + losses: ['vfl', 'boxes', ] + alpha: 0.75 + gamma: 2.0 + + matcher: + type: HungarianMatcher + weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} + # use_focal_loss: True + alpha: 0.25 + gamma: 2.0 + + + diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_regnet.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_regnet.yml new file mode 100644 index 0000000..0bc8cce --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_regnet.yml @@ -0,0 +1,77 @@ +task: detection + +model: RTDETR +criterion: SetCriterion +postprocessor: RTDETRPostProcessor + + +RTDETR: + backbone: RegNet + encoder: HybridEncoder + decoder: RTDETRTransformer + multi_scale: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] + + +RegNet: + return_idx: [1, 2, 3] + configuration: RegNetConfig() + +HybridEncoder: + in_channels: [192, 512, 1088] + feat_strides: [8, 16, 32] + + # intra + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ enc_act: 'gelu' + pe_temperature: 10000 + + # cross + expansion: 1.0 + depth_mult: 1 + act: 'silu' + + # eval + eval_spatial_size: [640, 640] + + +RTDETRTransformer: + feat_channels: [256, 256, 256] + feat_strides: [8, 16, 32] + hidden_dim: 256 + num_levels: 3 + + num_queries: 300 + + num_decoder_layers: 6 + num_denoising: 100 + + eval_idx: -1 + eval_spatial_size: [640, 640] + + +use_focal_loss: True + +RTDETRPostProcessor: + num_top_queries: 300 + + +SetCriterion: + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} + losses: ['vfl', 'boxes', ] + alpha: 0.75 + gamma: 2.0 + + matcher: + type: HungarianMatcher + weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} + # use_focal_loss: True + alpha: 0.25 + gamma: 2.0 + + + diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_dla34_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_dla34_6x_coco.yml new file mode 100644 index 0000000..81d8339 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_dla34_6x_coco.yml @@ -0,0 +1,9 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_dla34.yml', +] + +output_dir: ./output/rtdetr_dla34_6x_coco diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml new file mode 100644 index 0000000..c6be6e0 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml @@ -0,0 +1,28 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + +PResNet: + depth: 101 + + +HybridEncoder: + # intra + hidden_dim: 384 + dim_feedforward: 2048 + + +RTDETRTransformer: + feat_channels: [384, 384, 384] + + +optimizer: + type: AdamW + params: + - + params: 'backbone' + lr: 0.000001 \ No newline at end of file diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml 
b/rtdetr_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml new file mode 100644 index 0000000..791dd4b --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml @@ -0,0 +1,49 @@ + +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + + +output_dir: ./output/rtdetr_r18vd_6x_coco + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformer: + eval_idx: -1 + num_decoder_layers: 3 + num_denoising: 100 + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?=.*norm).*$' + lr: 0.00001 + weight_decay: 0. + - + params: '^(?=.*backbone)(?!.*norm).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml new file mode 100644 index 0000000..e7779a3 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml @@ -0,0 +1,48 @@ + +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + + +output_dir: ./output/rtdetr_r34vd_6x_coco + + +PResNet: + depth: 34 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformer: + num_decoder_layers: 4 + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + weight_decay: 0. + lr: 0.00001 + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml new file mode 100644 index 0000000..1294971 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml @@ -0,0 +1,9 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + +output_dir: ./output/rtdetr_r50vd_6x_coco diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml new file mode 100644 index 0000000..6e61823 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml @@ -0,0 +1,16 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + +output_dir: ./output/rtdetr_r50vd_m_6x_coco + + +HybridEncoder: + expansion: 0.5 + +RTDETRTransformer: + eval_idx: 2 # use 3th decoder layer to eval \ No newline at end of file diff --git a/rtdetr_pytorch/configs/rtdetr/rtdetr_regnet_6x_coco.yml b/rtdetr_pytorch/configs/rtdetr/rtdetr_regnet_6x_coco.yml new file mode 100644 index 0000000..a5d8672 --- /dev/null +++ b/rtdetr_pytorch/configs/rtdetr/rtdetr_regnet_6x_coco.yml @@ -0,0 +1,9 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader_regnet.yml', + './include/optimizer_regnet.yml', + './include/rtdetr_regnet.yml', +] + +output_dir: ./output/rtdetr_regnet_6x_coco diff --git a/rtdetr_pytorch/configs/runtime.yml b/rtdetr_pytorch/configs/runtime.yml new file mode 100644 index 0000000..f08620f --- /dev/null +++ b/rtdetr_pytorch/configs/runtime.yml @@ -0,0 +1,17 @@ +sync_bn: True +find_unused_parameters: False + + +use_amp: False + +scaler: + type: GradScaler + enabled: True + + +use_ema: False +ema: + type: ModelEMA + decay: 
0.9999 + warmups: 2000 + diff --git a/rtdetr_pytorch/requirements.txt b/rtdetr_pytorch/requirements.txt new file mode 100644 index 0000000..eb0fb1c --- /dev/null +++ b/rtdetr_pytorch/requirements.txt @@ -0,0 +1,8 @@ +torch==2.0.1 +torchvision==0.15.2 +onnx==1.14.0 +onnxruntime==1.15.1 +pycocotools +PyYAML +scipy +transformers diff --git a/rtdetr_pytorch/src/__init__.py b/rtdetr_pytorch/src/__init__.py new file mode 100644 index 0000000..6cb1033 --- /dev/null +++ b/rtdetr_pytorch/src/__init__.py @@ -0,0 +1,5 @@ + +from . import data +from . import nn +from . import optim +from . import zoo diff --git a/rtdetr_pytorch/src/core/__init__.py b/rtdetr_pytorch/src/core/__init__.py new file mode 100644 index 0000000..35c455c --- /dev/null +++ b/rtdetr_pytorch/src/core/__init__.py @@ -0,0 +1,7 @@ +"""by lyuwenyu +""" + +# from .yaml_utils import register, create, load_config, merge_config, merge_dict +from .yaml_utils import * +from .config import BaseConfig +from .yaml_config import YAMLConfig diff --git a/rtdetr_pytorch/src/core/config.py b/rtdetr_pytorch/src/core/config.py new file mode 100644 index 0000000..cf803ef --- /dev/null +++ b/rtdetr_pytorch/src/core/config.py @@ -0,0 +1,264 @@ +"""by lyuwenyu +""" + +from pprint import pprint +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler +from torch.cuda.amp.grad_scaler import GradScaler + +from typing import Callable, List, Dict + + +__all__ = ['BaseConfig', ] + + + +class BaseConfig(object): + # TODO property + + + def __init__(self) -> None: + super().__init__() + + self.task :str = None + + self._model :nn.Module = None + self._postprocessor :nn.Module = None + self._criterion :nn.Module = None + self._optimizer :Optimizer = None + self._lr_scheduler :LRScheduler = None + self._train_dataloader :DataLoader = None + self._val_dataloader :DataLoader = None + self._ema :nn.Module = None + self._scaler 
:GradScaler = None + + self.train_dataset :Dataset = None + self.val_dataset :Dataset = None + self.num_workers :int = 0 + self.collate_fn :Callable = None + + self.batch_size :int = None + self._train_batch_size :int = None + self._val_batch_size :int = None + self._train_shuffle: bool = None + self._val_shuffle: bool = None + + self.evaluator :Callable[[nn.Module, DataLoader, str], ] = None + + # runtime + self.resume :str = None + self.tuning :str = None + + self.epoches :int = None + self.last_epoch :int = -1 + self.end_epoch :int = None + + self.use_amp :bool = False + self.use_ema :bool = False + self.sync_bn :bool = False + self.clip_max_norm : float = None + self.find_unused_parameters :bool = None + # self.ema_decay: float = 0.9999 + # self.grad_clip_: Callable = None + + self.log_dir :str = './logs/' + self.log_step :int = 10 + self._output_dir :str = None + self._print_freq :int = None + self.checkpoint_step :int = 1 + + # self.device :str = torch.device('cpu') + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.device = torch.device(device) + + + @property + def model(self, ) -> nn.Module: + return self._model + + @model.setter + def model(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._model = m + + @property + def postprocessor(self, ) -> nn.Module: + return self._postprocessor + + @postprocessor.setter + def postprocessor(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._postprocessor = m + + @property + def criterion(self, ) -> nn.Module: + return self._criterion + + @criterion.setter + def criterion(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._criterion = m + + @property + def optimizer(self, ) -> Optimizer: + return self._optimizer + + @optimizer.setter + def optimizer(self, m): + assert isinstance(m, Optimizer), f'{type(m)} != optim.Optimizer, please 
check your model class' + self._optimizer = m + + @property + def lr_scheduler(self, ) -> LRScheduler: + return self._lr_scheduler + + @lr_scheduler.setter + def lr_scheduler(self, m): + assert isinstance(m, LRScheduler), f'{type(m)} != LRScheduler, please check your model class' + self._lr_scheduler = m + + + @property + def train_dataloader(self): + if self._train_dataloader is None and self.train_dataset is not None: + loader = DataLoader(self.train_dataset, + batch_size=self.train_batch_size, + num_workers=self.num_workers, + collate_fn=self.collate_fn, + shuffle=self.train_shuffle, ) + loader.shuffle = self.train_shuffle + self._train_dataloader = loader + + return self._train_dataloader + + @train_dataloader.setter + def train_dataloader(self, loader): + self._train_dataloader = loader + + @property + def val_dataloader(self): + if self._val_dataloader is None and self.val_dataset is not None: + loader = DataLoader(self.val_dataset, + batch_size=self.val_batch_size, + num_workers=self.num_workers, + drop_last=False, + collate_fn=self.collate_fn, + shuffle=self.val_shuffle) + loader.shuffle = self.val_shuffle + self._val_dataloader = loader + + return self._val_dataloader + + @val_dataloader.setter + def val_dataloader(self, loader): + self._val_dataloader = loader + + + # TODO method + # @property + # def ema(self, ) -> nn.Module: + # if self._ema is None and self.use_ema and self.model is not None: + # self._ema = ModelEMA(self.model, self.ema_decay) + # return self._ema + + @property + def ema(self, ) -> nn.Module: + return self._ema + + @ema.setter + def ema(self, obj): + self._ema = obj + + + @property + def scaler(self) -> GradScaler: + if self._scaler is None and self.use_amp and torch.cuda.is_available(): + self._scaler = GradScaler() + return self._scaler + + @scaler.setter + def scaler(self, obj: GradScaler): + self._scaler = obj + + + @property + def val_shuffle(self): + if self._val_shuffle is None: + print('warning: set default val_shuffle=False') 
+ return False + return self._val_shuffle + + @val_shuffle.setter + def val_shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be bool' + self._val_shuffle = shuffle + + @property + def train_shuffle(self): + if self._train_shuffle is None: + print('warning: set default train_shuffle=True') + return True + return self._train_shuffle + + @train_shuffle.setter + def train_shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be bool' + self._train_shuffle = shuffle + + + @property + def train_batch_size(self): + if self._train_batch_size is None and isinstance(self.batch_size, int): + print(f'warning: set train_batch_size=batch_size={self.batch_size}') + return self.batch_size + return self._train_batch_size + + @train_batch_size.setter + def train_batch_size(self, batch_size): + assert isinstance(batch_size, int), 'batch_size must be int' + self._train_batch_size = batch_size + + @property + def val_batch_size(self): + if self._val_batch_size is None: + print(f'warning: set val_batch_size=batch_size={self.batch_size}') + return self.batch_size + return self._val_batch_size + + @val_batch_size.setter + def val_batch_size(self, batch_size): + assert isinstance(batch_size, int), 'batch_size must be int' + self._val_batch_size = batch_size + + + @property + def output_dir(self): + if self._output_dir is None: + return self.log_dir + return self._output_dir + + @output_dir.setter + def output_dir(self, root): + self._output_dir = root + + @property + def print_freq(self): + if self._print_freq is None: + # self._print_freq = self.log_step + return self.log_step + return self._print_freq + + @print_freq.setter + def print_freq(self, n): + assert isinstance(n, int), 'print_freq must be int' + self._print_freq = n + + + # def __repr__(self) -> str: + # pass + + + diff --git a/rtdetr_pytorch/src/core/yaml_config.py b/rtdetr_pytorch/src/core/yaml_config.py new file mode 100644 index 0000000..6f8f7ef --- /dev/null +++ 
b/rtdetr_pytorch/src/core/yaml_config.py @@ -0,0 +1,152 @@ +"""by lyuwenyu +""" + +import torch +import torch.nn as nn + +import re +import copy + +from .config import BaseConfig +from .yaml_utils import load_config, merge_config, create, merge_dict + + +class YAMLConfig(BaseConfig): + def __init__(self, cfg_path: str, **kwargs) -> None: + super().__init__() + + cfg = load_config(cfg_path) + merge_dict(cfg, kwargs) + + # pprint(cfg) + + self.yaml_cfg = cfg + + self.log_step = cfg.get('log_step', 100) + self.checkpoint_step = cfg.get('checkpoint_step', 1) + self.epoches = cfg.get('epoches', -1) + self.resume = cfg.get('resume', '') + self.tuning = cfg.get('tuning', '') + self.sync_bn = cfg.get('sync_bn', False) + self.output_dir = cfg.get('output_dir', None) + + self.use_ema = cfg.get('use_ema', False) + self.use_amp = cfg.get('use_amp', False) + self.autocast = cfg.get('autocast', dict()) + self.find_unused_parameters = cfg.get('find_unused_parameters', None) + self.clip_max_norm = cfg.get('clip_max_norm', 0.) 
+ + + @property + def model(self, ) -> torch.nn.Module: + if self._model is None and 'model' in self.yaml_cfg: + merge_config(self.yaml_cfg) + self._model = create(self.yaml_cfg['model']) + return self._model + + @property + def postprocessor(self, ) -> torch.nn.Module: + if self._postprocessor is None and 'postprocessor' in self.yaml_cfg: + merge_config(self.yaml_cfg) + self._postprocessor = create(self.yaml_cfg['postprocessor']) + return self._postprocessor + + @property + def criterion(self, ): + if self._criterion is None and 'criterion' in self.yaml_cfg: + merge_config(self.yaml_cfg) + self._criterion = create(self.yaml_cfg['criterion']) + return self._criterion + + + @property + def optimizer(self, ): + if self._optimizer is None and 'optimizer' in self.yaml_cfg: + merge_config(self.yaml_cfg) + params = self.get_optim_params(self.yaml_cfg['optimizer'], self.model) + self._optimizer = create('optimizer', params=params) + + return self._optimizer + + @property + def lr_scheduler(self, ): + if self._lr_scheduler is None and 'lr_scheduler' in self.yaml_cfg: + merge_config(self.yaml_cfg) + self._lr_scheduler = create('lr_scheduler', optimizer=self.optimizer) + print('Initial lr: ', self._lr_scheduler.get_last_lr()) + + return self._lr_scheduler + + @property + def train_dataloader(self, ): + if self._train_dataloader is None and 'train_dataloader' in self.yaml_cfg: + merge_config(self.yaml_cfg) + self._train_dataloader = create('train_dataloader') + self._train_dataloader.shuffle = self.yaml_cfg['train_dataloader'].get('shuffle', False) + + return self._train_dataloader + + @property + def val_dataloader(self, ): + if self._val_dataloader is None and 'val_dataloader' in self.yaml_cfg: + merge_config(self.yaml_cfg) + self._val_dataloader = create('val_dataloader') + self._val_dataloader.shuffle = self.yaml_cfg['val_dataloader'].get('shuffle', False) + + return self._val_dataloader + + + @property + def ema(self, ): + if self._ema is None and 
self.yaml_cfg.get('use_ema', False): + merge_config(self.yaml_cfg) + self._ema = create('ema', model=self.model) + + return self._ema + + + @property + def scaler(self, ): + if self._scaler is None and self.yaml_cfg.get('use_amp', False): + merge_config(self.yaml_cfg) + self._scaler = create('scaler') + + return self._scaler + + + @staticmethod + def get_optim_params(cfg: dict, model: nn.Module): + ''' + E.g.: + ^(?=.*a)(?=.*b).*$ means including a and b + ^((?!b.)*a((?!b).)*$ means including a but not b + ^((?!b|c).)*a((?!b|c).)*$ means including a but not (b | c) + ''' + assert 'type' in cfg, '' + cfg = copy.deepcopy(cfg) + + if 'params' not in cfg: + return model.parameters() + + assert isinstance(cfg['params'], list), '' + + param_groups = [] + visited = [] + for pg in cfg['params']: + pattern = pg['params'] + params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0} + pg['params'] = params.values() + param_groups.append(pg) + visited.extend(list(params.keys())) + + names = [k for k, v in model.named_parameters() if v.requires_grad] + + if len(visited) < len(names): + unseen = set(names) - set(visited) + params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen} + param_groups.append({'params': params.values()}) + visited.extend(list(params.keys())) + + assert len(visited) == len(names), '' + + return param_groups diff --git a/rtdetr_pytorch/src/core/yaml_utils.py b/rtdetr_pytorch/src/core/yaml_utils.py new file mode 100644 index 0000000..c9ed259 --- /dev/null +++ b/rtdetr_pytorch/src/core/yaml_utils.py @@ -0,0 +1,208 @@ +""""by lyuwenyu +""" + +import os +import yaml +import inspect +import importlib + +__all__ = ['GLOBAL_CONFIG', 'register', 'create', 'load_config', 'merge_config', 'merge_dict'] + + +GLOBAL_CONFIG = dict() +INCLUDE_KEY = '__include__' + + +def register(cls: type): + ''' + Args: + cls (type): Module class to be registered. 
def extract_schema(cls: type):
    '''
    Build the registry schema of a class from its ``__init__`` signature.

    Args:
        cls (type): class being registered.
    Return:
        Dict with the bookkeeping entries ``_name``, ``_pymodule``, ``_inject``
        and ``_share`` plus one entry per positional ``__init__`` argument
        (its default value, or ``None`` when it has no default).
    '''
    argspec = inspect.getfullargspec(cls.__init__)
    arg_names = [arg for arg in argspec.args if arg != 'self']
    defaults = argspec.defaults if argspec.defaults is not None else ()
    num_requires = len(arg_names) - len(defaults)

    schema = dict()
    schema['_name'] = cls.__name__
    schema['_pymodule'] = importlib.import_module(cls.__module__)
    schema['_inject'] = getattr(cls, '__inject__', [])
    schema['_share'] = getattr(cls, '__share__', [])

    for i, name in enumerate(arg_names):
        if name in schema['_share']:
            assert i >= num_requires, 'share config must have default value.'

        schema[name] = defaults[i - num_requires] if i >= num_requires else None

    return schema


def create(type_or_name, **kwargs):
    '''
    Instantiate a registered class (or fetch a registered object) by name.

    Args:
        type_or_name (type | str): registered class, its name, or the name of a
            config entry whose ``type`` key points at a registered class.
        **kwargs: overrides applied on top of the registered defaults.

    Return:
        The registered function/object as-is, or a new instance of the class.

    Raises:
        ValueError: if the name (or an injected dependency) is not registered.

    Overrides are merged into a copy of the registered schema, so creating one
    object never changes the defaults seen by the next ``create`` call.
    '''
    assert isinstance(type_or_name, (type, str)), 'create should be class or name.'

    name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__
    return _create_from_config(name, kwargs)


def _create_from_config(name, overrides):
    '''Resolve ``name`` in GLOBAL_CONFIG and build it with ``overrides`` applied to a copy.'''
    if name not in GLOBAL_CONFIG:
        raise ValueError('The module {} is not registered'.format(name))

    cfg = GLOBAL_CONFIG[name]

    # Registered functions and ready-made objects are returned untouched.
    if hasattr(cfg, '__dict__'):
        return cfg

    # `{type: Cls, ...}` entries are aliases: their keys override Cls defaults.
    if isinstance(cfg, dict) and 'type' in cfg and cfg['type'] != name:
        merged = {k: v for k, v in cfg.items() if k != 'type'}
        merged.update(overrides)
        return _create_from_config(cfg['type'], merged)

    # Work on a copy so the registered defaults stay pristine.
    cfg = dict(cfg)
    cfg.pop('type', None)
    cfg.update(overrides)

    cls = getattr(cfg['_pymodule'], name)
    argspec = inspect.getfullargspec(cls.__init__)
    arg_names = [arg for arg in argspec.args if arg != 'self']

    cls_kwargs = {}
    cls_kwargs.update(cfg)

    # shared var: a top-level global value wins over the class-level one
    for k in cfg['_share']:
        if k in GLOBAL_CONFIG:
            cls_kwargs[k] = GLOBAL_CONFIG[k]
        else:
            cls_kwargs[k] = cfg[k]

    # inject: replace config references by the objects they describe
    for k in cfg['_inject']:
        _k = cfg[k]

        if _k is None:
            continue

        if isinstance(_k, str):
            if _k not in GLOBAL_CONFIG:
                raise ValueError(f'Missing inject config of {_k}.')

            _cfg = GLOBAL_CONFIG[_k]

            if isinstance(_cfg, dict):
                cls_kwargs[k] = _create_from_config(_cfg['_name'], {})
            else:
                cls_kwargs[k] = _cfg

        elif isinstance(_k, dict):
            if 'type' not in _k.keys():
                raise ValueError(f'Missing inject for `type` style.')

            _type = str(_k['type'])
            if _type not in GLOBAL_CONFIG:
                raise ValueError(f'Missing {_type} in inspect stage.')

            nested = {key: value for key, value in _k.items() if key != 'type'}
            cls_kwargs[k] = _create_from_config(_type, nested)

        else:
            raise ValueError(f'Inject does not support {_k}')

    cls_kwargs = {n: cls_kwargs[n] for n in arg_names}

    return cls(**cls_kwargs)


def load_config(file_path, cfg=None):
    '''load config

    Args:
        file_path (str): path of a ``.yml``/``.yaml`` file. Files listed under
            its ``__include__`` key are loaded first, so this file overrides them.
        cfg (dict | None): dict to merge into; a fresh dict is used when omitted.

    Return:
        ``cfg`` with the included files and then this file merged in.
    '''
    if cfg is None:
        # A fresh dict per call: a literal default would be shared between calls.
        cfg = {}

    _, ext = os.path.splitext(file_path)
    assert ext in ['.yml', '.yaml'], "only support yaml files for now"

    with open(file_path) as f:
        # NOTE(security): yaml.Loader can construct arbitrary Python objects;
        # only load trusted config files (or switch to yaml.SafeLoader).
        file_cfg = yaml.load(f, Loader=yaml.Loader)

    if file_cfg is None:
        return cfg

    if INCLUDE_KEY in file_cfg:
        for base_yaml in list(file_cfg[INCLUDE_KEY]):
            if base_yaml.startswith('~'):
                base_yaml = os.path.expanduser(base_yaml)

            if not os.path.isabs(base_yaml):
                base_yaml = os.path.join(os.path.dirname(file_path), base_yaml)

            # The recursive call merges the base file straight into cfg.
            load_config(base_yaml, cfg)

    return merge_config(file_cfg, cfg)


def merge_dict(dct, another_dct):
    '''merge another_dct into dct

    Nested dicts present on both sides are merged recursively; any other value
    from ``another_dct`` replaces the one in ``dct``. ``dct`` is modified in
    place and returned.
    '''
    for k, v in another_dct.items():
        if k in dct and isinstance(dct[k], dict) and isinstance(v, dict):
            merge_dict(dct[k], v)
        else:
            dct[k] = v

    return dct


def merge_config(config, another_cfg=None):
    """
    Merge config into global config or another_cfg.

    Args:
        config (dict): Config to be merged.
        another_cfg (dict | None): target dict; GLOBAL_CONFIG when omitted.

    Returns: the target dict (global config or another_cfg)
    """
    dct = GLOBAL_CONFIG if another_cfg is None else another_cfg

    return merge_dict(dct, config)
False) -> None: + super().__init__(root, train, transform, target_transform, download) + diff --git a/rtdetr_pytorch/src/data/coco/__init__.py b/rtdetr_pytorch/src/data/coco/__init__.py new file mode 100644 index 0000000..c83b002 --- /dev/null +++ b/rtdetr_pytorch/src/data/coco/__init__.py @@ -0,0 +1,9 @@ +from .coco_dataset import ( + CocoDetection, + mscoco_category2label, + mscoco_label2category, + mscoco_category2name, +) +from .coco_eval import * + +from .coco_utils import get_coco_api_from_dataset \ No newline at end of file diff --git a/rtdetr_pytorch/src/data/coco/coco_dataset.py b/rtdetr_pytorch/src/data/coco/coco_dataset.py new file mode 100644 index 0000000..0ef7849 --- /dev/null +++ b/rtdetr_pytorch/src/data/coco/coco_dataset.py @@ -0,0 +1,238 @@ +""" +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +COCO dataset which returns image_id for evaluation. +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" + +import torch +import torch.utils.data + +import torchvision +torchvision.disable_beta_transforms_warning() + +from torchvision import datapoints + +from pycocotools import mask as coco_mask + +from src.core import register + +__all__ = ['CocoDetection'] + + +@register +class CocoDetection(torchvision.datasets.CocoDetection): + __inject__ = ['transforms'] + __share__ = ['remap_mscoco_category'] + + def __init__(self, img_folder, ann_file, transforms, return_masks, remap_mscoco_category=False): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks, remap_mscoco_category) + self.img_folder = img_folder + self.ann_file = ann_file + self.return_masks = return_masks + self.remap_mscoco_category = remap_mscoco_category + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = {'image_id': image_id, 'annotations': 
target} + img, target = self.prepare(img, target) + + # ['boxes', 'masks', 'labels']: + if 'boxes' in target: + target['boxes'] = datapoints.BoundingBox( + target['boxes'], + format=datapoints.BoundingBoxFormat.XYXY, + spatial_size=img.size[::-1]) # h w + + if 'masks' in target: + target['masks'] = datapoints.Mask(target['masks']) + + if self._transforms is not None: + img, target = self._transforms(img, target) + + return img, target + + def extra_repr(self) -> str: + s = f' img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n' + s += f' return_masks: {self.return_masks}\n' + if hasattr(self, '_transforms') and self._transforms is not None: + s += f' transforms:\n {repr(self._transforms)}' + + return s + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False, remap_mscoco_category=False): + self.return_masks = return_masks + self.remap_mscoco_category = remap_mscoco_category + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + if self.remap_mscoco_category: + classes = [mscoco_category2label[obj["category_id"]] for 
obj in anno] + else: + classes = [obj["category_id"] for obj in anno] + + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(w), int(h)]) + target["size"] = torch.as_tensor([int(w), int(h)]) + + return image, target + + +mscoco_category2name = { + 1: 'person', + 2: 'bicycle', + 3: 'car', + 4: 'motorcycle', + 5: 'airplane', + 6: 'bus', + 7: 'train', + 8: 'truck', + 9: 'boat', + 10: 'traffic light', + 11: 'fire hydrant', + 13: 'stop sign', + 14: 'parking meter', + 15: 'bench', + 16: 'bird', + 17: 'cat', + 18: 'dog', + 19: 'horse', + 20: 'sheep', + 21: 'cow', + 22: 'elephant', + 23: 'bear', + 24: 'zebra', + 25: 'giraffe', + 27: 'backpack', + 28: 'umbrella', + 31: 'handbag', + 32: 'tie', + 33: 'suitcase', + 34: 'frisbee', + 35: 'skis', + 36: 'snowboard', + 37: 'sports ball', + 38: 'kite', + 39: 'baseball bat', + 40: 'baseball glove', + 41: 'skateboard', 
+ 42: 'surfboard', + 43: 'tennis racket', + 44: 'bottle', + 46: 'wine glass', + 47: 'cup', + 48: 'fork', + 49: 'knife', + 50: 'spoon', + 51: 'bowl', + 52: 'banana', + 53: 'apple', + 54: 'sandwich', + 55: 'orange', + 56: 'broccoli', + 57: 'carrot', + 58: 'hot dog', + 59: 'pizza', + 60: 'donut', + 61: 'cake', + 62: 'chair', + 63: 'couch', + 64: 'potted plant', + 65: 'bed', + 67: 'dining table', + 70: 'toilet', + 72: 'tv', + 73: 'laptop', + 74: 'mouse', + 75: 'remote', + 76: 'keyboard', + 77: 'cell phone', + 78: 'microwave', + 79: 'oven', + 80: 'toaster', + 81: 'sink', + 82: 'refrigerator', + 84: 'book', + 85: 'clock', + 86: 'vase', + 87: 'scissors', + 88: 'teddy bear', + 89: 'hair drier', + 90: 'toothbrush' +} + +mscoco_category2label = {k: i for i, k in enumerate(mscoco_category2name.keys())} +mscoco_label2category = {v: k for k, v in mscoco_category2label.items()} \ No newline at end of file diff --git a/rtdetr_pytorch/src/data/coco/coco_eval.py b/rtdetr_pytorch/src/data/coco/coco_eval.py new file mode 100644 index 0000000..2d629f5 --- /dev/null +++ b/rtdetr_pytorch/src/data/coco/coco_eval.py @@ -0,0 +1,269 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO evaluator that works in distributed mode. 
+ +Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py +The difference is that there is less copy-pasting from pycocotools +in the end of the file, as python3 can suppress prints with contextlib +""" +import os +import contextlib +import copy +import numpy as np +import torch + +from pycocotools.cocoeval import COCOeval +from pycocotools.coco import COCO +import pycocotools.mask as mask_util + +from src.misc import dist + + +__all__ = ['CocoEvaluator',] + + +class CocoEvaluator(object): + def __init__(self, coco_gt, iou_types): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + + # suppress pycocotools prints + with open(os.devnull, 'w') as devnull: + with contextlib.redirect_stdout(devnull): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def 
prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + 
def convert_to_xywh(boxes):
    """Convert boxes from ``(xmin, ymin, xmax, ymax)`` to ``(xmin, ymin, width, height)``.

    ``boxes`` must have exactly four entries along dimension 1; the result is
    stacked along that same dimension.
    """
    left, top, right, bottom = boxes.unbind(1)
    width = right - left
    height = bottom - top
    return torch.stack((left, top, width, height), dim=1)
image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# + diff --git a/rtdetr_pytorch/src/data/coco/coco_utils.py b/rtdetr_pytorch/src/data/coco/coco_utils.py new file mode 100644 index 0000000..48c0994 --- /dev/null +++ b/rtdetr_pytorch/src/data/coco/coco_utils.py @@ -0,0 +1,184 @@ +import os + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask +from pycocotools.coco import COCO + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = 
[] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask: + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + + anno = target["annotations"] + + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) + target["area"] = area + target["iscrowd"] = iscrowd + + return image, target + + +def 
_coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def _has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different criteria for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset + + +def convert_to_coco_api(ds): + coco_ds = COCO() + # annotation IDs need to start at 1, not 0, see torchvision issue #1530 + ann_id = 1 + dataset = {"images": [], "categories": [], "annotations": []} + categories = set() + for img_idx in range(len(ds)): + # find better way to get target + # targets = ds.get_annotations(img_idx) + img, targets = ds[img_idx] + image_id = targets["image_id"].item() + img_dict = {} + img_dict["id"] = image_id + img_dict["height"] = img.shape[-2] + img_dict["width"] = img.shape[-1] + dataset["images"].append(img_dict) + bboxes = targets["boxes"].clone() + bboxes[:, 2:] -= bboxes[:, :2] + bboxes = bboxes.tolist() + labels = targets["labels"].tolist() + areas = targets["area"].tolist() + 
def get_coco_api_from_dataset(dataset):
    """Return a COCO API object for ``dataset``.

    ``torch.utils.data.Subset`` wrappers are peeled off first, however deeply
    they are nested. If the underlying dataset is a
    ``torchvision.datasets.CocoDetection`` its ``coco`` attribute is returned;
    otherwise the dataset is converted with ``convert_to_coco_api``.
    """
    # Unwrap until we hit a CocoDetection or something that is not a Subset.
    while (isinstance(dataset, torch.utils.data.Subset)
           and not isinstance(dataset, torchvision.datasets.CocoDetection)):
        dataset = dataset.dataset

    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco
    return convert_to_coco_api(dataset)
nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. + """ + if version.parse(torchvision.__version__) < version.parse('0.7'): + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) + + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? 
def hflip(image, target):
    """Flip ``image`` horizontally and mirror the annotations in ``target``.

    ``target`` is shallow-copied. ``boxes`` have their x-coordinates reflected
    about the image width (swapping the two x columns), and ``masks`` are
    flipped along their last dimension.
    """
    flipped_image = F.hflip(image)
    width, _ = image.size

    mirrored = target.copy()
    if "boxes" in mirrored:
        sign = torch.as_tensor([-1, 1, -1, 1])
        shift = torch.as_tensor([width, 0, width, 0])
        # Columns (x1, y0, x0, y1): negating and shifting x gives (w - x1, y0, w - x0, y1).
        mirrored["boxes"] = mirrored["boxes"][:, [2, 1, 0, 3]] * sign + shift

    if "masks" in mirrored:
        mirrored['masks'] = mirrored['masks'].flip(-1)

    return flipped_image, mirrored
target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target['masks'] = interpolate( + target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
@register
class Compose(T.Compose):
    """Transform pipeline built from config entries and/or ready-made modules.

    Args:
        ops: ``None`` or an iterable whose items are either

            * a ``dict`` with a ``'type'`` key naming a registered transform;
              the remaining keys are passed to its constructor as keyword
              arguments, or
            * an ``nn.Module`` instance, which is used as-is.

            When ``ops`` is ``None`` the pipeline is a single
            ``EmptyTransform``.

    Raises:
        ValueError: if an item is neither a ``dict`` nor an ``nn.Module``.
    """
    def __init__(self, ops) -> None:
        transforms = []
        if ops is not None:
            for op in ops:
                if isinstance(op, dict):
                    # Work on a shallow copy: popping 'type' from the caller's
                    # dict would strip the config entry and make it impossible
                    # to build the same pipeline from that config a second time.
                    kwargs = dict(op)
                    name = kwargs.pop('type')
                    transform = getattr(GLOBAL_CONFIG[name]['_pymodule'], name)(**kwargs)
                    transforms.append(transform)
                elif isinstance(op, nn.Module):
                    transforms.append(op)
                else:
                    raise ValueError(
                        f'Compose ops must be dicts or nn.Module instances, got {type(op).__name__}')
        else:
            transforms = [EmptyTransform(), ]

        super().__init__(transforms=transforms)
@register
class RandomIoUCrop(T.RandomIoUCrop):
    """``T.RandomIoUCrop`` that is only applied with probability ``p``.

    All arguments except ``p`` are forwarded positionally to the parent
    constructor. When the crop is skipped, the inputs are handed back
    unchanged (unwrapped if a single input was given).
    """
    def __init__(
        self,
        min_scale: float = 0.3,
        max_scale: float = 1,
        min_aspect_ratio: float = 0.5,
        max_aspect_ratio: float = 2,
        sampler_options: Optional[List[float]] = None,
        trials: int = 40,
        p: float = 1.0,
    ):
        super().__init__(
            min_scale,
            max_scale,
            min_aspect_ratio,
            max_aspect_ratio,
            sampler_options,
            trials,
        )
        self.p = p

    def __call__(self, *inputs: Any) -> Any:
        # One uniform draw decides whether the parent transform runs at all.
        apply_crop = torch.rand(1) < self.p
        if apply_crop:
            return super().forward(*inputs)

        if len(inputs) > 1:
            return inputs
        return inputs[0]
def init_distributed():
    '''Try to initialise ``torch.distributed`` from the environment.

    The process group is created with ``init_method='env://'`` and no explicit
    backend argument. On success the current CUDA device is set to
    ``cuda:{rank}``, ``print`` is restricted to rank 0 through
    ``setup_print`` and a confirmation message is printed.

    returns:
        bool, ``True`` when the process group was initialised, ``False`` when
        any step of the setup raised (the process then runs non-distributed).
    '''
    try:
        # See https://pytorch.org/docs/stable/elastic/run.html for the
        # environment variables the launcher is expected to provide.
        tdist.init_process_group(init_method='env://', )
        torch.distributed.barrier()

        rank = get_rank()
        # NOTE(review): the global rank is used as the CUDA device index;
        # presumably this assumes a single-node launch - confirm before
        # running multi-node.
        device = torch.device(f'cuda:{rank}')
        torch.cuda.set_device(device)

        setup_print(rank == 0)
        print('Initialized distributed mode...')

        return True

    except Exception:
        # Deliberate best-effort fallback to single-process mode. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being reported as "not distributed".
        print('Not init distributed mode.')
        return False
def set_seed(seed):
    """Seed the PyTorch, NumPy and Python ``random`` generators with ``seed``.

    Used to make runs reproducible; returns ``None``.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)
class SmoothedValue(object):
    """Running statistic with a sliding window and a global accumulator.

    The last ``window_size`` values are kept in ``deque`` and feed ``median``,
    ``avg``, ``max`` and ``value``; ``total`` and ``count`` accumulate over
    every update and feed ``global_avg``. ``fmt`` is the ``str.format``
    template used by ``__str__`` and may reference any of those five names.
    """

    def __init__(self, window_size=20, fmt=None):
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0

    def update(self, value, n=1):
        """Record ``value`` once in the window and with weight ``n`` globally."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` over all processes.

        Warning: does not synchronize the deque!
        """
        if is_dist_available_and_initialized():
            stats = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
            tdist.barrier()
            tdist.all_reduce(stats)
            count, total = stats.tolist()
            self.count = int(count)
            self.total = total

    @property
    def median(self):
        window = torch.tensor(list(self.deque))
        return window.median().item()

    @property
    def avg(self):
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        stats = dict(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value,
        )
        return self.fmt.format(**stats)
torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device="cuda") + size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] + tdist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) + if local_size != max_size: + padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") + tensor = torch.cat((tensor, padding), dim=0) + tdist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True) -> Dict[str, torch.Tensor]: + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. 
class MetricLogger(object):
    """Collection of named ``SmoothedValue`` meters with periodic printing.

    Meters are created on first use (``meters`` is a ``defaultdict``) and can
    be read as attributes, e.g. ``logger.loss``. ``delimiter`` separates the
    fields of every printed line.
    """

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Push one scalar per keyword into the meter of the same name.

        Tensors are converted with ``.item()``; anything that is not then a
        ``float`` or ``int`` fails the assertion.
        """
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # __getattr__ only runs after normal lookup failed. ``meters`` is read
        # through __dict__ so that an instance whose state has not been set
        # yet (copy / unpickle) raises AttributeError instead of re-entering
        # __getattr__('meters') forever.
        meters = self.__dict__.get('meters')
        if meters is not None and attr in meters:
            return meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = [
            "{}: {}".format(name, str(meter))
            for name, meter in self.meters.items()
        ]
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        """Synchronize every meter across processes."""
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        """Register (or replace) the meter stored under ``name``."""
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield the items of ``iterable`` while printing progress.

        A line is printed every ``print_freq`` iterations and on the last
        one, containing the iteration counter, an ETA, all meters, the
        iteration and data-loading times and, when CUDA is available, the
        peak allocated memory in MB. A total-time summary is printed once the
        iterable is exhausted.

        Args:
            iterable: sized iterable (``len()`` is required).
            print_freq: number of iterations between two printed lines.
            header: optional prefix of every printed line.
        """
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        num_iters = len(iterable)
        space_fmt = ':' + str(len(str(num_iters))) + 'd'
        use_cuda = torch.cuda.is_available()
        fields = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if use_cuda:
            fields.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(fields)
        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == num_iters - 1:
                eta_seconds = iter_time.global_avg * (num_iters - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                values = dict(
                    eta=eta_string,
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if use_cuda:
                    values['memory'] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(i, num_iters, **values))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(..., 1): an empty iterable must not turn the summary line into a
        # ZeroDivisionError.
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(num_iters, 1)))
@register
class ClassHead(nn.Module):
    """Classification head: adaptive average pool to 1x1, then a linear layer.

    Args:
        hidden_dim: input feature size of the linear projection.
        num_classes: output feature size of the linear projection.
    """
    def __init__(self, hidden_dim, num_classes):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.proj = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # When given a list/tuple of feature maps only the first one is used.
        if isinstance(x, (list, tuple)):
            x = x[0]
        pooled = self.pool(x)
        flat = pooled.reshape(pooled.shape[0], -1)
        return self.proj(flat)
class ConvNormLayer(nn.Module):
    """``Conv2d`` -> ``BatchNorm2d`` -> activation.

    Args:
        ch_in: input channels of the convolution.
        ch_out: output channels of the convolution and of the norm.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding; ``None`` selects ``(kernel_size - 1) // 2``.
        bias: whether the convolution has a bias term.
        act: ``None`` for no activation (identity), otherwise the value is
            passed to ``get_activation``.
    """
    def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
        super().__init__()
        if padding is None:
            padding = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(ch_in, ch_out, kernel_size, stride, padding=padding, bias=bias)
        self.norm = nn.BatchNorm2d(ch_out)
        if act is None:
            self.act = nn.Identity()
        else:
            self.act = get_activation(act)

    def forward(self, x):
        features = self.conv(x)
        normed = self.norm(features)
        return self.act(normed)
def get_activation(act: str, inpace: bool=True):
    '''Build an activation module from its name.

    Args:
        act: one of ``'silu'``, ``'relu'``, ``'leaky_relu'``, ``'gelu'``
            (case-insensitive); ``None`` for an identity; or an existing
            ``nn.Module``, which is returned unchanged.
        inpace: value assigned to the ``inplace`` attribute of a newly built
            module that has one (the parameter name is kept as-is for
            backward compatibility).

    Returns:
        nn.Module, the activation.

    Raises:
        RuntimeError: if ``act`` is not a supported name.
    '''
    # These two cases must be handled before calling ``.lower()``, which only
    # exists on strings.
    if act is None:
        return nn.Identity()

    if isinstance(act, nn.Module):
        return act

    factories = {
        'silu': nn.SiLU,
        'relu': nn.ReLU,
        'leaky_relu': nn.LeakyReLU,
        'gelu': nn.GELU,
    }

    key = act.lower() if isinstance(act, str) else None
    if key not in factories:
        raise RuntimeError(
            'Unsupported activation {!r}; expected one of {}'.format(act, sorted(factories)))

    m = factories[key]()

    if hasattr(m, 'inplace'):
        m.inplace = inpace

    return m
def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
    """Return the download URL of a pretrained DLA checkpoint.

    The URL has the form
    ``http://dl.yf.io/dla/models/<data>/<name>-<hash>.pth``.
    """
    # URLs always use forward slashes; os.path.join would insert backslashes
    # on Windows, so the POSIX flavour is used explicitly.
    import posixpath

    return posixpath.join(
        'http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
class BottleneckX(nn.Module):
    """Bottleneck residual block whose 3x3 convolution is grouped.

    Layout: 1x1 conv -> BN -> ReLU -> grouped 3x3 conv -> BN -> ReLU ->
    1x1 conv -> BN, then the residual is added and a final ReLU applied.
    The number of groups is the class attribute ``cardinality``.
    """
    expansion = 2
    cardinality = 32

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        super(BottleneckX, self).__init__()
        groups = BottleneckX.cardinality
        mid_planes = planes * groups // 32

        self.conv1 = nn.Conv2d(inplanes, mid_planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes, momentum=BN_MOMENTUM)

        # Only this convolution carries the stride, dilation and grouping.
        self.conv2 = nn.Conv2d(
            mid_planes, mid_planes, kernel_size=3, stride=stride,
            padding=dilation, bias=False, dilation=dilation, groups=groups)
        self.bn2 = nn.BatchNorm2d(mid_planes, momentum=BN_MOMENTUM)

        self.conv3 = nn.Conv2d(mid_planes, planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)

        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

    def forward(self, x, residual=None):
        """Run the block; ``residual`` defaults to the input ``x``."""
        shortcut = x if residual is None else residual

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += shortcut
        return self.relu(out)
class Tree(nn.Module):
    """Recursive aggregation node of the DLA backbone.

    With ``levels == 1`` the node holds two ``block`` instances (``tree1``,
    ``tree2``) and a ``Root`` that fuses their outputs with any collected
    ``children``. With ``levels > 1`` both ``tree1`` and ``tree2`` are
    themselves ``Tree`` nodes of ``levels - 1`` and no ``Root`` is created at
    this level.

    Args:
        levels: recursion depth of the node.
        block: class used for the leaf blocks; called as
            ``block(in_channels, out_channels, stride, dilation=...)``.
        in_channels: channels of the input.
        out_channels: channels produced by the node.
        stride: stride of ``tree1``; values above 1 also add a max-pool
            ``downsample`` for the shortcut path.
        level_root: when true, the (down-sampled) input is appended to the
            children passed to the root, and ``in_channels`` is added to
            ``root_dim``.
        root_dim: input channels of the root; ``0`` selects
            ``2 * out_channels``.
        root_kernel_size: forwarded to ``Root``.
        dilation: forwarded to the blocks / sub-trees.
        root_residual: forwarded to ``Root``.
    """

    def __init__(
        self,
        levels,
        block,
        in_channels,
        out_channels,
        stride=1,
        level_root=False,
        root_dim=0,
        root_kernel_size=1,
        dilation=1,
        root_residual=False,
    ):
        super(Tree, self).__init__()
        if root_dim == 0:
            root_dim = 2 * out_channels
        if level_root:
            root_dim += in_channels
        if levels == 1:
            self.tree1 = block(in_channels, out_channels, stride, dilation=dilation)
            self.tree2 = block(out_channels, out_channels, 1, dilation=dilation)
        else:
            self.tree1 = Tree(
                levels - 1,
                block,
                in_channels,
                out_channels,
                stride,
                root_dim=0,
                root_kernel_size=root_kernel_size,
                dilation=dilation,
                root_residual=root_residual,
            )
            # The second sub-tree's root also receives this level's tree1
            # output as a child, hence the extra out_channels in root_dim.
            self.tree2 = Tree(
                levels - 1,
                block,
                out_channels,
                out_channels,
                root_dim=root_dim + out_channels,
                root_kernel_size=root_kernel_size,
                dilation=dilation,
                root_residual=root_residual,
            )
        if levels == 1:
            self.root = Root(root_dim, out_channels, root_kernel_size, root_residual)
        self.level_root = level_root
        self.root_dim = root_dim
        self.downsample = None
        self.project = None
        self.levels = levels
        if stride > 1:
            self.downsample = nn.MaxPool2d(stride, stride=stride)
        # A 1x1 projection for the shortcut is only built on leaf-level nodes
        # whose channel count changes.
        if levels == 1 and in_channels != out_channels:
            self.project = nn.Sequential(
                nn.Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=1, bias=False
                ),
                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM),
            )

    def forward(self, x, residual=None, children=None):
        """Run the node.

        ``residual`` is accepted for signature compatibility but is always
        recomputed from ``x`` below. ``children`` is the list of feature maps
        collected so far for the next root; it is extended in place.
        """
        children = [] if children is None else children
        # Shortcut path: optional max-pool, then optional 1x1 projection.
        bottom = self.downsample(x) if self.downsample else x
        residual = self.project(bottom) if self.project else bottom
        if self.level_root:
            children.append(bottom)
        x1 = self.tree1(x, residual)
        if self.levels == 1:
            x2 = self.tree2(x1)
            x = self.root(x2, x1, *children)
        else:
            # Deeper node: hand tree1's output down so the root inside tree2
            # can aggregate it.
            children.append(x1)
            x = self.tree2(x1, children=children)
        return x
@register
class DLANet(nn.Module):
    """Registered wrapper that builds a DLA backbone by name.

    Args:
        dla: name of the backbone builder; only ``'dla34'`` is available.
        pretrained: forwarded to the builder.
        levels: forwarded to the builder as ``levels``.
        in_channels: forwarded to the builder as ``in_channels`` and stored
            on the module.
        return_index: indices into the backbone's output list; ``forward``
            returns the contiguous slice from ``min(return_index)`` to
            ``max(return_index)`` inclusive.
        cfg: stored on the module, not otherwise used here.

    Raises:
        ValueError: if ``dla`` does not name a known builder.
    """
    def __init__(
        self,
        dla='dla34',
        pretrained=True,
        # NOTE: the list defaults are kept for interface compatibility; they
        # are stored or forwarded but never mutated by this class.
        levels=[1, 1, 1, 2, 2, 1],
        in_channels=[16, 32, 64, 128, 256, 512],
        return_index = [1, 2, 3],
        cfg=None,
    ):
        super(DLANet, self).__init__()
        self.cfg = cfg
        self.in_channels = in_channels

        # Explicit lookup instead of eval(): ``dla`` comes from configuration
        # and must not be executed as arbitrary Python.
        builders = {'dla34': dla34}
        try:
            build = builders[dla]
        except KeyError:
            raise ValueError(
                'Unknown DLA variant {!r}; expected one of {}'.format(dla, sorted(builders))
            ) from None

        self.model = build(
            pretrained=pretrained, levels=levels, in_channels=in_channels
        )
        self.return_index = return_index

    def forward(self, x):
        """Return the backbone outputs between the smallest and largest index."""
        x = self.model(x)
        max_list = max(self.return_index)
        min_list = min(self.return_index)
        return x[min_list:max_list+1]
class BasicBlock(nn.Module):
    """Two-conv residual block (3x3 -> 3x3) with an optional projection shortcut.

    When ``shortcut`` is true the input is added unchanged. Otherwise a
    projection is built: for variant ``'d'`` with ``stride == 2`` it is an
    average pool followed by a 1x1 ``ConvNormLayer``, else a single strided
    1x1 ``ConvNormLayer``.
    """

    expansion = 1

    def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'):
        super().__init__()

        self.shortcut = shortcut

        # Sub-module names and creation order are kept so state_dict keys and
        # parameter initialisation order match existing checkpoints.
        if not shortcut:
            use_pooled_projection = (variant == 'd' and stride == 2)
            if use_pooled_projection:
                projection = OrderedDict()
                projection['pool'] = nn.AvgPool2d(2, 2, 0, ceil_mode=True)
                projection['conv'] = ConvNormLayer(ch_in, ch_out, 1, 1)
                self.short = nn.Sequential(projection)
            else:
                self.short = ConvNormLayer(ch_in, ch_out, 1, stride)

        self.branch2a = ConvNormLayer(ch_in, ch_out, 3, stride, act=act)
        self.branch2b = ConvNormLayer(ch_out, ch_out, 3, 1, act=None)

        if act is None:
            self.act = nn.Identity()
        else:
            self.act = get_activation(act)

    def forward(self, x):
        """Return ``act(branch2b(branch2a(x)) + residual)``."""
        residual = x if self.shortcut else self.short(x)
        main = self.branch2b(self.branch2a(x))
        return self.act(main + residual)
@register
class PResNet(nn.Module):
    """ResNet-style backbone that returns feature maps from selected stages.

    Args:
        depth: Key into ``ResNet_cfg`` (block counts per stage) and, when
            ``pretrained`` is true, into ``donwload_url``. Depths ``>= 50`` use
            ``BottleNeck`` blocks, smaller depths use ``BasicBlock``.
        variant: Variants ``'c'`` and ``'d'`` use a three-layer 3x3 stem;
            anything else uses a single 7x7 stem layer. Also forwarded to
            every block.
        num_stages: Number of residual stages to build.
        return_idx: Stage indices whose outputs ``forward`` returns.
        act: Activation name forwarded to the stem and the blocks.
        freeze_at: If ``>= 0``, the stem and the first
            ``min(freeze_at, num_stages)`` stages get ``requires_grad=False``.
        freeze_norm: If true, every ``nn.BatchNorm2d`` is replaced by a
            ``FrozenBatchNorm2d`` with the same ``num_features``.
        pretrained: If true, download and load the weights for ``depth``.

    Attributes:
        out_channels: Channel count for each returned stage.
        out_strides: Stride for each returned stage, taken from ``[4, 8, 16, 32]``.
    """

    def __init__(
        self,
        depth,
        variant='d',
        num_stages=4,
        return_idx=[0, 1, 2, 3],
        act='relu',
        freeze_at=-1,
        freeze_norm=True,
        pretrained=False):
        super().__init__()

        block_nums = ResNet_cfg[depth]
        ch_in = 64
        if variant in ['c', 'd']:
            conv_def = [
                [3, ch_in // 2, 3, 2, "conv1_1"],
                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
            ]
        else:
            conv_def = [[3, ch_in, 7, 2, "conv1_1"]]

        self.conv1 = nn.Sequential(OrderedDict([
            (_name, ConvNormLayer(c_in, c_out, k, s, act=act)) for c_in, c_out, k, s, _name in conv_def
        ]))

        ch_out_list = [64, 128, 256, 512]
        block = BottleNeck if depth >= 50 else BasicBlock

        _out_channels = [block.expansion * v for v in ch_out_list]
        _out_strides = [4, 8, 16, 32]

        self.res_layers = nn.ModuleList()
        for i in range(num_stages):
            stage_num = i + 2
            self.res_layers.append(
                Blocks(block, ch_in, ch_out_list[i], block_nums[i], stage_num, act=act, variant=variant)
            )
            ch_in = _out_channels[i]

        # Copy so the instance never aliases the shared default list.
        self.return_idx = list(return_idx)
        self.out_channels = [_out_channels[_i] for _i in return_idx]
        self.out_strides = [_out_strides[_i] for _i in return_idx]

        if freeze_at >= 0:
            self._freeze_parameters(self.conv1)
            for i in range(min(freeze_at, num_stages)):
                self._freeze_parameters(self.res_layers[i])

        if freeze_norm:
            self._freeze_norm(self)

        if pretrained:
            # map_location='cpu' lets a checkpoint saved from a GPU load on a
            # CPU-only host; load_state_dict copies values into the existing
            # parameters, so their device does not change.
            state = torch.hub.load_state_dict_from_url(donwload_url[depth], map_location='cpu')
            self.load_state_dict(state)
            print(f'Load PResNet{depth} state_dict')

    def _freeze_parameters(self, m: nn.Module):
        """Disable gradients for every parameter of ``m``."""
        for p in m.parameters():
            p.requires_grad = False

    def _freeze_norm(self, m: nn.Module):
        """Recursively replace ``nn.BatchNorm2d`` children with ``FrozenBatchNorm2d``.

        Returns the replacement when ``m`` itself is a ``BatchNorm2d``,
        otherwise ``m`` with its children replaced in place.
        """
        if isinstance(m, nn.BatchNorm2d):
            m = FrozenBatchNorm2d(m.num_features)
        else:
            for name, child in m.named_children():
                _child = self._freeze_norm(child)
                if _child is not child:
                    setattr(m, name, _child)
        return m

    def forward(self, x):
        """Return the outputs of the stages listed in ``return_idx``, in stage order."""
        conv1 = self.conv1(x)
        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
        outs = []
        for idx, stage in enumerate(self.res_layers):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs
class _ResNet(nn.Module):
    """Small ResNet classifier: 3x3 stem, four residual stages, linear head.

    ``block`` is called as ``block(in_planes, planes, stride)`` and must expose
    an ``expansion`` class attribute. ``num_blocks`` gives the block count for
    each of the four stages.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # (planes, first-block stride) for layer1..layer4, registered in order.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for idx, (planes, stride) in enumerate(stage_specs):
            stage = self._make_layer(block, planes, num_blocks[idx], stride=stride)
            setattr(self, 'layer{}'.format(idx + 1), stage)

        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            # Later blocks (and the next stage) consume this block's output width.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class logits for a batch of images."""
        feat = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feat = stage(feat)
        feat = F.avg_pool2d(feat, 4)
        flat = feat.view(feat.size(0), -1)
        return self.linear(flat)
class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model

    It has a strong assumption that the modules have been registered
    into the model in the same order as they are used.
    This means that one should **not** reuse the same nn.Module
    twice in the forward if you want this to work.

    Additionally, it is only able to query submodules that are directly
    assigned to the model. So if `model` is passed, `model.feature1` can
    be returned, but not `model.feature1.layer2`.
    """

    _version = 3

    def __init__(self, model: nn.Module, return_layers: List[str]) -> None:
        child_names = [name for name, _ in model.named_children()]
        if not set(return_layers).issubset(child_names):
            raise ValueError("return_layers are not present in model. {}".format(child_names))

        # Keep children in registration order up to and including the last
        # requested layer; anything after it is dropped.
        pending = {str(k) for k in return_layers}
        kept = OrderedDict()
        for name, module in model.named_children():
            kept[name] = module
            pending.discard(name)
            if not pending:
                break

        super().__init__(kept)
        self.return_layers = return_layers

    def forward(self, x):
        """Run the kept children in order and return the requested outputs as a list."""
        collected = []
        for layer_name, layer in self.items():
            x = layer(x)
            if layer_name in self.return_layers:
                collected.append(x)
        return collected
--git a/rtdetr_pytorch/src/optim/amp.py b/rtdetr_pytorch/src/optim/amp.py new file mode 100644 index 0000000..e43d021 --- /dev/null +++ b/rtdetr_pytorch/src/optim/amp.py @@ -0,0 +1,12 @@ +import torch +import torch.nn as nn +import torch.cuda.amp as amp + + +from src.core import register +import src.misc.dist as dist + + +__all__ = ['GradScaler'] + +GradScaler = register(amp.grad_scaler.GradScaler) diff --git a/rtdetr_pytorch/src/optim/ema.py b/rtdetr_pytorch/src/optim/ema.py new file mode 100644 index 0000000..bf962b3 --- /dev/null +++ b/rtdetr_pytorch/src/optim/ema.py @@ -0,0 +1,115 @@ +""" +reference: +https://github.com/ultralytics/yolov5/blob/master/utils/torch_utils.py#L404 + +by lyuwenyu +""" + +import torch +import torch.nn as nn + +import math +from copy import deepcopy + + + +from src.core import register +import src.misc.dist as dist + + +__all__ = ['ModelEMA'] + + + +@register +class ModelEMA(object): + """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models + Keep a moving average of everything in the model state_dict (parameters and buffers). + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + A smoothed version of the weights is necessary for some training schemes to perform well. + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. 
+ """ + def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=2000): + super().__init__() + + # Create EMA + self.module = deepcopy(dist.de_parallel(model)).eval() # FP32 EMA + + # if next(model.parameters()).device.type != 'cpu': + # self.module.half() # FP16 EMA + + self.decay = decay + self.warmups = warmups + self.updates = 0 # number of EMA updates + # self.filter_no_grad = filter_no_grad + self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups)) # decay exponential ramp (to help early epochs) + + for p in self.module.parameters(): + p.requires_grad_(False) + + def update(self, model: nn.Module): + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay_fn(self.updates) + + msd = dist.de_parallel(model).state_dict() + for k, v in self.module.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1 - d) * msd[k].detach() + + def to(self, *args, **kwargs): + self.module = self.module.to(*args, **kwargs) + return self + + def update_attr(self, model, include=(), exclude=('process_group', 'reducer')): + # Update EMA attributes + self.copy_attr(self.module, model, include, exclude) + + @staticmethod + def copy_attr(a, b, include=(), exclude=()): + # Copy attributes from b to a, options to only include [...] and to exclude [...] 
class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
    """Maintains moving averages of model parameters using an exponential decay.

    ``ema_avg = d * avg_model_param + (1 - d) * model_param`` where
    ``d = decay * (1 - exp(-num_averaged / warmups))``, so the effective decay
    ramps up from 0 towards ``decay`` as more updates are averaged.
    ``torch.optim.swa_utils.AveragedModel`` is used to compute the EMA.

    Args:
        model: Model whose parameters are averaged.
        decay: Upper bound of the decay factor.
        device: Device for the averaged copy.
        use_buffers: Forwarded to ``AveragedModel``.
        warmups: Time constant of the decay ramp. Defaults to 2000, the value
            that was previously hard-coded.

    Raises:
        ValueError: If ``warmups`` is not positive.
    """
    def __init__(self, model, decay, device="cpu", use_buffers=True, warmups=2000):
        if warmups <= 0:
            raise ValueError('warmups must be positive, got {}'.format(warmups))

        def decay_fn(x):
            # float() accepts both plain numbers and the 0-dim tensor that
            # AveragedModel passes as ``num_averaged``.
            return decay * (1 - math.exp(-float(x) / warmups))

        def ema_avg(avg_model_param, model_param, num_averaged):
            d = decay_fn(num_averaged)
            return d * avg_model_param + (1 - d) * model_param

        # Pass avg_fn by keyword so the call does not depend on the positional
        # layout of AveragedModel's signature.
        super().__init__(model, device, avg_fn=ema_avg, use_buffers=use_buffers)

        # Assigned after nn.Module initialisation; kept as a public attribute
        # for callers that read ``decay_fn``.
        self.decay_fn = decay_fn
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_norm: float = 0, **kwargs):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Called as ``model(samples, targets)``.
        criterion: Called as ``criterion(outputs, targets)``; must return a
            dict of losses, whose values are summed into the training loss.
        data_loader: Yields ``(samples, targets)`` where ``targets`` is a list
            of dicts of tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device that samples and targets are moved to.
        epoch: Epoch number, used only for the log header.
        max_norm: If ``> 0``, gradients are clipped to this norm.
        **kwargs: Optional ``print_freq`` (default 10), ``ema`` (updated after
            each optimizer step) and ``scaler`` (enables the mixed-precision
            path when not ``None``).

    Returns:
        Dict mapping each meter name to its global average.

    Exits the process with status 1 if the reduced loss is not finite.
    """
    model.train()
    criterion.train()
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = kwargs.get('print_freq', 10)

    ema = kwargs.get('ema', None)
    scaler = kwargs.get('scaler', None)

    # torch.autocast expects a bare device type such as 'cuda'. str(device)
    # yields 'cuda:0' for an indexed device, which autocast rejects.
    device_type = torch.device(device).type

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        if scaler is not None:
            with torch.autocast(device_type=device_type, cache_enabled=True):
                outputs = model(samples, targets)

            # The loss is computed with autocast disabled.
            with torch.autocast(device_type=device_type, enabled=False):
                loss_dict = criterion(outputs, targets)

            loss = sum(loss_dict.values())
            scaler.scale(loss).backward()

            if max_norm > 0:
                # Unscale first so clipping sees the true gradient magnitudes.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        else:
            outputs = model(samples, targets)
            loss_dict = criterion(outputs, targets)

            loss = sum(loss_dict.values())
            optimizer.zero_grad()
            loss.backward()

            if max_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            optimizer.step()

        # ema
        if ema is not None:
            ema.update(model)

        loss_dict_reduced = reduce_dict(loss_dict)
        loss_value = sum(loss_dict_reduced.values())

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        metric_logger.update(loss=loss_value, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
output_dir=os.path.join(output_dir, "panoptic_eval"), + # ) + + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + # with torch.autocast(device_type=str(device)): + # outputs = model(samples) + + outputs = model(samples) + + # loss_dict = criterion(outputs, targets) + # weight_dict = criterion.weight_dict + # # reduce losses over all GPUs for logging purposes + # loss_dict_reduced = reduce_dict(loss_dict) + # loss_dict_reduced_scaled = {k: v * weight_dict[k] + # for k, v in loss_dict_reduced.items() if k in weight_dict} + # loss_dict_reduced_unscaled = {f'{k}_unscaled': v + # for k, v in loss_dict_reduced.items()} + # metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + # **loss_dict_reduced_scaled, + # **loss_dict_reduced_unscaled) + # metric_logger.update(class_error=loss_dict_reduced['class_error']) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + results = postprocessors(outputs, orig_target_sizes) + # results = postprocessors(outputs, targets) + + # if 'segm' in postprocessors.keys(): + # target_sizes = torch.stack([t["size"] for t in targets], dim=0) + # results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) + + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + # if panoptic_evaluator is not None: + # res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes) + # for i, target in enumerate(targets): + # image_id = target["image_id"].item() + # file_name = f"{image_id:012d}.png" + # res_pano[i]["image_id"] = image_id + # res_pano[i]["file_name"] = file_name + # panoptic_evaluator.update(res_pano) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + if 
coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + if panoptic_evaluator is not None: + panoptic_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + if coco_evaluator is not None: + coco_evaluator.accumulate() + coco_evaluator.summarize() + + # panoptic_res = None + # if panoptic_evaluator is not None: + # panoptic_res = panoptic_evaluator.summarize() + + stats = {} + # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + if coco_evaluator is not None: + if 'bbox' in iou_types: + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + if 'segm' in iou_types: + stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + + # if panoptic_res is not None: + # stats['PQ_all'] = panoptic_res["All"] + # stats['PQ_th'] = panoptic_res["Things"] + # stats['PQ_st'] = panoptic_res["Stuff"] + + return stats, coco_evaluator + + + diff --git a/rtdetr_pytorch/src/solver/det_solver.py b/rtdetr_pytorch/src/solver/det_solver.py new file mode 100644 index 0000000..d0a0a84 --- /dev/null +++ b/rtdetr_pytorch/src/solver/det_solver.py @@ -0,0 +1,104 @@ +''' +by lyuwenyu +''' +import time +import json +import datetime + +import torch + +from src.misc import dist +from src.data import get_coco_api_from_dataset + +from .solver import BaseSolver +from .det_engine import train_one_epoch, evaluate + + +class DetSolver(BaseSolver): + + def fit(self, ): + print("Start training") + self.train() + + args = self.cfg + + n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset) + # best_stat = {'coco_eval_bbox': 0, 'coco_eval_masks': 0, 'epoch': -1, } + best_stat = {'epoch': -1, } + + start_time = time.time() + for epoch in range(self.last_epoch + 1, args.epoches): + if dist.is_dist_available_and_initialized(): + 
self.train_dataloader.sampler.set_epoch(epoch) + + train_stats = train_one_epoch( + self.model, self.criterion, self.train_dataloader, self.optimizer, self.device, epoch, + args.clip_max_norm, print_freq=args.log_step, ema=self.ema, scaler=self.scaler) + + self.lr_scheduler.step() + + if self.output_dir: + checkpoint_paths = [self.output_dir / 'checkpoint.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.checkpoint_step == 0: + checkpoint_paths.append(self.output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + dist.save_on_master(self.state_dict(epoch), checkpoint_path) + + module = self.ema.module if self.ema else self.model + test_stats, coco_evaluator = evaluate( + module, self.criterion, self.postprocessor, self.val_dataloader, base_ds, self.device, self.output_dir + ) + + # TODO + for k in test_stats.keys(): + if k in best_stat: + best_stat['epoch'] = epoch if test_stats[k][0] > best_stat[k] else best_stat['epoch'] + best_stat[k] = max(best_stat[k], test_stats[k][0]) + else: + best_stat['epoch'] = epoch + best_stat[k] = test_stats[k][0] + print('best_stat: ', best_stat) + + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if self.output_dir and dist.is_main_process(): + with (self.output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + # for evaluation logs + if coco_evaluator is not None: + (self.output_dir / 'eval').mkdir(exist_ok=True) + if "bbox" in coco_evaluator.coco_eval: + filenames = ['latest.pth'] + if epoch % 50 == 0: + filenames.append(f'{epoch:03}.pth') + for name in filenames: + torch.save(coco_evaluator.coco_eval["bbox"].eval, + self.output_dir / "eval" / name) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + + 
class BaseSolver(object):
    """Base class that wires up model, criterion, optimizer and checkpoints from a config.

    Subclasses implement ``fit`` and ``val``.
    """

    def __init__(self, cfg: BaseConfig) -> None:
        self.cfg = cfg

    def setup(self, ):
        '''Avoid instantiating unnecessary classes
        '''
        cfg = self.cfg
        device = cfg.device
        self.device = device
        self.last_epoch = cfg.last_epoch

        self.model = dist.warp_model(cfg.model.to(device), cfg.find_unused_parameters, cfg.sync_bn)
        self.criterion = cfg.criterion.to(device)
        self.postprocessor = cfg.postprocessor

        # NOTE (lvwenyu): should load_tuning_state before ema instance building
        if self.cfg.tuning:
            print(f'Tuning checkpoint from {self.cfg.tuning}')
            self.load_tuning_state(self.cfg.tuning)

        self.scaler = cfg.scaler
        self.ema = cfg.ema.to(device) if cfg.ema is not None else None

        self.output_dir = Path(cfg.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def train(self, ):
        """Prepare everything needed for training (optimizer, scheduler, loaders)."""
        self.setup()
        self.optimizer = self.cfg.optimizer
        self.lr_scheduler = self.cfg.lr_scheduler

        # NOTE instantiating order
        if self.cfg.resume:
            print(f'Resume checkpoint from {self.cfg.resume}')
            self.resume(self.cfg.resume)

        self.train_dataloader = dist.warp_loader(self.cfg.train_dataloader,
            shuffle=self.cfg.train_dataloader.shuffle)
        self.val_dataloader = dist.warp_loader(self.cfg.val_dataloader,
            shuffle=self.cfg.val_dataloader.shuffle)

    def eval(self, ):
        """Prepare everything needed for evaluation (no optimizer or scheduler)."""
        self.setup()
        self.val_dataloader = dist.warp_loader(self.cfg.val_dataloader,
            shuffle=self.cfg.val_dataloader.shuffle)

        if self.cfg.resume:
            print(f'resume from {self.cfg.resume}')
            self.resume(self.cfg.resume)

    def state_dict(self, last_epoch=None):
        '''Collect a checkpoint dict.

        Args:
            last_epoch: Epoch to record. When omitted, ``self.last_epoch`` is
                used, which lets ``save`` call this without arguments.
        '''
        if last_epoch is None:
            last_epoch = getattr(self, 'last_epoch', -1)

        state = {}
        state['model'] = dist.de_parallel(self.model).state_dict()
        state['date'] = datetime.now().isoformat()

        # TODO
        state['last_epoch'] = last_epoch

        # optimizer / lr_scheduler are only set by train(); use getattr so a
        # solver prepared with eval() can still be serialised.
        optimizer = getattr(self, 'optimizer', None)
        if optimizer is not None:
            state['optimizer'] = optimizer.state_dict()

        lr_scheduler = getattr(self, 'lr_scheduler', None)
        if lr_scheduler is not None:
            state['lr_scheduler'] = lr_scheduler.state_dict()

        ema = getattr(self, 'ema', None)
        if ema is not None:
            state['ema'] = ema.state_dict()

        scaler = getattr(self, 'scaler', None)
        if scaler is not None:
            state['scaler'] = scaler.state_dict()

        return state

    def load_state_dict(self, state):
        '''Restore every component that exists on this solver and is present in ``state``.
        '''
        # Compare against None rather than truthiness: an epoch of 0 is a
        # valid value and must not be skipped.
        if 'last_epoch' in state:
            self.last_epoch = state['last_epoch']
            print('Loading last_epoch')

        if getattr(self, 'model', None) is not None and 'model' in state:
            if dist.is_parallel(self.model):
                self.model.module.load_state_dict(state['model'])
            else:
                self.model.load_state_dict(state['model'])
            print('Loading model.state_dict')

        if getattr(self, 'ema', None) is not None and 'ema' in state:
            self.ema.load_state_dict(state['ema'])
            print('Loading ema.state_dict')

        if getattr(self, 'optimizer', None) is not None and 'optimizer' in state:
            self.optimizer.load_state_dict(state['optimizer'])
            print('Loading optimizer.state_dict')

        if getattr(self, 'lr_scheduler', None) is not None and 'lr_scheduler' in state:
            self.lr_scheduler.load_state_dict(state['lr_scheduler'])
            print('Loading lr_scheduler.state_dict')

        if getattr(self, 'scaler', None) is not None and 'scaler' in state:
            self.scaler.load_state_dict(state['scaler'])
            print('Loading scaler.state_dict')

    def save(self, path):
        '''Save the current state (recorded at ``self.last_epoch``) to ``path``.
        '''
        state = self.state_dict()
        dist.save_on_master(state, path)

    def resume(self, path):
        '''Load a full checkpoint from ``path``.
        '''
        # for cuda:0 memory
        state = torch.load(path, map_location='cpu')
        self.load_state_dict(state)

    def load_tuning_state(self, path,):
        """only load model for tuning and skip missed/dismatched keys
        """
        if 'http' in path:
            state = torch.hub.load_state_dict_from_url(path, map_location='cpu')
        else:
            state = torch.load(path, map_location='cpu')

        module = dist.de_parallel(self.model)

        # TODO hard code
        if 'ema' in state:
            stat, infos = self._matched_state(module.state_dict(), state['ema']['module'])
        else:
            stat, infos = self._matched_state(module.state_dict(), state['model'])

        module.load_state_dict(stat, strict=False)
        print(f'Load model.state_dict, {infos}')

    @staticmethod
    def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tensor]):
        """Select entries of ``params`` whose key and shape match ``state``.

        Returns:
            ``(matched, info)`` where ``info`` lists keys of ``state`` that are
            missing from ``params`` (``'missed'``) or whose shapes differ
            (``'unmatched'``).
        """
        missed_list = []
        unmatched_list = []
        matched_state = {}
        for k, v in state.items():
            if k in params:
                if v.shape == params[k].shape:
                    matched_state[k] = params[k]
                else:
                    unmatched_list.append(k)
            else:
                missed_list.append(k)

        return matched_state, {'missed': missed_list, 'unmatched': unmatched_list}

    def fit(self, ):
        raise NotImplementedError('')

    def val(self, ):
        raise NotImplementedError('')
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center-x, center-y, width, height) to (x0, y0, x1, y1).

    The last dimension of ``x`` holds the four box values; leading dimensions
    are preserved.
    """
    center_x, center_y, width, height = x.unbind(-1)
    half_w = 0.5 * width
    half_h = 0.5 * height
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=-1)
0] * wh[:, :, 1] + + return iou - (area - union) / area + + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) \ No newline at end of file diff --git a/rtdetr_pytorch/src/zoo/rtdetr/denoising.py b/rtdetr_pytorch/src/zoo/rtdetr/denoising.py new file mode 100644 index 0000000..6830752 --- /dev/null +++ b/rtdetr_pytorch/src/zoo/rtdetr/denoising.py @@ -0,0 +1,125 @@ +"""by lyuwenyu +""" + +import torch + +from .utils import inverse_sigmoid +from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh + + + +def get_contrastive_denoising_training_group(targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0,): + """cnd""" + if num_denoising <= 0: + return None, None, None, None + + num_gts = [len(t['labels']) for t in targets] + device = targets[0]['labels'].device + + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(num_gts) + + input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device) + input_query_bbox = torch.zeros([bs, 
max_gt_num, 4], device=device) + pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device) + + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets[i]['labels'] + input_query_bbox[i, :num_gt] = targets[i]['boxes'] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. + input_query_class = input_query_class.tile([1, 2 * num_group]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) + # positive and negative mask + negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] + dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 2 * num_group) + + if label_noise_ratio > 0: + mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) + # randomly put a new one here + new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) + input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) + + # if label_noise_ratio > 0: + # input_query_class = input_query_class.flatten() + # pad_gt_mask = pad_gt_mask.flatten() + # # half of bbox prob + # # mask = torch.rand(input_query_class.shape, device=device) < (label_noise_ratio * 0.5) + # mask = torch.rand_like(input_query_class) < (label_noise_ratio * 0.5) + # chosen_idx = torch.nonzero(mask * pad_gt_mask).squeeze(-1) + # # randomly put a new one here + # new_label = torch.randint_like(chosen_idx, 0, num_classes, dtype=input_query_class.dtype) + # # input_query_class.scatter_(dim=0, 
index=chosen_idx, value=new_label) + # input_query_class[chosen_idx] = new_label + # input_query_class = input_query_class.reshape(bs, num_denoising) + # pad_gt_mask = pad_gt_mask.reshape(bs, num_denoising) + + if box_noise_scale > 0: + known_bbox = box_cxcywh_to_xyxy(input_query_bbox) + diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale + rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = torch.rand_like(input_query_bbox) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) + rand_part *= rand_sign + known_bbox += rand_part * diff + known_bbox.clip_(min=0.0, max=1.0) + input_query_bbox = box_xyxy_to_cxcywh(known_bbox) + input_query_bbox = inverse_sigmoid(input_query_bbox) + + # class_embed = torch.concat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=device)]) + # input_query_class = torch.gather( + # class_embed, input_query_class.flatten(), + # axis=0).reshape(bs, num_denoising, -1) + # input_query_class = class_embed(input_query_class.flatten()).reshape(bs, num_denoising, -1) + input_query_class = class_embed(input_query_class) + + tgt_size = num_denoising + num_queries + # attn_mask = torch.ones([tgt_size, tgt_size], device=device) < 0 + attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device) + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + + # reconstruct cannot see each other + for i in range(num_group): + if i == 0: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True + if i == num_group - 1: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True + else: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True + + dn_meta = { + "dn_positive_idx": 
dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries] + } + + # print(input_query_class.shape) # torch.Size([4, 196, 256]) + # print(input_query_bbox.shape) # torch.Size([4, 196, 4]) + # print(attn_mask.shape) # torch.Size([496, 496]) + + return input_query_class, input_query_bbox, attn_mask, dn_meta diff --git a/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py b/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py new file mode 100644 index 0000000..804db69 --- /dev/null +++ b/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py @@ -0,0 +1,322 @@ +'''by lyuwenyu +''' + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import get_activation + +from src.core import register + + +__all__ = ['HybridEncoder'] + + + +class ConvNormLayer(nn.Module): + def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None): + super().__init__() + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size, + stride, + padding=(kernel_size-1)//2 if padding is None else padding, + bias=bias) + self.norm = nn.BatchNorm2d(ch_out) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + return self.act(self.norm(self.conv(x))) + + +class RepVggBlock(nn.Module): + def __init__(self, ch_in, ch_out, act='relu'): + super().__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None) + self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + y = self.conv1(x) + self.conv2(x) + + return self.act(y) + + def convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1) + + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.data = kernel + self.conv.bias.data 
= bias + # self.__delattr__('conv1') + # self.__delattr__('conv2') + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return F.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: ConvNormLayer): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.norm.running_mean + running_var = branch.norm.running_var + gamma = branch.norm.weight + beta = branch.norm.bias + eps = branch.norm.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class CSPRepLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + num_blocks=3, + expansion=1.0, + bias=None, + act="silu"): + super(CSPRepLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act) + self.conv2 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(*[ + RepVggBlock(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks) + ]) + if hidden_channels != out_channels: + self.conv3 = ConvNormLayer(hidden_channels, out_channels, 1, 1, bias=bias, act=act) + else: + self.conv3 = nn.Identity() + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + return self.conv3(x_1 + x_2) + + +# transformer +class TransformerEncoderLayer(nn.Module): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False): + super().__init__() + self.normalize_before = normalize_before + + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, 
batch_first=True) + + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.activation = get_activation(activation) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + residual = src + if self.normalize_before: + src = self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +@register +class HybridEncoder(nn.Module): + def __init__(self, + in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + hidden_dim=256, + nhead=8, + dim_feedforward = 1024, + dropout=0.0, + enc_act='gelu', + use_encoder_idx=[2], + num_encoder_layers=1, + pe_temperature=10000, + expansion=1.0, + depth_mult=1.0, + act='silu', + eval_spatial_size=None): + 
super().__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_spatial_size = eval_spatial_size + + self.out_channels = [hidden_dim for _ in range(len(in_channels))] + self.out_strides = feat_strides + + # channel projection + self.input_proj = nn.ModuleList() + for in_channel in in_channels: + self.input_proj.append( + nn.Sequential( + nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False), + nn.BatchNorm2d(hidden_dim) + ) + ) + + # encoder transformer + encoder_layer = TransformerEncoderLayer( + hidden_dim, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=enc_act) + + self.encoder = nn.ModuleList([ + TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx)) + ]) + + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append(ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act)) + self.fpn_blocks.append( + CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion) + ) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1): + self.downsample_convs.append( + ConvNormLayer(hidden_dim, hidden_dim, 3, 2, act=act) + ) + self.pan_blocks.append( + CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion) + ) + + self._reset_parameters() + + def _reset_parameters(self): + if self.eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_spatial_size[1] // stride, self.eval_spatial_size[0] // stride, + self.hidden_dim, self.pe_temperature) + 
setattr(self, f'pos_embed{idx}', pos_embed) + # self.register_buffer(f'pos_embed{idx}', pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): + ''' + ''' + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij') + assert embed_dim % 4 == 0, \ + 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1. / (temperature ** omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward(self, feats): + assert len(feats) == len(self.in_channels) + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_spatial_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) + else: + pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device) + + memory = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + # print([x.is_contiguous() for x in proj_feats ]) + + # broadcasting and fusion + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_high = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_high) + inner_outs[0] = feat_high + upsample_feat = 
F.interpolate(feat_high, scale_factor=2., mode='nearest') + inner_out = self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1)) + inner_outs.insert(0, inner_out) + + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_high], dim=1)) + outs.append(out) + + return outs diff --git a/rtdetr_pytorch/src/zoo/rtdetr/matcher.py b/rtdetr_pytorch/src/zoo/rtdetr/matcher.py new file mode 100644 index 0000000..cf9dec1 --- /dev/null +++ b/rtdetr_pytorch/src/zoo/rtdetr/matcher.py @@ -0,0 +1,108 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +Modules to compute the matching cost and solve the corresponding LSAP. + +by lyuwenyu +""" + +import torch +import torch.nn.functional as F + +from scipy.optimize import linear_sum_assignment +from torch import nn + +from .box_ops import box_cxcywh_to_xyxy, generalized_box_iou + +from src.core import register + + +@register +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). 
+ """ + + __share__ = ['use_focal_loss', ] + + def __init__(self, weight_dict, use_focal_loss=False, alpha=0.25, gamma=2.0): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = weight_dict['cost_class'] + self.cost_bbox = weight_dict['cost_bbox'] + self.cost_giou = weight_dict['cost_giou'] + + self.use_focal_loss = use_focal_loss + self.alpha = alpha + self.gamma = gamma + + assert self.cost_class != 0 or self.cost_bbox != 0 or self.cost_giou != 0, "all costs cant be 0" + + @torch.no_grad() + def forward(self, outputs, targets): + """ Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + if self.use_focal_loss: + out_prob = 
F.sigmoid(outputs["pred_logits"].flatten(0, 1)) + else: + out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + if self.use_focal_loss: + out_prob = out_prob[:, tgt_ids] + neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = self.alpha * ((1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class - neg_cost_class + else: + cost_class = -out_prob[:, tgt_ids] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost betwen boxes + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + + # Final cost matrix + C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = C.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py new file mode 100644 index 0000000..851d4f7 --- /dev/null +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py @@ -0,0 +1,44 @@ +"""by lyuwenyu +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import random +import numpy as np + +from src.core import register + + +__all__ = ['RTDETR', ] + + +@register 
+class RTDETR(nn.Module): + __inject__ = ['backbone', 'encoder', 'decoder', ] + + def __init__(self, backbone: nn.Module, encoder, decoder, multi_scale=None): + super().__init__() + self.backbone = backbone + self.decoder = decoder + self.encoder = encoder + self.multi_scale = multi_scale + + def forward(self, x, targets=None): + if self.multi_scale and self.training: + sz = np.random.choice(self.multi_scale) + x = F.interpolate(x, size=[sz, sz]) + + x = self.backbone(x) + x = self.encoder(x) + x = self.decoder(x, targets) + + return x + + def deploy(self, ): + self.eval() + for m in self.modules(): + if hasattr(m, 'convert_to_deploy'): + m.convert_to_deploy() + return self diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py new file mode 100644 index 0000000..3ce77c0 --- /dev/null +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py @@ -0,0 +1,341 @@ +""" +reference: +https://github.com/facebookresearch/detr/blob/main/models/detr.py + +by lyuwenyu +""" + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +# from torchvision.ops import box_convert, generalized_box_iou +from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou + +from src.misc.dist import get_world_size, is_dist_available_and_initialized +from src.core import register + + + +@register +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + __share__ = ['num_classes', ] + __inject__ = ['matcher', ] + + def __init__(self, matcher, weight_dict, losses, alpha=0.2, gamma=2.0, eos_coef=1e-4, num_classes=80): + """ Create the criterion. 
+ Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. + """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.losses = losses + + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = eos_coef + self.register_buffer('empty_weight', empty_weight) + + self.alpha = alpha + self.gamma = gamma + + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + def loss_labels_bce(self, outputs, targets, indices, num_boxes, log=True): + src_logits = outputs['pred_logits'] + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, 
device=src_logits.device) + target_classes[idx] = target_classes_o + + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + loss = F.binary_cross_entropy_with_logits(src_logits, target * 1., reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {'loss_bce': loss} + + def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True): + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + target = F.one_hot(target_classes, num_classes=self.num_classes+1)[..., :-1] + # ce_loss = F.binary_cross_entropy_with_logits(src_logits, target * 1., reduction="none") + # prob = F.sigmoid(src_logits) # TODO .detach() + # p_t = prob * target + (1 - prob) * (1 - target) + # alpha_t = self.alpha * target + (1 - self.alpha) * (1 - target) + # loss = alpha_t * ce_loss * ((1 - p_t) ** self.gamma) + # loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + + return {'loss_focal': loss} + + def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True): + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)) + ious = torch.diag(ious).detach() + + src_logits = outputs['pred_logits'] + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = 
torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_o[idx] = ious.to(target_score_o.dtype) + target_score = target_score_o.unsqueeze(-1) * target + + pred_score = F.sigmoid(src_logits).detach() + weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score + + loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {'loss_vfl': loss} + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + losses = {} + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(generalized_box_iou( + box_cxcywh_to_xyxy(src_boxes), + box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. + targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def 
_get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks, + + 'bce': self.loss_labels_bce, + 'focal': self.loss_labels_focal, + 'vfl': self.loss_labels_vfl, + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_available_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + losses.update(l_dict) + + # In case of auxiliary losses, we repeat this process with 
the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. + continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of cdn auxiliary losses. For rtdetr + if 'dn_aux_outputs' in outputs: + assert 'dn_meta' in outputs, '' + indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets) + num_boxes = num_boxes * outputs['dn_meta']['dn_num_group'] + + for i, aux_outputs in enumerate(outputs['dn_aux_outputs']): + # indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    """Compute the precision@k (in percent) for each requested k.

    Args:
        output: score tensor of shape ``[N, num_classes]``.
        target: class-index tensor with ``N`` elements.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one 0-dim tensor per entry of ``topk``. When ``target`` is
        empty every entry is zero.
    """
    if target.numel() == 0:
        # One zero per requested k, so the result can always be unpacked or
        # indexed according to ``topk``.
        return [torch.zeros([], device=output.device) for _ in topk]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # ``correct`` derives from a transposed tensor and may not be
        # contiguous; ``view(-1)`` can fail on such a slice when k > 1, while
        # ``reshape`` copies only if it has to.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
class MLP(nn.Module):
    """Stack of ``nn.Linear`` layers with an activation between them.

    The activation is applied after every layer whose index is below
    ``num_layers - 1``; the final layer's output is returned as is.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'):
        """Build the linear layers and the activation.

        Args:
            input_dim: feature size of the input.
            hidden_dim: feature size of every intermediate layer.
            output_dim: feature size of the output.
            num_layers: number of linear layers.
            act: activation spec passed to ``get_activation``; ``None`` selects
                ``nn.Identity``.
        """
        super().__init__()
        self.num_layers = num_layers
        # consecutive entries of ``widths`` are the (in, out) sizes of each layer
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )
        if act is None:
            self.act = nn.Identity()
        else:
            self.act = get_activation(act)

    def forward(self, x):
        """Run ``x`` through all layers and return the last layer's output."""
        last = self.num_layers - 1
        for depth, layer in enumerate(self.layers):
            x = layer(x)
            if depth < last:
                x = self.act(x)
        return x
def forward(self,
            query,
            reference_points,
            value,
            value_spatial_shapes,
            value_mask=None):
    """Multi-scale deformable attention forward pass.

    Args:
        query (Tensor): [bs, query_length, C]
        reference_points (Tensor): [bs, query_length, n_levels, 2] or
            [bs, query_length, n_levels, 4]; the last dimension selects how
            sampling locations are derived from the predicted offsets.
        value (Tensor): [bs, value_length, C]
        value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

    Returns:
        output (Tensor): [bs, Length_{query}, C]

    Raises:
        ValueError: if the last dimension of ``reference_points`` is neither 2 nor 4.
    """
    bs, Len_q = query.shape[:2]
    Len_v = value.shape[1]

    value = self.value_proj(value)
    if value_mask is not None:
        # torch.Tensor has no ``astype``; ``to`` performs the dtype cast.
        value_mask = value_mask.to(value.dtype).unsqueeze(-1)
        value = value * value_mask
    value = value.reshape(bs, Len_v, self.num_heads, self.head_dim)

    sampling_offsets = self.sampling_offsets(query).reshape(
        bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2)
    attention_weights = self.attention_weights(query).reshape(
        bs, Len_q, self.num_heads, self.num_levels * self.num_points)
    # normalise jointly over all levels and points, then split them again
    attention_weights = F.softmax(attention_weights, dim=-1).reshape(
        bs, Len_q, self.num_heads, self.num_levels, self.num_points)

    if reference_points.shape[-1] == 2:
        # Build the normaliser on the same device/dtype as the offsets so the
        # division below also works when the module is not on the CPU.
        offset_normalizer = torch.tensor(
            value_spatial_shapes,
            dtype=sampling_offsets.dtype,
            device=sampling_offsets.device)
        # (H, W) -> (W, H) so it lines up with the (x, y) offsets
        offset_normalizer = offset_normalizer.flip([1]).reshape(
            1, 1, 1, self.num_levels, 1, 2)
        sampling_locations = reference_points.reshape(
            bs, Len_q, 1, self.num_levels, 1, 2
        ) + sampling_offsets / offset_normalizer
    elif reference_points.shape[-1] == 4:
        # first two entries are the reference location, last two scale the offsets
        sampling_locations = (
            reference_points[:, :, None, :, None, :2] + sampling_offsets /
            self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5)
    else:
        raise ValueError(
            "Last dim of reference_points must be 2 or 4, but get {} instead.".
            format(reference_points.shape[-1]))

    output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights)

    output = self.output_proj(output)

    return output
self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn(\ + self.with_pos_embed(tgt, query_pos_embed), + reference_points, + memory, + memory_spatial_shapes, + memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt2 = self.forward_ffn(tgt) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt.clamp(min=-65504, max=65504)) + + return tgt + + +class TransformerDecoder(nn.Module): + def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): + super(TransformerDecoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)]) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + + def forward(self, + tgt, + ref_points_unact, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + memory_mask=None): + output = tgt + dec_out_bboxes = [] + dec_out_logits = [] + ref_points_detach = F.sigmoid(ref_points_unact) + + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach) + + output = layer(output, ref_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + + elif i == self.eval_idx: + dec_out_logits.append(score_head[i](output)) + dec_out_bboxes.append(inter_ref_bbox) + break + + ref_points = inter_ref_bbox + ref_points_detach = inter_ref_bbox.detach( + ) if self.training else inter_ref_bbox + + return torch.stack(dec_out_bboxes), 
def __init__(self,
             num_classes=80,
             hidden_dim=256,
             num_queries=300,
             position_embed_type='sine',
             feat_channels=[512, 1024, 2048],
             feat_strides=[8, 16, 32],
             num_levels=3,
             num_decoder_points=4,
             nhead=8,
             num_decoder_layers=6,
             dim_feedforward=1024,
             dropout=0.,
             activation="relu",
             num_denoising=100,
             label_noise_ratio=0.5,
             box_noise_scale=1.0,
             learnt_init_query=False,
             eval_spatial_size=None,
             eval_idx=-1,
             eps=1e-2,
             aux_loss=True):
    """Build the input projections, decoder, prediction heads and anchors.

    Args:
        num_classes: output size of every score head.
        hidden_dim: embedding width used throughout the module.
        num_queries: size of the learnt query embedding (when enabled); also
            stored on the instance.
        position_embed_type: must be ``'sine'`` or ``'learned'``.
        feat_channels: channel count of each incoming feature map.
        feat_strides: stride of each incoming feature map; extended by
            doubling the last stride until it has ``num_levels`` entries.
        num_levels: number of feature levels used by the decoder.
        num_decoder_points, nhead, dim_feedforward, dropout, activation:
            forwarded to ``TransformerDecoderLayer``.
        num_decoder_layers: number of decoder layers and of per-layer heads.
        num_denoising, label_noise_ratio, box_noise_scale: stored on the
            instance; a class embedding for denoising is created when
            ``num_denoising > 0``.
        learnt_init_query: create a learnt query embedding ``tgt_embed``.
        eval_spatial_size: if truthy, anchors are generated once here.
        eval_idx: forwarded to ``TransformerDecoder``.
        eps: stored on the instance.
        aux_loss: stored on the instance.
    """

    super(RTDETRTransformer, self).__init__()
    assert position_embed_type in ['sine', 'learned'], \
        f'ValueError: position_embed_type not supported {position_embed_type}!'
    assert len(feat_channels) <= num_levels
    assert len(feat_strides) == len(feat_channels)
    # Work on a private copy: appending to the argument itself would mutate
    # the shared default list (and any list owned by the caller), so a second
    # construction with extra levels would fail the length assert above.
    feat_strides = list(feat_strides)
    for _ in range(num_levels - len(feat_strides)):
        feat_strides.append(feat_strides[-1] * 2)

    self.hidden_dim = hidden_dim
    self.nhead = nhead
    self.feat_strides = feat_strides
    self.num_levels = num_levels
    self.num_classes = num_classes
    self.num_queries = num_queries
    self.eps = eps
    self.num_decoder_layers = num_decoder_layers
    self.eval_spatial_size = eval_spatial_size
    self.aux_loss = aux_loss

    # backbone feature projection
    self._build_input_proj_layer(feat_channels)

    # Transformer module
    decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points)
    self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx)

    self.num_denoising = num_denoising
    self.label_noise_ratio = label_noise_ratio
    self.box_noise_scale = box_noise_scale
    # denoising part
    if num_denoising > 0:
        # self.denoising_class_embed = nn.Embedding(num_classes, hidden_dim, padding_idx=num_classes-1) # TODO for load paddle weights
        self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes)

    # decoder embedding
    self.learnt_init_query = learnt_init_query
    if learnt_init_query:
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
    self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)

    # encoder head
    self.enc_output = nn.Sequential(
        nn.Linear(hidden_dim, hidden_dim),
        nn.LayerNorm(hidden_dim,)
    )
    self.enc_score_head = nn.Linear(hidden_dim, num_classes)
    self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

    # decoder head
    self.dec_score_head = nn.ModuleList([
        nn.Linear(hidden_dim, num_classes)
        for _ in range(num_decoder_layers)
    ])
    self.dec_bbox_head = nn.ModuleList([
        MLP(hidden_dim, hidden_dim, 4, num_layers=3)
        for _ in range(num_decoder_layers)
    ])

    # init encoder output anchors and valid_mask
    if self.eval_spatial_size:
        self.anchors, self.valid_mask = self._generate_anchors()

    self._reset_parameters()
+ self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim))]) + ) + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + level_start_index = [0, ] + for i, feat in enumerate(proj_feats): + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + # [l], start index of each level + level_start_index.append(h * w + level_start_index[-1]) + + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + level_start_index.pop() + return (feat_flatten, spatial_shapes, level_start_index) + + def _generate_anchors(self, + spatial_shapes=None, + grid_size=0.05, + dtype=torch.float32, + device='cpu'): + if spatial_shapes is None: + spatial_shapes = [[int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)] + for s in self.feat_strides + ] + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(\ + torch.arange(end=h, dtype=dtype), \ + torch.arange(end=w, dtype=dtype), indexing='ij') + grid_xy = torch.stack([grid_x, grid_y], -1) + valid_WH = torch.tensor([w, h]).to(dtype) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH + wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, h * w, 4)) + + anchors = torch.concat(anchors, 1).to(device) + valid_mask = ((anchors > self.eps) * (anchors < 1 - 
self.eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + # anchors = torch.where(valid_mask, anchors, float('inf')) + # anchors[valid_mask] = torch.inf # valid_mask [1, 8400, 1] + anchors = torch.where(valid_mask, anchors, torch.inf) + + return anchors, valid_mask + + + def _get_decoder_input(self, + memory, + spatial_shapes, + denoising_class=None, + denoising_bbox_unact=None): + bs, _, _ = memory.shape + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device) + + # memory = torch.where(valid_mask, memory, 0) + memory = valid_mask.to(memory.dtype) * memory # TODO fix type error for onnx export + + output_memory = self.enc_output(memory) + + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors + + _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1) + + reference_points_unact = enc_outputs_coord_unact.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_unact.shape[-1])) + + enc_topk_bboxes = F.sigmoid(reference_points_unact) + if denoising_bbox_unact is not None: + reference_points_unact = torch.concat( + [denoising_bbox_unact, reference_points_unact], 1) + + enc_topk_logits = enc_outputs_class.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])) + + # extract region features + if self.learnt_init_query: + target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + else: + target = output_memory.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1])) + target = target.detach() + + if denoising_class is not None: + target = torch.concat([denoising_class, target], 1) + + return target, reference_points_unact.detach(), enc_topk_bboxes, 
enc_topk_logits + + + def forward(self, feats, targets=None): + + # input projection and embedding + (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(targets, \ + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, ) + else: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact) + + # decoder + out_bboxes, out_logits = self.decoder( + target, + init_ref_points_unact, + memory, + spatial_shapes, + level_start_index, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + attn_mask=attn_mask) + + if self.training and dn_meta is not None: + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + + if self.training and self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) + out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) + + if self.training and dn_meta is not None: + out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out['dn_meta'] = dn_meta + + return out + + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
def forward(self, outputs, orig_target_sizes):
    """Turn raw model outputs into per-image labels, boxes and scores.

    Args:
        outputs: dict with ``'pred_logits'`` and ``'pred_boxes'``; boxes are
            converted from ``cxcywh`` to ``xyxy`` and multiplied by the
            image size.
        orig_target_sizes: tensor with one size pair per image, tiled to four
            values and used to scale the converted boxes.

    Returns:
        In deploy mode the tuple ``(labels, boxes, scores)``; otherwise a list
        with one ``dict(labels=..., boxes=..., scores=...)`` per image.
    """
    logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
    # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)

    bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
    bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)

    if self.use_focal_loss:
        scores = F.sigmoid(logits)
        # top-k over the flattened (query, class) grid; the flat index is
        # decomposed into a class label and a query index below
        scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
        labels = index % self.num_classes
        index = index // self.num_classes
        boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))

    else:
        # ``dim`` must be explicit: the deprecated implicit choice for a 3-D
        # input is dim 0, i.e. a softmax across the batch instead of across
        # classes. The last class column is dropped after normalisation.
        scores = F.softmax(logits, dim=-1)[:, :, :-1]
        scores, labels = scores.max(dim=-1)
        boxes = bbox_pred
        if scores.shape[1] > self.num_top_queries:
            scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
            labels = torch.gather(labels, dim=1, index=index)
            boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))

    # TODO for onnx export
    if self.deploy_mode:
        return labels, boxes, scores

    # TODO
    if self.remap_mscoco_category:
        from ...data.coco import mscoco_label2category
        labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\
            .to(boxes.device).reshape(labels.shape)

    results = []
    for lab, box, sco in zip(labels, boxes, scores):
        result = dict(labels=lab, boxes=box, scores=sco)
        results.append(result)

    return results
def deformable_attention_core_func(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Sample values at the given locations and combine them with attention weights.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2]
        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]

    Returns:
        output (Tensor): [bs, Length_{query}, C]
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, _, num_levels, num_points, _ = sampling_locations.shape

    # cut the flattened value sequence back into one chunk per level
    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)

    # map locations from [0, 1] to the [-1, 1] range used by grid_sample
    grids = 2 * sampling_locations - 1

    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # [bs, h*w, heads, c] -> [bs, h*w, heads*c] -> [bs, heads*c, h*w] -> [bs*heads, c, h, w]
        feat = per_level_values[level].flatten(2).permute(0, 2, 1)
        feat = feat.reshape(batch * heads, head_dim, h, w)
        # [bs, Lq, heads, P, 2] -> [bs, heads, Lq, P, 2] -> [bs*heads, Lq, P, 2]
        grid = grids[:, :, :, level].permute(0, 2, 1, 3, 4).flatten(0, 1)
        # [bs*heads, c, Lq, P]
        sampled.append(
            F.grid_sample(
                feat,
                grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False))

    # [bs, Lq, heads, L, P] -> [bs, heads, Lq, L, P] -> [bs*heads, 1, Lq, L*P]
    weights = attention_weights.permute(0, 2, 1, 3, 4)
    weights = weights.reshape(batch * heads, 1, num_queries, num_levels * num_points)

    stacked = torch.stack(sampled, dim=-2).flatten(-2)
    output = (stacked * weights).sum(-1).reshape(batch, heads * head_dim, num_queries)

    return output.permute(0, 2, 1)
def get_activation(act: str, inpace: bool=True):
    """Return an activation module for the given spec.

    Args:
        act: case-insensitive name (``'silu'``, ``'relu'``, ``'leaky_relu'``,
            ``'gelu'``), ``None`` for ``nn.Identity``, or an ``nn.Module``
            instance, which is returned as is.
        inpace: value assigned to the module's ``inplace`` attribute when it
            has one. The parameter name keeps its historical spelling so
            keyword callers continue to work.

    Returns:
        The activation module.

    Raises:
        RuntimeError: if ``act`` is not a supported name, ``None`` or a module.
    """
    factories = {
        'silu': nn.SiLU,
        'relu': nn.ReLU,
        'leaky_relu': nn.LeakyReLU,
        'gelu': nn.GELU,
    }

    # ``None`` and module instances must be handled before any string
    # method is called on ``act``.
    if act is None:
        m = nn.Identity()
    elif isinstance(act, nn.Module):
        m = act
    elif isinstance(act, str) and act.lower() in factories:
        m = factories[act.lower()]()
    else:
        raise RuntimeError(
            f'Unsupported activation {act!r}; expected one of '
            f'{sorted(factories)}, None or an nn.Module instance.')

    if hasattr(m, 'inplace'):
        m.inplace = inpace

    return m
+ + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + else: + raise AttributeError('only support resume to load model.state_dict by now.') + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + + class Model(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + print(self.postprocessor.deploy_mode) + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + return self.postprocessor(outputs, orig_target_sizes) + + + model = Model() + + dynamic_axes = { + 'images': {0: 'N', }, + 'orig_target_sizes': {0: 'N'} + } + + data = torch.rand(1, 3, 640, 640) + size = torch.tensor([[640, 640]]) + + torch.onnx.export( + model, + (data, size), + args.file_name, + input_names=['images', 'orig_target_sizes'], + output_names=['labels', 'boxes', 'scores'], + dynamic_axes=dynamic_axes, + opset_version=16, + verbose=False + ) + + + if args.check: + import onnx + onnx_model = onnx.load(args.file_name) + onnx.checker.check_model(onnx_model) + print('Check export onnx model done...') + + + if args.simplify: + import onnxsim + dynamic = True + input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None + onnx_model_simplify, check = onnxsim.simplify(args.file_name, input_shapes=input_shapes, dynamic_input_shape=dynamic) + onnx.save(onnx_model_simplify, args.file_name) + print(f'Simplify onnx model {check}...') + + + # import onnxruntime as ort + # from PIL import Image, ImageDraw, ImageFont + # from torchvision.transforms import ToTensor + # from src.data.coco.coco_dataset import mscoco_category2name, mscoco_category2label, mscoco_label2category + + # # print(onnx.helper.printable_graph(mm.graph)) + + # # Load the original image without resizing + # original_im = 
Image.open('./hongkong.jpg').convert('RGB') + # original_size = original_im.size + + # # Resize the image for model input + # im = original_im.resize((640, 640)) + # im_data = ToTensor()(im)[None] + # print(im_data.shape) + + # sess = ort.InferenceSession(args.file_name) + # output = sess.run( + # # output_names=['labels', 'boxes', 'scores'], + # output_names=None, + # input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()} + # ) + + # # print(type(output)) + # # print([out.shape for out in output]) + + # labels, boxes, scores = output + + # draw = ImageDraw.Draw(original_im) # Draw on the original image + # thrh = 0.6 + + # for i in range(im_data.shape[0]): + + # scr = scores[i] + # lab = labels[i][scr > thrh] + # box = boxes[i][scr > thrh] + + # print(i, sum(scr > thrh)) + + # for b, l in zip(box, lab): + # # Scale the bounding boxes back to the original image size + # b = [coord * original_size[j % 2] / 640 for j, coord in enumerate(b)] + # # Get the category name from the label + # category_name = mscoco_category2name[mscoco_label2category[l]] + # draw.rectangle(list(b), outline='red', width=2) + # font = ImageFont.truetype("Arial.ttf", 15) + # draw.text((b[0], b[1]), text=category_name, fill='yellow', font=font) + + # # Save the original image with bounding boxes + # original_im.save('test.jpg') + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', type=str, ) + parser.add_argument('--resume', '-r', type=str, ) + parser.add_argument('--file-name', '-f', type=str, default='model.onnx') + parser.add_argument('--check', action='store_true', default=False,) + parser.add_argument('--simplify', action='store_true', default=False,) + + args = parser.parse_args() + + main(args) diff --git a/rtdetr_pytorch/tools/infer.py b/rtdetr_pytorch/tools/infer.py new file mode 100644 index 0000000..385ce80 --- /dev/null +++ b/rtdetr_pytorch/tools/infer.py @@ -0,0 +1,203 @@ +import torch +import 
torch.nn as nn +import torchvision.transforms as T +from torch.cuda.amp import autocast +import numpy as np +from PIL import Image, ImageDraw, ImageFont +import os +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) +import argparse +import src.misc.dist as dist +from src.core import YAMLConfig +from src.solver import TASKS +import numpy as np + +def postprocess(labels, boxes, scores, iou_threshold=0.55): + def calculate_iou(box1, box2): + x1, y1, x2, y2 = box1 + x3, y3, x4, y4 = box2 + xi1 = max(x1, x3) + yi1 = max(y1, y3) + xi2 = min(x2, x4) + yi2 = min(y2, y4) + inter_width = max(0, xi2 - xi1) + inter_height = max(0, yi2 - yi1) + inter_area = inter_width * inter_height + box1_area = (x2 - x1) * (y2 - y1) + box2_area = (x4 - x3) * (y4 - y3) + union_area = box1_area + box2_area - inter_area + iou = inter_area / union_area if union_area != 0 else 0 + return iou + merged_labels = [] + merged_boxes = [] + merged_scores = [] + used_indices = set() + for i in range(len(boxes)): + if i in used_indices: + continue + current_box = boxes[i] + current_label = labels[i] + current_score = scores[i] + boxes_to_merge = [current_box] + scores_to_merge = [current_score] + used_indices.add(i) + for j in range(i + 1, len(boxes)): + if j in used_indices: + continue + if labels[j] != current_label: + continue + other_box = boxes[j] + iou = calculate_iou(current_box, other_box) + if iou >= iou_threshold: + boxes_to_merge.append(other_box.tolist()) + scores_to_merge.append(scores[j]) + used_indices.add(j) + xs = np.concatenate([[box[0], box[2]] for box in boxes_to_merge]) + ys = np.concatenate([[box[1], box[3]] for box in boxes_to_merge]) + merged_box = [np.min(xs), np.min(ys), np.max(xs), np.max(ys)] + merged_score = max(scores_to_merge) + merged_boxes.append(merged_box) + merged_labels.append(current_label) + merged_scores.append(merged_score) + return [np.array(merged_labels)], [np.array(merged_boxes)], [np.array(merged_scores)] +def 
slice_image(image, slice_height, slice_width, overlap_ratio): + img_width, img_height = image.size + + slices = [] + coordinates = [] + step_x = int(slice_width * (1 - overlap_ratio)) + step_y = int(slice_height * (1 - overlap_ratio)) + + for y in range(0, img_height, step_y): + for x in range(0, img_width, step_x): + box = (x, y, min(x + slice_width, img_width), min(y + slice_height, img_height)) + slice_img = image.crop(box) + slices.append(slice_img) + coordinates.append((x, y)) + return slices, coordinates +def merge_predictions(predictions, slice_coordinates, orig_image_size, slice_width, slice_height, threshold=0.30): + merged_labels = [] + merged_boxes = [] + merged_scores = [] + orig_height, orig_width = orig_image_size + for i, (label, boxes, scores) in enumerate(predictions): + x_shift, y_shift = slice_coordinates[i] + scores = np.array(scores).reshape(-1) + valid_indices = scores > threshold + valid_labels = np.array(label).reshape(-1)[valid_indices] + valid_boxes = np.array(boxes).reshape(-1, 4)[valid_indices] + valid_scores = scores[valid_indices] + for j, box in enumerate(valid_boxes): + box[0] = np.clip(box[0] + x_shift, 0, orig_width) + box[1] = np.clip(box[1] + y_shift, 0, orig_height) + box[2] = np.clip(box[2] + x_shift, 0, orig_width) + box[3] = np.clip(box[3] + y_shift, 0, orig_height) + valid_boxes[j] = box + merged_labels.extend(valid_labels) + merged_boxes.extend(valid_boxes) + merged_scores.extend(valid_scores) + return np.array(merged_labels), np.array(merged_boxes), np.array(merged_scores) +def draw(images, labels, boxes, scores, thrh = 0.6, path = ""): + for i, im in enumerate(images): + draw = ImageDraw.Draw(im) + scr = scores[i] + lab = labels[i][scr > thrh] + box = boxes[i][scr > thrh] + scrs = scores[i][scr > thrh] + for j,b in enumerate(box): + draw.rectangle(list(b), outline='red',) + draw.text((b[0], b[1]), text=f"label: {lab[j].item()} {round(scrs[j].item(),2)}", font=ImageFont.load_default(), fill='blue') + if path == "": + 
im.save(f'results_{i}.jpg') + else: + im.save(path) + +def main(args, ): + """main + """ + cfg = YAMLConfig(args.config, resume=args.resume) + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + else: + raise AttributeError('Only support resume to load model.state_dict by now.') + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + class Model(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + model = Model().to(args.device) + im_pil = Image.open(args.im_file).convert('RGB') + w, h = im_pil.size + orig_size = torch.tensor([w, h])[None].to(args.device) + + transforms = T.Compose([ + T.Resize((640, 640)), + T.ToTensor(), + ]) + im_data = transforms(im_pil)[None].to(args.device) + if args.sliced: + num_boxes = args.numberofboxes + + aspect_ratio = w / h + num_cols = int(np.sqrt(num_boxes * aspect_ratio)) + num_rows = int(num_boxes / num_cols) + slice_height = h // num_rows + slice_width = w // num_cols + overlap_ratio = 0.2 + slices, coordinates = slice_image(im_pil, slice_height, slice_width, overlap_ratio) + predictions = [] + for i, slice_img in enumerate(slices): + slice_tensor = transforms(slice_img)[None].to(args.device) + with autocast(): # Use AMP for each slice + output = model(slice_tensor, torch.tensor([[slice_img.size[0], slice_img.size[1]]]).to(args.device)) + torch.cuda.empty_cache() + labels, boxes, scores = output + + labels = labels.cpu().detach().numpy() + boxes = boxes.cpu().detach().numpy() + scores = scores.cpu().detach().numpy() + predictions.append((labels, boxes, scores)) + + merged_labels, merged_boxes, merged_scores = 
merge_predictions(predictions, coordinates, (h, w), slice_width, slice_height) + labels, boxes, scores = postprocess(merged_labels, merged_boxes, merged_scores) + else: + output = model(im_data, orig_size) + labels, boxes, scores = output + + draw([im_pil], labels, boxes, scores, 0.6) + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, ) + parser.add_argument('-r', '--resume', type=str, ) + parser.add_argument('-f', '--im-file', type=str, ) + parser.add_argument('-s', '--sliced', type=bool, default=False) + parser.add_argument('-d', '--device', type=str, default='cpu') + parser.add_argument('-nc', '--numberofboxes', type=int, default=25) + args = parser.parse_args() + main(args) + + + + + + + + + + + diff --git a/rtdetr_pytorch/tools/train.py b/rtdetr_pytorch/tools/train.py new file mode 100644 index 0000000..31b31ef --- /dev/null +++ b/rtdetr_pytorch/tools/train.py @@ -0,0 +1,50 @@ +"""by lyuwenyu +""" + +import os +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) +import argparse + +import src.misc.dist as dist +from src.core import YAMLConfig +from src.solver import TASKS + + +def main(args, ) -> None: + '''main + ''' + dist.init_distributed() + if args.seed is not None: + dist.set_seed(args.seed) + + assert not all([args.tuning, args.resume]), \ + 'Only support from_scrach or resume or tuning at one time' + + cfg = YAMLConfig( + args.config, + resume=args.resume, + use_amp=args.amp, + tuning=args.tuning + ) + + solver = TASKS[cfg.yaml_cfg['task']](cfg) + + if args.test_only: + solver.val() + else: + solver.fit() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--config', '-c', type=str, ) + parser.add_argument('--resume', '-r', type=str, ) + parser.add_argument('--tuning', '-t', type=str, ) + parser.add_argument('--test-only', action='store_true', default=False,) + 
parser.add_argument('--amp', action='store_true', default=False,) + parser.add_argument('--seed', type=int, help='seed',) + args = parser.parse_args() + + main(args) diff --git a/rtdetrv2_paddle/readme.md b/rtdetrv2_paddle/readme.md new file mode 100644 index 0000000..6d4f01b --- /dev/null +++ b/rtdetrv2_paddle/readme.md @@ -0,0 +1 @@ +see https://github.com/PaddlePaddle/PaddleDetection \ No newline at end of file diff --git a/rtdetrv2_pytorch/Dockerfile b/rtdetrv2_pytorch/Dockerfile new file mode 100644 index 0000000..4682732 --- /dev/null +++ b/rtdetrv2_pytorch/Dockerfile @@ -0,0 +1,10 @@ +FROM nvcr.io/nvidia/pytorch:25.06-py3 + +WORKDIR /workspace + +COPY requirements.txt . + +RUN pip install --upgrade pip && \ + pip install -r requirements.txt + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/rtdetrv2_pytorch/README.md b/rtdetrv2_pytorch/README.md new file mode 100644 index 0000000..69b6f13 --- /dev/null +++ b/rtdetrv2_pytorch/README.md @@ -0,0 +1,168 @@ + +## Quick start + +
+Setup + +```shell + +pip install -r requirements.txt +``` + +The following are the corresponding `torch` and `torchvision` versions. +`rtdetr` | `torch` | `torchvision` +|---|---|---| +| `-` | `2.4` | `0.19` | +| `-` | `2.2` | `0.17` | +| `-` | `2.1` | `0.16` | +| `-` | `2.0` | `0.15` | + +
+ +
+Fig + +
+image +
+ +
+ + +## Model Zoo + +### Base models + +| Model | Dataset | Input Size | APval | AP50val | #Params(M) | FPS | config| checkpoint | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |:---: | +**RT-DETRv2-S** | COCO | 640 | **48.1** (+1.6) | **65.1** | 20 | 217 | [config](./configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth) | +**RT-DETRv2-M*** | COCO | 640 | **49.9** (+1.0) | **67.5** | 31 | 161 | [config](./configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth) +**RT-DETRv2-M** | COCO | 640 | **51.9** (+0.6) | **69.9** | 36 | 145 | [config](./configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth) +**RT-DETRv2-L** | COCO | 640 | **53.4** (+0.3) | **71.6** | 42 | 108 | [config](./configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth) +**RT-DETRv2-X** | COCO | 640 | 54.3 | **72.8** (+0.1) | 76 | 74 | [config](./configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth) + + +**Notes:** +- `AP` is evaluated on *MSCOCO val2017* dataset. +- `FPS` is evaluated on a single T4 GPU with $batch\\_size = 1$, $fp16$, and $TensorRT>=8.5.1$. +- `COCO + Objects365` in the table means finetuned model on `COCO` using pretrained weights trained on `Objects365`. 
+ + + +### Models of discrete sampling + +| Model | Sampling Method | APval | AP50val | config| checkpoint +| :---: | :---: | :---: | :---: | :---: | :---: | +**RT-DETRv2-S_dsp** | discrete_sampling | 47.4 | 64.8 (-0.1) | [config](./configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_dsp_3x_coco.pth) +**RT-DETRv2-M*****_dsp** | discrete_sampling | 49.2 | 67.1 (-0.4) | [config](./configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rrtdetrv2_r34vd_dsp_1x_coco.pth) +**RT-DETRv2-M_dsp** | discrete_sampling | 51.4 | 69.7 (-0.2) | [config](./configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_dsp_3x_coco.pth) +**RT-DETRv2-L_dsp** | discrete_sampling | 52.9 | 71.3 (-0.3) |[config](./configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml)| [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_dsp_1x_coco.pth) + + + + + + + +**Notes:** +- The impact on inference speed is related to specific device and software. +- `*_dsp*` is the model inherit `*_sp*` model's knowledge and adapt to `discrete_sampling` strategy. 
**You can use TensorRT 8.4 (or even older versions) to run inference with these models** + + + +### Ablation on sampling points + + + +| Model | Sampling Method | #Points | APval | AP50val | checkpoint +| :---: | :---: | :---: | :---: | :---: | :---: | +**rtdetrv2_r18vd_sp1** | grid_sampling | 21,600 | 47.3 | 64.3 (-0.6) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_sp1_120e_coco.pth) +**rtdetrv2_r18vd_sp2** | grid_sampling | 43,200 | 47.7 | 64.7 (-0.2) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_sp2_120e_coco.pth) +**rtdetrv2_r18vd_sp3** | grid_sampling | 64,800 | 47.8 | 64.8 (-0.1) | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_sp3_120e_coco.pth) +rtdetrv2_r18vd(_sp4)| grid_sampling | 86,400 | 47.9 | 64.9 | [url](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_120e_coco.pth) + +**Notes:** +- The impact on inference speed depends on the specific device and software. +- `#points` is the total number of sampling points in the decoder for per-image inference. + + +## Usage +
+ details + + +1. Training +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config --use-amp --seed=0 &> log.txt 2>&1 & +``` + + +2. Testing +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -r path/to/checkpoint --test-only +``` + + +3. Tuning +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -t path/to/checkpoint --use-amp --seed=0 &> log.txt 2>&1 & +``` + + +4. Export onnx +```shell +python tools/export_onnx.py -c path/to/config -r path/to/checkpoint --check +``` + + +5. Export tensorrt +```shell +python tools/export_trt.py -i path/to/onnxfile +``` + + +6. Inference + +Supports torch, onnxruntime, tensorrt and openvino; see details in *references/deploy* +```shell +python references/deploy/rtdetrv2_onnxruntime.py --onnx-file=model.onnx --im-file=xxxx +python references/deploy/rtdetrv2_tensorrt.py --trt-file=model.trt --im-file=xxxx +python references/deploy/rtdetrv2_torch.py -c path/to/config -r path/to/checkpoint --im-file=xxx --device=cuda:0 +``` +
+ + + +## Citation +If you use `RTDETR` or `RTDETRv2` in your work, please use the following BibTeX entries: + +
+ bibtex + +```latex +@misc{lv2023detrs, + title={DETRs Beat YOLOs on Real-time Object Detection}, + author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu}, + year={2023}, + eprint={2304.08069}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@misc{lv2024rtdetrv2improvedbaselinebagoffreebies, + title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer}, + author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu}, + year={2024}, + eprint={2407.17140}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2407.17140}, +} +``` +
diff --git a/rtdetrv2_pytorch/configs/dataset/coco_detection.yml b/rtdetrv2_pytorch/configs/dataset/coco_detection.yml new file mode 100644 index 0000000..270b319 --- /dev/null +++ b/rtdetrv2_pytorch/configs/dataset/coco_detection.yml @@ -0,0 +1,48 @@ +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ['bbox', ] + +# num_classes: 365 +# remap_mscoco_category: False + +# num_classes: 91 +# remap_mscoco_category: False + +num_classes: 80 +remap_mscoco_category: True + + +train_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: ./dataset/coco/train2017/ + ann_file: ./dataset/coco/annotations/instances_train2017.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 4 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + + +val_dataloader: + type: DataLoader + dataset: + type: CocoDetection + img_folder: ./dataset/coco/val2017/ + ann_file: ./dataset/coco/annotations/instances_val2017.json + return_masks: False + transforms: + type: Compose + ops: ~ + shuffle: False + num_workers: 4 + drop_last: False + collate_fn: + type: BatchImageCollateFunction diff --git a/rtdetrv2_pytorch/configs/dataset/voc_detection.yml b/rtdetrv2_pytorch/configs/dataset/voc_detection.yml new file mode 100644 index 0000000..7f6f155 --- /dev/null +++ b/rtdetrv2_pytorch/configs/dataset/voc_detection.yml @@ -0,0 +1,40 @@ +task: detection + +evaluator: + type: CocoEvaluator + iou_types: ['bbox', ] + +num_classes: 20 + +train_dataloader: + type: DataLoader + dataset: + type: VOCDetection + root: ./dataset/voc/ + ann_file: trainval.txt + label_file: label_list.txt + transforms: + type: Compose + ops: ~ + shuffle: True + num_workers: 4 + drop_last: True + collate_fn: + type: BatchImageCollateFunction + + +val_dataloader: + type: DataLoader + dataset: + type: VOCDetection + root: ./dataset/voc/ + ann_file: test.txt + label_file: label_list.txt + transforms: + type: Compose + ops: ~ + shuffle: False + 
num_workers: 4 + drop_last: False + collate_fn: + type: BatchImageCollateFunction diff --git a/rtdetrv2_pytorch/configs/rtdetr/include/dataloader.yml b/rtdetrv2_pytorch/configs/rtdetr/include/dataloader.yml new file mode 100644 index 0000000..64d6dc7 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/include/dataloader.yml @@ -0,0 +1,31 @@ + +train_dataloader: + dataset: + return_masks: False + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + collate_fn: + type: BatchImageCollateFunction + scales: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] + shuffle: True + num_workers: 4 + total_batch_size: 16 + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640]} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 16 + num_workers: 8 \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetr/include/optimizer.yml b/rtdetrv2_pytorch/configs/rtdetr/include/optimizer.yml new file mode 100644 index 0000000..29abdd8 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/include/optimizer.yml @@ -0,0 +1,40 @@ + +use_ema: True +ema: + type: ModelEMA + decay: 0.9999 + warmups: 2000 + + +epoches: 72 +clip_max_norm: 0.1 + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*(?:norm|bn)).*$' + lr: 0.00001 + - + params: '^(?=.*backbone)(?=.*(?:norm|bn)).*$' + weight_decay: 0. + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +lr_scheduler: + type: MultiStepLR + milestones: [1000] + gamma: 0.1 + + +lr_warmup_scheduler: + type: LinearWarmup + warmup_duration: 2000 \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml b/rtdetrv2_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml new file mode 100644 index 0000000..f21615e --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml @@ -0,0 +1,79 @@ +task: detection + +model: RTDETR +criterion: RTDETRCriterion +postprocessor: RTDETRPostProcessor + + +use_focal_loss: True +eval_spatial_size: [640, 640] # h w + + +RTDETR: + backbone: PResNet + encoder: HybridEncoder + decoder: RTDETRTransformer + + +PResNet: + depth: 50 + variant: d + freeze_at: 0 + return_idx: [1, 2, 3] + num_stages: 4 + freeze_norm: True + pretrained: True + + +HybridEncoder: + in_channels: [512, 1024, 2048] + feat_strides: [8, 16, 32] + + # intra + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ enc_act: 'gelu' + + # cross + expansion: 1.0 + depth_mult: 1 + act: 'silu' + + version: v1 + +RTDETRTransformer: + feat_channels: [256, 256, 256] + feat_strides: [8, 16, 32] + hidden_dim: 256 + num_levels: 3 + + num_layers: 6 + num_queries: 300 + + num_denoising: 100 + label_noise_ratio: 0.5 + box_noise_scale: 1.0 # 1.0 0.4 + + eval_idx: -1 + + +RTDETRPostProcessor: + num_top_queries: 300 + + +RTDETRCriterion: + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} + losses: ['vfl', 'boxes', ] + alpha: 0.75 + gamma: 2.0 + + matcher: + type: HungarianMatcher + weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} + alpha: 0.25 + gamma: 2.0 + diff --git a/rtdetrv2_pytorch/configs/rtdetr/readme.md b/rtdetrv2_pytorch/configs/rtdetr/readme.md new file mode 100644 index 0000000..46ccd52 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/readme.md @@ -0,0 +1,111 @@ +# DETRs Beat YOLOs on Real-time Object Detection + +## Introduction +This repository is the official pytorch implementation of [*RTDETR*](https://arxiv.org/abs/2304.08069v1), and is compatible with [RT-DETR/rtdetr_pytorch](https://github.com/lyuwenyu/RT-DETR/tree/main). For the paddle implementation, please refer to [RT-DETR/rtdetr_paddle](https://github.com/lyuwenyu/RT-DETR/tree/main). **If you are using rtdetr for the first time, it is highly recommended to use [rtdetrv2](../rtdetrv2/)**. + +
+ Fig +
+ +
+
+ + + + +## Model Zoo +| Model | Dataset | Input Size | APval | AP50val | #Params(M) | FPS | checkpoint | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | +rtdetr_r18vd | COCO | 640 | 46.4 | 63.7 | 20 | 217 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_dec3_6x_coco_from_paddle.pth) +rtdetr_r34vd | COCO | 640 | 48.9 | 66.8 | 31 | 161 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r34vd_dec4_6x_coco_from_paddle.pth) +rtdetr_r50vd_m | COCO | 640 | 51.3 | 69.5 | 36 | 145 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_m_6x_coco_from_paddle.pth) +rtdetr_r50vd | COCO | 640 | 53.1 | 71.2| 42 | 108 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_6x_coco_from_paddle.pth) +rtdetr_r101vd | COCO | 640 | 54.3 | 72.8 | 76 | 74 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_6x_coco_from_paddle.pth) +rtdetr_r18vd | COCO+Objects365 | 640 | 49.0 | 66.5 | 20 | 217 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth) +rtdetr_r50vd | COCO+Objects365 | 640 | 55.2 | 73.4 | 42 | 108 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_2x_coco_objects365_from_paddle.pth) +rtdetr_r101vd | COCO+Objects365 | 640 | 56.2 | 74.5 | 76 | 74 | [url*](https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_2x_coco_objects365_from_paddle.pth) + + + + + +Notes + + +- `COCO + Objects365` in the table means the model was finetuned on `COCO` using pretrained weights trained on `Objects365`. +- `FPS` is evaluated on a single T4 GPU with $batch\\_size = 1$ and $tensorrt\\_fp16$ mode. +- `url*` is the URL of the pretrained weights, converted from the paddle model to save energy. There may be slight differences between this table and the paper. + + +## Usage +
+ details + + +1. Training +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config &> log.txt 2>&1 & +``` + + +2. Testing +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -r path/to/checkpoint --test-only +``` + + +3. Tuning +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --master_port=9909 --nproc_per_node=4 tools/train.py -c path/to/config -t path/to/checkpoint &> log.txt 2>&1 & +``` + + +4. Export onnx +```shell +python tools/export_onnx.py -c path/to/config -r path/to/checkpoint --check +``` + + +5. Inference + +Supports torch, onnxruntime, tensorrt and openvino; see details in *references/deploy* +```shell +python references/deploy/rtdetrv2_onnxruntime.py --onnx-file=model.onnx --im-file=xxxx +python references/deploy/rtdetrv2_tensorrt.py --trt-file=model.trt --im-file=xxxx +python references/deploy/rtdetrv2_torch.py -c path/to/config -r path/to/checkpoint --im-file=xxx --device=cuda:0 +``` +
+ + +## Citation +If you use `RTDETR` in your work, please use the following BibTeX entries: + +
+ bibtex + +```latex +@misc{lv2023detrs, + title={DETRs Beat YOLOs on Real-time Object Detection}, + author={Wenyu Lv and Shangliang Xu and Yian Zhao and Guanzhong Wang and Jinman Wei and Cheng Cui and Yuning Du and Qingqing Dang and Yi Liu}, + year={2023}, + eprint={2304.08069}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +@software{Lv_rtdetr_by_cvperception_2023, +author = {Lv, Wenyu}, +license = {Apache-2.0}, +month = oct, +title = {{rtdetr by cvperception}}, +url = {https://github.com/lyuwenyu/cvperception/}, +version = {0.0.1dev}, +year = {2023} +} +``` +
diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml new file mode 100644 index 0000000..82dc545 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r101vd_6x_coco.yml @@ -0,0 +1,41 @@ + +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + + +output_dir: ./output/rtdetr_r101vd_6x_coco + + +PResNet: + depth: 101 + + +HybridEncoder: + # intra + hidden_dim: 384 + dim_feedforward: 2048 + + +RTDETRTransformer: + feat_channels: [384, 384, 384] + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml new file mode 100644 index 0000000..5e4f95a --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r18vd_6x_coco.yml @@ -0,0 +1,48 @@ + +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + + +output_dir: ./output/rtdetr_r18vd_6x_coco + + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformer: + num_layers: 3 + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + weight_decay: 0. + lr: 0.00001 + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml new file mode 100644 index 0000000..f857644 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r34vd_6x_coco.yml @@ -0,0 +1,48 @@ + +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + + +output_dir: ./output/rtdetr_r34vd_6x_coco + + +PResNet: + depth: 34 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformer: + num_layers: 4 + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + weight_decay: 0. + lr: 0.00001 + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml new file mode 100644 index 0000000..bc39f4a --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_6x_coco.yml @@ -0,0 +1,14 @@ + +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + + +output_dir: ./output/rtdetr_r50vd_6x_coco + + + diff --git a/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml new file mode 100644 index 0000000..25d5ad8 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetr/rtdetr_r50vd_m_6x_coco.yml @@ -0,0 +1,34 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetr_r50vd.yml', +] + +output_dir: ./output/rtdetr_r50vd_m_6x_coco + + +HybridEncoder: + expansion: 0.5 + + +RTDETRTransformer: + eval_idx: 2 # use the 3rd decoder layer for eval + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/include/dataloader.yml b/rtdetrv2_pytorch/configs/rtdetrv2/include/dataloader.yml new file mode 100644 index 0000000..d55a411 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/include/dataloader.yml @@ -0,0 +1,38 @@ + +train_dataloader: + dataset: + transforms: + ops: + - {type: RandomPhotometricDistort, p: 0.5} + - {type: RandomZoomOut, fill: 0} + - {type: RandomIoUCrop, p: 0.8} + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: RandomHorizontalFlip} + - {type: Resize, size: [640, 640], } + - {type: SanitizeBoundingBoxes, min_size: 1} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + - {type: ConvertBoxes, fmt: 'cxcywh', normalize: True} + policy: + name: stop_epoch + epoch: 71 # epoch in [71, ~) stop `ops` + ops: ['RandomPhotometricDistort', 'RandomZoomOut', 'RandomIoUCrop'] + + collate_fn: + type: BatchImageCollateFunction + scales: [480, 512, 544, 576, 608, 640, 640, 640, 672, 704, 736, 768, 800] + stop_epoch: 71 # epoch in [71, ~) stop `multiscales` + + shuffle: True + total_batch_size: 16 # total batch size equals to 16 (4 * 4) + num_workers: 4 + + +val_dataloader: + dataset: + transforms: + ops: + - {type: Resize, size: [640, 640]} + - {type: ConvertPILImage, dtype: 'float32', scale: True} + shuffle: False + total_batch_size: 32 + num_workers: 4 \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/include/optimizer.yml b/rtdetrv2_pytorch/configs/rtdetrv2/include/optimizer.yml new file mode 100644 index 0000000..189a9a1 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/include/optimizer.yml @@ -0,0 +1,37 @@ + +use_amp: True +use_ema: True +ema: + type: ModelEMA + decay: 0.9999 + warmups: 2000 + + +epoches: 72 +clip_max_norm: 0.1 + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 
0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +lr_scheduler: + type: MultiStepLR + milestones: [1000] + gamma: 0.1 + + +lr_warmup_scheduler: + type: LinearWarmup + warmup_duration: 2000 \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/include/rtdetrv2_r50vd.yml b/rtdetrv2_pytorch/configs/rtdetrv2/include/rtdetrv2_r50vd.yml new file mode 100644 index 0000000..a5c1490 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/include/rtdetrv2_r50vd.yml @@ -0,0 +1,83 @@ +task: detection + +model: RTDETR +criterion: RTDETRCriterionv2 +postprocessor: RTDETRPostProcessor + + +use_focal_loss: True +eval_spatial_size: [640, 640] # h w + + +RTDETR: + backbone: PResNet + encoder: HybridEncoder + decoder: RTDETRTransformerv2 + + +PResNet: + depth: 50 + variant: d + freeze_at: 0 + return_idx: [1, 2, 3] + num_stages: 4 + freeze_norm: True + pretrained: True + + +HybridEncoder: + in_channels: [512, 1024, 2048] + feat_strides: [8, 16, 32] + + # intra + hidden_dim: 256 + use_encoder_idx: [2] + num_encoder_layers: 1 + nhead: 8 + dim_feedforward: 1024 + dropout: 0. 
+ enc_act: 'gelu' + + # cross + expansion: 1.0 + depth_mult: 1 + act: 'silu' + + +RTDETRTransformerv2: + feat_channels: [256, 256, 256] + feat_strides: [8, 16, 32] + hidden_dim: 256 + num_levels: 3 + + num_layers: 6 + num_queries: 300 + + num_denoising: 100 + label_noise_ratio: 0.5 + box_noise_scale: 1.0 # 1.0 0.4 + + eval_idx: -1 + + # NEW + num_points: [4, 4, 4] # [3,3,3] [2,2,2] + cross_attn_method: default # default, discrete + query_select_method: default # default, agnostic + + +RTDETRPostProcessor: + num_top_queries: 300 + + +RTDETRCriterionv2: + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} + losses: ['vfl', 'boxes', ] + alpha: 0.75 + gamma: 2.0 + + matcher: + type: HungarianMatcher + weight_dict: {cost_class: 2, cost_bbox: 5, cost_giou: 2} + alpha: 0.25 + gamma: 2.0 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_h_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_h_6x_coco.yml new file mode 100644 index 0000000..7bb3546 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_h_6x_coco.yml @@ -0,0 +1,50 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_hgnetv2_h_6x_coco + + +RTDETR: + backbone: HGNetv2 + + +HGNetv2: + name: 'H' + return_idx: [1, 2, 3] + freeze_at: 0 + freeze_norm: True + pretrained: True + + +HybridEncoder: + # intra + hidden_dim: 512 + dim_feedforward: 2048 + num_encoder_layers: 2 + + +RTDETRTransformerv2: + feat_channels: [512, 512, 512] + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000005 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_l_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_l_6x_coco.yml new file mode 100644 index 0000000..5602496 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_l_6x_coco.yml @@ -0,0 +1,38 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_hgnetv2_l_6x_coco + + +RTDETR: + backbone: HGNetv2 + + +HGNetv2: + name: 'L' + return_idx: [1, 2, 3] + freeze_at: 0 + freeze_norm: True + pretrained: True + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000005 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_x_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_x_6x_coco.yml new file mode 100644 index 0000000..b85d8a5 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_hgnetv2_x_6x_coco.yml @@ -0,0 +1,50 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_hgnetv2_x_6x_coco + + +RTDETR: + backbone: HGNetv2 + + +HGNetv2: + name: 'X' + return_idx: [1, 2, 3] + freeze_at: 0 + freeze_norm: True + pretrained: True + + + +HybridEncoder: + # intra + hidden_dim: 384 + dim_feedforward: 2048 + + +RTDETRTransformerv2: + feat_channels: [384, 384, 384] + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml new file mode 100644 index 0000000..f0171de --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r101vd_6x_coco.yml @@ -0,0 +1,40 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r101vd_6x_coco + + +PResNet: + depth: 101 + + +HybridEncoder: + # intra + hidden_dim: 384 + dim_feedforward: 2048 + + +RTDETRTransformerv2: + feat_channels: [384, 384, 384] + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.000001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml new file mode 100644 index 0000000..0a4557b --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_coco.yml @@ -0,0 +1,46 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r18vd_120e_coco + + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 3 + + +epoches: 120 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
+ + +train_dataloader: + dataset: + transforms: + policy: + epoch: 117 + collate_fn: + scales: ~ \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_voc.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_voc.yml new file mode 100644 index 0000000..28b9873 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_120e_voc.yml @@ -0,0 +1,46 @@ +__include__: [ + '../dataset/voc_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r18vd_120e_voc + + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 3 + + +epoches: 120 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. + +train_dataloader: + dataset: + transforms: + policy: + epoch: 117 + collate_fn: + scales: ~ + total_batch_size: 32 diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml new file mode 100644 index 0000000..a3a3a58 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_dsp_3x_coco.yml @@ -0,0 +1,49 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r18vd_120e_coco.pth + +output_dir: ./output/rtdetrv2_r18vd_dsp_3x_coco + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 3 + num_points: [4, 4, 4] + cross_attn_method: discrete + + +epoches: 36 + +optimizer: + type: AdamW + params: + - + 
params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + +train_dataloader: + dataset: + transforms: + policy: + epoch: 33 + collate_fn: + scales: ~ \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp1_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp1_120e_coco.yml new file mode 100644 index 0000000..ed029c1 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp1_120e_coco.yml @@ -0,0 +1,47 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r18vd_sp1_120e_coco + + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 3 + num_points: [1, 1, 1] + + +epoches: 120 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
+ + +train_dataloader: + dataset: + transforms: + policy: + epoch: 117 + collate_fn: + scales: ~ \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp2_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp2_120e_coco.yml new file mode 100644 index 0000000..c75d0d7 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp2_120e_coco.yml @@ -0,0 +1,47 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r18vd_sp2_120e_coco + + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 3 + num_points: [2, 2, 2] + + +epoches: 120 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + +train_dataloader: + dataset: + transforms: + policy: + epoch: 117 + collate_fn: + scales: ~ \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp3_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp3_120e_coco.yml new file mode 100644 index 0000000..2a00b1c --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r18vd_sp3_120e_coco.yml @@ -0,0 +1,47 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r18vd_sp3_120e_coco + + +PResNet: + depth: 18 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 3 + num_points: [3, 3, 3] + + +epoches: 120 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
+ + +train_dataloader: + dataset: + transforms: + policy: + epoch: 117 + collate_fn: + scales: ~ \ No newline at end of file diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml new file mode 100644 index 0000000..348c0e9 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_120e_coco.yml @@ -0,0 +1,57 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r34vd_120e_coco + + +PResNet: + depth: 34 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 4 + + +epoches: 120 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.00005 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.00005 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +train_dataloader: + dataset: + transforms: + policy: + epoch: 117 + collate_fn: + stop_epoch: 117 diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml new file mode 100644 index 0000000..064d5f3 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r34vd_dsp_1x_coco.yml @@ -0,0 +1,59 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + +tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth + +output_dir: ./output/rtdetrv2_r34vd_dsp_1x_coco + + +PResNet: + depth: 34 + freeze_at: -1 + freeze_norm: False + pretrained: True + + +HybridEncoder: + in_channels: [128, 256, 512] + hidden_dim: 256 + expansion: 0.5 + + +RTDETRTransformerv2: + num_layers: 4 + cross_attn_method: discrete + + +epoches: 12 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm|bn).*$' + lr: 0.00005 + - + params: '^(?=.*backbone)(?=.*norm|bn).*$' + lr: 0.00005 + weight_decay: 0. + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn|bias)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +train_dataloader: + dataset: + transforms: + policy: + epoch: 10 + collate_fn: + stop_epoch: 10 diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml new file mode 100644 index 0000000..63f0bd6 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_6x_coco.yml @@ -0,0 +1,27 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +output_dir: ./output/rtdetrv2_r50vd_6x_coco + + + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml new file mode 100644 index 0000000..1c1cfad --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_dsp_1x_coco.yml @@ -0,0 +1,27 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + + +tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth + +output_dir: ./output/rtdetrv2_r50vd_dsp_1x_coco + + +RTDETRTransformerv2: + cross_attn_method: discrete + + +epoches: 12 + +train_dataloader: + dataset: + transforms: + policy: + epoch: 10 + collate_fn: + stop_epoch: 10 diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml new file mode 100644 index 0000000..43ab113 --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_7x_coco.yml @@ -0,0 +1,43 @@ +__include__: [ + 
'../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + +output_dir: ./output/rtdetrv2_r50vd_m_6x_coco + + +HybridEncoder: + expansion: 0.5 + + +RTDETRTransformerv2: + eval_idx: 2 # use 3rd decoder layer to eval + + +epoches: 84 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. + + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +train_dataloader: + dataset: + transforms: + policy: + epoch: 81 + collate_fn: + stop_epoch: 81 diff --git a/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml new file mode 100644 index 0000000..af617ff --- /dev/null +++ b/rtdetrv2_pytorch/configs/rtdetrv2/rtdetrv2_r50vd_m_dsp_3x_coco.yml @@ -0,0 +1,44 @@ +__include__: [ + '../dataset/coco_detection.yml', + '../runtime.yml', + './include/dataloader.yml', + './include/optimizer.yml', + './include/rtdetrv2_r50vd.yml', +] + +output_dir: ./output/rtdetrv2_r50vd_m_dsp_3x_coco +tuning: https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth + +HybridEncoder: + expansion: 0.5 + + +RTDETRTransformerv2: + eval_idx: 2 # use 3rd decoder layer to eval + cross_attn_method: discrete + + +epoches: 36 + +optimizer: + type: AdamW + params: + - + params: '^(?=.*backbone)(?!.*norm).*$' + lr: 0.00001 + - + params: '^(?=.*(?:encoder|decoder))(?=.*(?:norm|bn)).*$' + weight_decay: 0. 
+ + lr: 0.0001 + betas: [0.9, 0.999] + weight_decay: 0.0001 + + +train_dataloader: + dataset: + transforms: + policy: + epoch: 33 + collate_fn: + stop_epoch: 33 diff --git a/rtdetrv2_pytorch/configs/runtime.yml b/rtdetrv2_pytorch/configs/runtime.yml new file mode 100644 index 0000000..4217b9e --- /dev/null +++ b/rtdetrv2_pytorch/configs/runtime.yml @@ -0,0 +1,21 @@ + +print_freq: 100 +output_dir: './logs' +checkpoint_freq: 1 + + +sync_bn: True +find_unused_parameters: False + + +use_amp: False +scaler: + type: GradScaler + enabled: True + + +use_ema: False +ema: + type: ModelEMA + decay: 0.9999 + warmups: 2000 diff --git a/rtdetrv2_pytorch/docker-compose.yml b/rtdetrv2_pytorch/docker-compose.yml new file mode 100644 index 0000000..7d07984 --- /dev/null +++ b/rtdetrv2_pytorch/docker-compose.yml @@ -0,0 +1,23 @@ +services: + tensorrt-container: + build: + context: . + dockerfile: Dockerfile + image: rtdetr-v2:25.06 + container_name: rtdetr-v2-trt + ports: + - "6006:6006" # tensorboard + volumes: + - ./:/workspace + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + working_dir: /workspace + restart: unless-stopped + stdin_open: true + tty: true + command: bash \ No newline at end of file diff --git a/rtdetrv2_pytorch/references/deploy/readme.md b/rtdetrv2_pytorch/references/deploy/readme.md new file mode 100644 index 0000000..ed3c9a4 --- /dev/null +++ b/rtdetrv2_pytorch/references/deploy/readme.md @@ -0,0 +1,2 @@ +# Deployment + diff --git a/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py b/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py new file mode 100644 index 0000000..0f94dd2 --- /dev/null +++ b/rtdetrv2_pytorch/references/deploy/rtdetrv2_onnxruntime.py @@ -0,0 +1,61 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
import torch
import torchvision.transforms as T

import numpy as np
import onnxruntime as ort
from PIL import Image, ImageDraw


def draw(images, labels, boxes, scores, thrh = 0.6):
    """Draw detections scoring above ``thrh`` on each image and save the result.

    Args:
        images: sequence of PIL images; each one is drawn on in place.
        labels: per-image label arrays, indexed as ``labels[i]``.
        boxes: per-image box arrays, indexed as ``boxes[i]``; the four values
            of a box are handed to ``ImageDraw.rectangle``.
        scores: per-image score arrays, indexed as ``scores[i]``.
        thrh: only detections with ``score > thrh`` are drawn.

    Side effects:
        Saves ``results_{i}.jpg`` (relative path) for every image.
    """
    for i, im in enumerate(images):
        canvas = ImageDraw.Draw(im)

        # Build the threshold mask once and reuse it for labels and boxes.
        keep = scores[i] > thrh
        lab = labels[i][keep]
        box = boxes[i][keep]

        # Pair every box with its own label. The label used to be looked up
        # with the image index ``i``, so all boxes of an image got the same
        # caption (or an IndexError when fewer than ``i + 1`` boxes passed).
        for b, cls_id in zip(box, lab):
            canvas.rectangle(list(b), outline='red',)
            canvas.text((b[0], b[1]), text=str(cls_id.item()), fill='blue', )

        im.save(f'results_{i}.jpg')


def main(args, ):
    """Run one image through an ONNX Runtime session and save the drawn result.

    Args:
        args: namespace providing ``onnx_file`` (model path) and ``im_file``
            (input image path).
    """
    sess = ort.InferenceSession(args.onnx_file)
    print(ort.get_device())

    im_pil = Image.open(args.im_file).convert('RGB')
    w, h = im_pil.size
    # Original (width, height) with a leading batch axis.
    orig_size = torch.tensor([w, h])[None]

    transforms = T.Compose([
        T.Resize((640, 640)),
        T.ToTensor(),
    ])
    im_data = transforms(im_pil)[None]

    output = sess.run(
        # output_names=['labels', 'boxes', 'scores'],
        output_names=None,
        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": orig_size.data.numpy()}
    )

    # With output_names=None the session returns every model output; they are
    # unpacked here in the order labels, boxes, scores.
    labels, boxes, scores = output

    draw([im_pil], labels, boxes, scores)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--onnx-file', type=str, )
    parser.add_argument('--im-file', type=str, )
    # parser.add_argument('-d', '--device', type=str, default='cpu')
    args = parser.parse_args()
    main(args)
# Copyright 2023 lyuwenyu. All Rights Reserved.
# Copyright (c) 2025 Hitbee-dev. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
# NOTICE: This file has been heavily modified by [Hitbee-dev] from the original source.
# Modifications include restructuring for broader GPU architecture compatibility
# (including NVIDIA Blackwell), improved modularity, and enhanced testability.
# ==============================================================================

import time
import numpy as np
import torch
import tensorrt as trt
from collections import OrderedDict
from PIL import Image, ImageDraw, ImageFont

class TRTInference(object):
    """
    A high-level wrapper for TensorRT inference, designed for ease of use and flexibility.
    This class handles engine loading, context creation, and dynamic buffer allocation.
    """
    def __init__(self, engine_path, device='cuda:0', verbose=False):
        """
        Initializes the TRTInference instance.

        Args:
            engine_path (str): Path to the serialized TensorRT engine file.
            device (str): The device to run inference on (e.g., 'cuda:0').
            verbose (bool): If True, enables verbose logging from the TensorRT logger.

        Raises:
            RuntimeError: If the engine file cannot be deserialized.
        """
        self.engine_path = engine_path
        self.device = torch.device(device)
        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)

        trt.init_libnvinfer_plugins(self.logger, '')
        self.runtime = trt.Runtime(self.logger)
        self.engine = self._load_engine(engine_path)
        self.context = self.engine.create_execution_context()

        self.input_names, self.output_names = self._get_io_names()

        self.buffers_allocated = False
        self.gpu_buffers = OrderedDict()
        self.binding_addrs = OrderedDict()
        # Input shapes the current buffers were sized for; None until the
        # first allocation. Used to detect when buffers must be rebuilt.
        self._input_shapes = None

        print(f"[TRTInference] Initialized successfully. Engine: '{engine_path}'.")

    def _load_engine(self, path):
        """Loads a TensorRT engine from a file."""
        with open(path, 'rb') as f:
            engine = self.runtime.deserialize_cuda_engine(f.read())
        if engine is None:
            raise RuntimeError(f"Failed to load TensorRT engine from '{path}'.")
        return engine

    def _get_io_names(self):
        """Parses input and output tensor names from the engine.

        Returns:
            tuple[list, list]: ``(input_names, output_names)``, each in the
            order the engine enumerates its I/O tensors.
        """
        input_names, output_names = [], []
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                input_names.append(name)
            else:
                output_names.append(name)
        return input_names, output_names

    def _allocate_buffers(self, blob: dict):
        """
        Allocates GPU buffers for inputs and outputs sized for the shapes in ``blob``.

        Called on the first inference request and again whenever the input
        shapes differ from the ones the current buffers were built for.

        Args:
            blob (dict): Maps every input tensor name to a torch.Tensor; only
                its shape and dtype are read here.
        """
        if self.buffers_allocated:
            print("[TRTInference] Input shapes changed. Reallocating GPU buffers...")
        else:
            print("[TRTInference] First inference call detected. Allocating GPU buffers...")

        input_shapes = {}
        for name in self.input_names:
            tensor = blob[name]
            shape = tuple(tensor.shape)
            dtype = tensor.dtype
            input_shapes[name] = shape
            # Output shapes queried below depend on the input shapes set here.
            self.context.set_input_shape(name, shape)
            self.gpu_buffers[name] = torch.empty(shape, dtype=dtype, device=self.device)
            self.binding_addrs[name] = self.gpu_buffers[name].data_ptr()
            print(f"  - Input '{name}': allocated buffer with shape {shape}.")

        for name in self.output_names:
            shape = tuple(self.context.get_tensor_shape(name))
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            # Map the numpy dtype reported by TensorRT to the matching torch dtype.
            torch_dtype = torch.from_numpy(np.array(0, dtype=dtype)).dtype
            self.gpu_buffers[name] = torch.empty(shape, dtype=torch_dtype, device=self.device)
            self.binding_addrs[name] = self.gpu_buffers[name].data_ptr()
            print(f"  - Output '{name}': allocated buffer with shape {shape}.")

        self._input_shapes = input_shapes
        self.buffers_allocated = True
        print("[TRTInference] GPU buffers allocated successfully.")

    def __call__(self, blob: dict):
        """
        Executes inference on the loaded TensorRT engine.

        Args:
            blob (dict): A dictionary mapping input tensor names to their corresponding
                         torch.Tensor data.

        Returns:
            dict: A dictionary mapping output tensor names to the wrapper's own
                  output buffers (torch.Tensor on ``self.device``). The same
                  buffers are reused by later calls, so clone a result that
                  must outlive the next call.

        Raises:
            KeyError: If ``blob`` lacks one of the engine's input names.
            RuntimeError: If TensorRT reports that execution failed.
        """
        input_shapes = {name: tuple(blob[name].shape) for name in self.input_names}
        # Buffers are sized for specific input shapes. Without this check a
        # later call with a different shape would silently broadcast into, or
        # fail to copy into, the stale buffers.
        if not self.buffers_allocated or input_shapes != self._input_shapes:
            self._allocate_buffers(blob)

        for name in self.input_names:
            self.gpu_buffers[name].copy_(blob[name])

        # execute_v2 takes device addresses ordered by engine I/O tensor index,
        # which is not necessarily "all inputs, then all outputs".
        bindings = [
            self.binding_addrs[self.engine.get_tensor_name(i)]
            for i in range(self.engine.num_io_tensors)
        ]
        if not self.context.execute_v2(bindings=bindings):
            raise RuntimeError(
                f"TensorRT execution failed for engine '{self.engine_path}'.")

        return {name: self.gpu_buffers[name] for name in self.output_names}

# --- Visualization Utility Function ---
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

def visualize_detections(image_pil, boxes, scores, labels, class_names=COCO_CLASSES, threshold=0.5):
    """
    Draws bounding boxes on a copy of a PIL image. This function is a general-purpose utility.

    Args:
        image_pil (PIL.Image.Image): The image to draw on (left unmodified; a copy is drawn).
        boxes (torch.Tensor): A tensor of bounding boxes (shape: [N, 4]), read as
                              (xmin, ymin, xmax, ymax).
        scores (torch.Tensor): A tensor of confidence scores (shape: [N]).
        labels (torch.Tensor): A tensor of class labels (shape: [N]).
        class_names (list): A list of strings corresponding to class labels.
        threshold (float): Detections with a score below this value are skipped.

    Returns:
        PIL.Image.Image: The image with detections drawn on it.
    """
    img_draw = image_pil.copy()
    draw = ImageDraw.Draw(img_draw)

    # Ensure tensors are on CPU and converted to NumPy for processing
    boxes = boxes.cpu().numpy()
    scores = scores.cpu().numpy()
    labels = labels.cpu().numpy()

    # Load the font once instead of once per detection; fall back to PIL's
    # built-in font when arial.ttf cannot be opened.
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    color = 'red'  # Keep it simple or use a color map

    count = 0
    for box, score, label in zip(boxes, scores, labels):
        if score < threshold:
            continue

        count += 1
        label_idx = int(label)

        xmin, ymin, xmax, ymax = box
        # Guard both ends: a negative index would otherwise wrap around and
        # pick a class name from the end of the list.
        if 0 <= label_idx < len(class_names):
            class_name = class_names[label_idx]
        else:
            class_name = f'CLS-{label_idx}'

        draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=color, width=3)

        text = f"{class_name}: {score:.2f}"

        # Filled background behind the caption so the white text stays readable.
        text_bbox = draw.textbbox((xmin, ymin), text, font=font)
        draw.rectangle(text_bbox, fill=color)
        draw.text((xmin, ymin), text, fill="white", font=font)

    print(f"  - Found {count} objects above threshold {threshold}.")
    return img_draw

if __name__ == '__main__':
    import argparse
    import torchvision.transforms as T
    import os

    parser = argparse.ArgumentParser(description="Test script for the TRTInference wrapper.")
    parser.add_argument('--engine', type=str, required=True, help="Path to the TensorRT engine file.")
    parser.add_argument('--image', type=str, required=True, help="Path to the input image file.")
    parser.add_argument('--output', type=str, default='output.jpg', help="Path to save the output image with detections.")
    parser.add_argument('--device', type=str, default='cuda:0', help="Device to run inference on.")
    parser.add_argument('--threshold', type=float, default=0.5, help="Confidence threshold for displaying detections.")
    args = parser.parse_args()

    if not torch.cuda.is_available():
        raise SystemExit("CUDA is not available. This script requires a GPU.")

    print("--- TRTInference Wrapper Test ---")

    print("\n1. Initializing TRTInference...")
    trt_model = TRTInference(args.engine, device=args.device)

    print("\n2. Preprocessing input image...")
    image_pil = Image.open(args.image).convert('RGB')
    w, h = image_pil.size

    transforms = T.Compose([
        T.Resize((640, 640)),
        T.ToTensor(),
    ])

    image_tensor = transforms(image_pil).unsqueeze(0).to(args.device)
    orig_size_tensor = torch.tensor([[w, h]], dtype=torch.int64, device=args.device)

    blob = {
        'images': image_tensor,
        'orig_target_sizes': orig_size_tensor
    }
    print(f"  - Original image size: {w}x{h}")
    print(f"  - Input tensor shape: {image_tensor.shape}")

    print("\n3. Running inference...")
    # NOTE: this first call also pays for the lazy buffer allocation.
    start_time = time.time()
    output_gpu = trt_model(blob)
    torch.cuda.synchronize()
    end_time = time.time()

    print(f"\n4. Inference complete in { (end_time - start_time) * 1000:.2f} ms.")

    print("\n5. Post-processing and saving output image...")
    output_labels = output_gpu['labels'][0]
    output_boxes = output_gpu['boxes'][0]
    output_scores = output_gpu['scores'][0]

    # Use the new, separate visualization function
    result_image = visualize_detections(
        image_pil,
        output_boxes,
        output_scores,
        output_labels,
        threshold=args.threshold
    )

    result_image.save(args.output)
    print(f"  - Output image with detections saved to: {os.path.abspath(args.output)}")

    print("\n--- Test finished successfully ---")
import torch
import torch.nn as nn
import torchvision.transforms as T

import numpy as np
from PIL import Image, ImageDraw

from src.core import YAMLConfig


def draw(images, labels, boxes, scores, thrh = 0.6):
    """Draw detections scoring above ``thrh`` on each image and save the result.

    Args:
        images: sequence of PIL images; each one is drawn on in place.
        labels: per-image labels, indexed as ``labels[i]``.
        boxes: per-image boxes, indexed as ``boxes[i]``; the four values of a
            box are handed to ``ImageDraw.rectangle``.
        scores: per-image scores, indexed as ``scores[i]``.
        thrh: only detections with ``score > thrh`` are drawn.

    Side effects:
        Saves ``results_{i}.jpg`` (relative path) for every image.
    """
    for i, im in enumerate(images):
        canvas = ImageDraw.Draw(im)

        # Compute the threshold mask once and reuse it for all three tensors.
        keep = scores[i] > thrh
        lab = labels[i][keep]
        box = boxes[i][keep]
        scrs = scores[i][keep]

        for j, b in enumerate(box):
            canvas.rectangle(list(b), outline='red',)
            canvas.text((b[0], b[1]), text=f"{lab[j].item()} {round(scrs[j].item(),2)}", fill='blue', )

        im.save(f'results_{i}.jpg')


def main(args, ):
    """Load a checkpoint into the configured model, run one image, save the drawing.

    Args:
        args: namespace providing ``config``, ``resume`` (checkpoint path),
            ``im_file`` (input image path) and ``device``.

    Raises:
        AttributeError: If ``args.resume`` is not given.
    """
    cfg = YAMLConfig(args.config, resume=args.resume)

    if args.resume:
        # NOTE(review): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        checkpoint = torch.load(args.resume, map_location='cpu')
        # Prefer the EMA weights when the checkpoint carries them.
        if 'ema' in checkpoint:
            state = checkpoint['ema']['module']
        else:
            state = checkpoint['model']
    else:
        raise AttributeError('Only support resume to load model.state_dict by now.')

    # NOTE load train mode state -> convert to deploy mode
    cfg.model.load_state_dict(state)

    class Model(nn.Module):
        """Chains the deploy-mode model and postprocessor into one forward pass."""

        def __init__(self, ) -> None:
            super().__init__()
            self.model = cfg.model.deploy()
            self.postprocessor = cfg.postprocessor.deploy()

        def forward(self, images, orig_target_sizes):
            outputs = self.model(images)
            outputs = self.postprocessor(outputs, orig_target_sizes)
            return outputs

    model = Model().to(args.device)

    im_pil = Image.open(args.im_file).convert('RGB')
    w, h = im_pil.size
    # Original (width, height) with a leading batch axis.
    orig_size = torch.tensor([w, h])[None].to(args.device)

    transforms = T.Compose([
        T.Resize((640, 640)),
        T.ToTensor(),
    ])
    im_data = transforms(im_pil)[None].to(args.device)

    # Inference only: skip building an autograd graph for the forward pass.
    with torch.no_grad():
        output = model(im_data, orig_size)
    labels, boxes, scores = output

    draw([im_pil], labels, boxes, scores)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, )
    parser.add_argument('-r', '--resume', type=str, )
    parser.add_argument('-f', '--im-file', type=str, )
    parser.add_argument('-d', '--device', type=str, default='cpu')
    args = parser.parse_args()
    main(args)
'--im-file', type=str, ) + parser.add_argument('-d', '--device', type=str, default='cpu') + args = parser.parse_args() + main(args) diff --git a/rtdetrv2_pytorch/requirements.txt b/rtdetrv2_pytorch/requirements.txt new file mode 100644 index 0000000..24b7c2d --- /dev/null +++ b/rtdetrv2_pytorch/requirements.txt @@ -0,0 +1,9 @@ +torch>=2.0.1 +torchvision>=0.15.2 +faster-coco-eval>=1.6.6 +PyYAML +tensorboard +scipy +pycocotools +onnx +onnxruntime-gpu \ No newline at end of file diff --git a/rtdetrv2_pytorch/src/__init__.py b/rtdetrv2_pytorch/src/__init__.py new file mode 100644 index 0000000..5901b01 --- /dev/null +++ b/rtdetrv2_pytorch/src/__init__.py @@ -0,0 +1,8 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +# for register purpose +from . import optim +from . import data +from . import nn +from . import zoo \ No newline at end of file diff --git a/rtdetrv2_pytorch/src/core/__init__.py b/rtdetrv2_pytorch/src/core/__init__.py new file mode 100644 index 0000000..f9ca39f --- /dev/null +++ b/rtdetrv2_pytorch/src/core/__init__.py @@ -0,0 +1,7 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +from .workspace import GLOBAL_CONFIG, register, create +from .yaml_utils import * +from ._config import BaseConfig +from .yaml_config import YAMLConfig diff --git a/rtdetrv2_pytorch/src/core/_config.py b/rtdetrv2_pytorch/src/core/_config.py new file mode 100644 index 0000000..0bc5aeb --- /dev/null +++ b/rtdetrv2_pytorch/src/core/_config.py @@ -0,0 +1,290 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler +from torch.cuda.amp.grad_scaler import GradScaler +from torch.utils.tensorboard import SummaryWriter + +from pathlib import Path +from typing import Callable, List, Dict + + +__all__ = ['BaseConfig', ] + + +class BaseConfig(object): + # TODO property + + def __init__(self) -> None: + super().__init__() + + self.task :str = None + + # instance / function + self._model :nn.Module = None + self._postprocessor :nn.Module = None + self._criterion :nn.Module = None + self._optimizer :Optimizer = None + self._lr_scheduler :LRScheduler = None + self._lr_warmup_scheduler: LRScheduler = None + self._train_dataloader :DataLoader = None + self._val_dataloader :DataLoader = None + self._ema :nn.Module = None + self._scaler :GradScaler = None + self._train_dataset :Dataset = None + self._val_dataset :Dataset = None + self._collate_fn :Callable = None + self._evaluator :Callable[[nn.Module, DataLoader, str], ] = None + self._writer: SummaryWriter = None + + # dataset + self.num_workers :int = 0 + self.batch_size :int = None + self._train_batch_size :int = None + self._val_batch_size :int = None + self._train_shuffle: bool = None + self._val_shuffle: bool = None + + # runtime + self.resume :str = None + self.tuning :str = None + + self.epoches :int = None + self.last_epoch :int = -1 + + self.use_amp :bool = False + self.use_ema :bool = False + self.ema_decay :float = 0.9999 + self.ema_warmups: int = 2000 + self.sync_bn :bool = False + self.clip_max_norm : float = 0. 
+ self.find_unused_parameters :bool = None + + self.seed :int = None + self.print_freq :int = None + self.checkpoint_freq :int = 1 + self.output_dir :str = None + self.summary_dir :str = None + self.device : str = '' + + @property + def model(self, ) -> nn.Module: + return self._model + + @model.setter + def model(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._model = m + + @property + def postprocessor(self, ) -> nn.Module: + return self._postprocessor + + @postprocessor.setter + def postprocessor(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._postprocessor = m + + @property + def criterion(self, ) -> nn.Module: + return self._criterion + + @criterion.setter + def criterion(self, m): + assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' + self._criterion = m + + @property + def optimizer(self, ) -> Optimizer: + return self._optimizer + + @optimizer.setter + def optimizer(self, m): + assert isinstance(m, Optimizer), f'{type(m)} != optim.Optimizer, please check your model class' + self._optimizer = m + + @property + def lr_scheduler(self, ) -> LRScheduler: + return self._lr_scheduler + + @lr_scheduler.setter + def lr_scheduler(self, m): + assert isinstance(m, LRScheduler), f'{type(m)} != LRScheduler, please check your model class' + self._lr_scheduler = m + + @property + def lr_warmup_scheduler(self, ) -> LRScheduler: + return self._lr_warmup_scheduler + + @lr_warmup_scheduler.setter + def lr_warmup_scheduler(self, m): + self._lr_warmup_scheduler = m + + @property + def train_dataloader(self) -> DataLoader: + if self._train_dataloader is None and self.train_dataset is not None: + loader = DataLoader(self.train_dataset, + batch_size=self.train_batch_size, + num_workers=self.num_workers, + collate_fn=self.collate_fn, + shuffle=self.train_shuffle, ) + loader.shuffle = self.train_shuffle + 
self._train_dataloader = loader + + return self._train_dataloader + + @train_dataloader.setter + def train_dataloader(self, loader): + self._train_dataloader = loader + + @property + def val_dataloader(self) -> DataLoader: + if self._val_dataloader is None and self.val_dataset is not None: + loader = DataLoader(self.val_dataset, + batch_size=self.val_batch_size, + num_workers=self.num_workers, + drop_last=False, + collate_fn=self.collate_fn, + shuffle=self.val_shuffle) + loader.shuffle = self.val_shuffle + self._val_dataloader = loader + + return self._val_dataloader + + @val_dataloader.setter + def val_dataloader(self, loader): + self._val_dataloader = loader + + @property + def ema(self, ) -> nn.Module: + if self._ema is None and self.use_ema and self.model is not None: + from ..optim import ModelEMA + self._ema = ModelEMA(self.model, self.ema_decay, self.ema_warmups) + return self._ema + + @ema.setter + def ema(self, obj): + self._ema = obj + + @property + def scaler(self) -> GradScaler: + if self._scaler is None and self.use_amp and torch.cuda.is_available(): + self._scaler = GradScaler() + return self._scaler + + @scaler.setter + def scaler(self, obj: GradScaler): + self._scaler = obj + + @property + def val_shuffle(self) -> bool: + if self._val_shuffle is None: + print('warning: set default val_shuffle=False') + return False + return self._val_shuffle + + @val_shuffle.setter + def val_shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be bool' + self._val_shuffle = shuffle + + @property + def train_shuffle(self) -> bool: + if self._train_shuffle is None: + print('warning: set default train_shuffle=True') + return True + return self._train_shuffle + + @train_shuffle.setter + def train_shuffle(self, shuffle): + assert isinstance(shuffle, bool), 'shuffle must be bool' + self._train_shuffle = shuffle + + + @property + def train_batch_size(self) -> int: + if self._train_batch_size is None and isinstance(self.batch_size, int): + 
print(f'warning: set train_batch_size=batch_size={self.batch_size}') + return self.batch_size + return self._train_batch_size + + @train_batch_size.setter + def train_batch_size(self, batch_size): + assert isinstance(batch_size, int), 'batch_size must be int' + self._train_batch_size = batch_size + + @property + def val_batch_size(self) -> int: + if self._val_batch_size is None: + print(f'warning: set val_batch_size=batch_size={self.batch_size}') + return self.batch_size + return self._val_batch_size + + @val_batch_size.setter + def val_batch_size(self, batch_size): + assert isinstance(batch_size, int), 'batch_size must be int' + self._val_batch_size = batch_size + + + @property + def train_dataset(self) -> Dataset: + return self._train_dataset + + @train_dataset.setter + def train_dataset(self, dataset): + assert isinstance(dataset, Dataset), f'{type(dataset)} must be Dataset' + self._train_dataset = dataset + + + @property + def val_dataset(self) -> Dataset: + return self._val_dataset + + @val_dataset.setter + def val_dataset(self, dataset): + assert isinstance(dataset, Dataset), f'{type(dataset)} must be Dataset' + self._val_dataset = dataset + + @property + def collate_fn(self) -> Callable: + return self._collate_fn + + @collate_fn.setter + def collate_fn(self, fn): + assert isinstance(fn, Callable), f'{type(fn)} must be Callable' + self._collate_fn = fn + + @property + def evaluator(self) -> Callable: + return self._evaluator + + @evaluator.setter + def evaluator(self, fn): + assert isinstance(fn, Callable), f'{type(fn)} must be Callable' + self._evaluator = fn + + @property + def writer(self) -> SummaryWriter: + if self._writer is None: + if self.summary_dir: + self._writer = SummaryWriter(self.summary_dir) + elif self.output_dir: + self._writer = SummaryWriter(Path(self.output_dir) / 'summary') + return self._writer + + @writer.setter + def writer(self, m): + assert isinstance(m, SummaryWriter), f'{type(m)} must be SummaryWriter' + self._writer = m + + def 
def register(dct :Any=GLOBAL_CONFIG, name=None, force=False):
    """Return a decorator that registers a function or class into ``dct``.

    Args:
        dct: registration target.
            If it is a dict, the object is stored as a key-value pair.
            If it is a class, a decorated function is attached as an attribute.
        name: key to register under; defaults to the decorated object's
            ``__name__``. The same key is used for the duplicate check and
            for the actual registration.
        force: when True, skip the "already registered" check and overwrite.

    Returns:
        A decorator. For a function it returns the registered wrapper; for a
        class it stores ``extract_schema(cls)`` in ``dct`` and returns the
        class itself unchanged.

    Raises:
        AssertionError: the key is already taken and ``force`` is False.
        AttributeError: a function is registered into something that is
            neither a dict nor a class.
        ValueError: the decorated object is neither a function nor a class.
    """
    def decorator(foo):
        register_name = foo.__name__ if name is None else name

        # The duplicate check must look at the key that will really be
        # written, otherwise an explicit `name` can silently overwrite an
        # existing entry or be rejected because of an unrelated `__name__`.
        if not force:
            if inspect.isclass(dct):
                assert not hasattr(dct, register_name), \
                    f'module {dct.__name__} has {register_name}'
            else:
                assert register_name not in dct, \
                    f'{register_name} has been already registered'

        if inspect.isfunction(foo):
            @functools.wraps(foo)
            def wrap_func(*args, **kwargs):
                return foo(*args, **kwargs)

            if isinstance(dct, dict):
                dct[register_name] = wrap_func
            elif inspect.isclass(dct):
                setattr(dct, register_name, wrap_func)
            else:
                raise AttributeError(
                    f'Cannot register {register_name} into {type(dct)}, expected a dict or a class')
            return wrap_func

        elif inspect.isclass(foo):
            # Classes are stored as a schema dict (defaults, inject/share lists), not as the class.
            dct[register_name] = extract_schema(foo)

        else:
            raise ValueError(f'Do not support {type(foo)} register')

        return foo

    return decorator
def create(type_or_name, global_cfg=GLOBAL_CONFIG, **kwargs):
    """Build an object for a registered name using the entries of ``global_cfg``.

    Args:
        type_or_name: a class or a name (str); for a class its ``__name__``
            is used as the lookup key.
        global_cfg: mapping from names to registered schemas, config dicts,
            or other objects.
        **kwargs: extra entries layered on top of the config. They are only
            applied in the ``type``-style branch below.

    Returns:
        ``global_cfg[name]`` itself when that entry exposes ``__dict__``,
        otherwise the result of calling the resolved class with the
        assembled keyword arguments.

    Raises:
        AssertionError: ``type_or_name`` is neither a ``type`` nor a ``str``.
        ValueError: ``name`` or an injected dependency is missing from
            ``global_cfg``, an inject dict has no ``type`` key, or an inject
            value is neither None, a str nor a dict.
    """
    assert type(type_or_name) in (type, str), 'create should be modules or name.'

    name = type_or_name if isinstance(type_or_name, str) else type_or_name.__name__

    if name in global_cfg:
        # Entries exposing `__dict__` are returned untouched
        # (presumably objects that were already instantiated; TODO confirm).
        if hasattr(global_cfg[name], '__dict__'):
            return global_cfg[name]
    else:
        raise ValueError('The module {} is not registered'.format(name))

    cfg = global_cfg[name]

    # `type`-style entry: the config names the registered class through its `type` key.
    if isinstance(cfg, dict) and 'type' in cfg:
        # NOTE: `_cfg` is the schema dict stored in `global_cfg`; the deletes
        # and updates below modify that stored dict in place.
        _cfg: dict = global_cfg[cfg['type']]
        # clean args
        _keys = [k for k in _cfg.keys() if not k.startswith('_')]
        for _arg in _keys:
            del _cfg[_arg]
        _cfg.update(_cfg['_kwargs']) # restore default args
        _cfg.update(cfg) # load config args
        _cfg.update(kwargs) # TODO receive extra kwargs
        name = _cfg.pop('type') # pop extra key `type` (from cfg)

        # Recurse with the resolved class name; the schema now carries the merged arguments.
        return create(name, global_cfg)

    # Resolve the class object from the module recorded in the schema.
    module = getattr(cfg['_pymodule'], name)
    module_kwargs = {}
    module_kwargs.update(cfg)

    # shared var: a value present in global_cfg takes precedence over the schema's own value
    for k in cfg['_share']:
        if k in global_cfg:
            module_kwargs[k] = global_cfg[k]
        else:
            module_kwargs[k] = cfg[k]

    # inject: replace the listed arguments with objects built from other config entries
    for k in cfg['_inject']:
        _k = cfg[k]

        if _k is None:
            continue

        if isinstance(_k, str):
            # The argument refers to another entry of global_cfg by name.
            if _k not in global_cfg:
                raise ValueError(f'Missing inject config of {_k}.')

            _cfg = global_cfg[_k]

            if isinstance(_cfg, dict):
                module_kwargs[k] = create(_cfg['_name'], global_cfg)
            else:
                # Non-dict entries are passed through as they are.
                module_kwargs[k] = _cfg

        elif isinstance(_k, dict):
            # Inline `type`-style definition of the dependency.
            if 'type' not in _k.keys():
                raise ValueError(f'Missing inject for `type` style.')

            _type = str(_k['type'])
            if _type not in global_cfg:
                raise ValueError(f'Missing {_type} in inspect stage.')

            # TODO
            # Same reset-then-overlay sequence as the top-level `type` branch;
            # it also edits the stored schema dict in place.
            _cfg: dict = global_cfg[_type]
            # clean args
            _keys = [k for k in _cfg.keys() if not k.startswith('_')]
            for _arg in _keys:
                del _cfg[_arg]
            _cfg.update(_cfg['_kwargs']) # restore default values
            _cfg.update(_k) # load config args
            name = _cfg.pop('type') # pop extra key (`type` from _k)
            module_kwargs[k] = create(name, global_cfg)

        else:
            raise ValueError(f'Inject does not support {_k}')

    # TODO hard code
    # Drop the schema bookkeeping keys (`_name`, `_pymodule`, ...) before calling the class.
    module_kwargs = {k: v for k, v in module_kwargs.items() if not k.startswith('_')}

    # TODO for **kwargs
    # extra_args = set(module_kwargs.keys()) - set(arg_names)
    # if len(extra_args) > 0:
    #     raise RuntimeError(f'Error: unknown args {extra_args} for {module}')

    return module(**module_kwargs)
class YAMLConfig(BaseConfig):
    """Configuration backed by a YAML file whose components are built lazily.

    Each component property builds its object with ``create`` on first access,
    when the private slot is still None and the matching key exists in
    ``yaml_cfg``, and then defers to the ``BaseConfig`` getter.

    NOTE(review): the overrides below define getters only, so assigning to
    e.g. ``cfg.model`` on a YAMLConfig instance does not go through the
    ``BaseConfig`` setters. Confirm this is intended.
    """

    def __init__(self, cfg_path: str, **kwargs) -> None:
        """Load ``cfg_path`` and overlay ``kwargs`` on the loaded config.

        Args:
            cfg_path: path of the YAML config file passed to ``load_config``.
            **kwargs: overrides merged into the loaded config with ``merge_dict``.
        """
        super().__init__()

        cfg = load_config(cfg_path)
        cfg = merge_dict(cfg, kwargs)

        # Keep a private copy so later edits to `cfg` do not leak in.
        self.yaml_cfg = copy.deepcopy(cfg)

        # Copy every public attribute that BaseConfig.__init__ defined and that
        # the config also provides; `super().__dict__` resolves to this
        # instance's attribute dict.
        for k in super().__dict__:
            if not k.startswith('_') and k in cfg:
                self.__dict__[k] = cfg[k]

    @property
    def global_cfg(self, ):
        """Return a fresh merge of ``yaml_cfg`` with the global registry.

        It is recomputed on every access. ``merge_config`` is called with
        ``inplace=False`` and ``overwrite=False``, so ``yaml_cfg`` is left
        untouched and its existing values win.
        """
        return merge_config(self.yaml_cfg, inplace=False, overwrite=False)

    @property
    def model(self, ) -> torch.nn.Module:
        """Model built from ``yaml_cfg['model']`` on first access."""
        if self._model is None and 'model' in self.yaml_cfg:
            # presumably yaml_cfg['model'] is the registered class name; verify against the configs
            self._model = create(self.yaml_cfg['model'], self.global_cfg)
        return super().model

    @property
    def postprocessor(self, ) -> torch.nn.Module:
        """Postprocessor built from ``yaml_cfg['postprocessor']`` on first access."""
        if self._postprocessor is None and 'postprocessor' in self.yaml_cfg:
            self._postprocessor = create(self.yaml_cfg['postprocessor'], self.global_cfg)
        return super().postprocessor

    @property
    def criterion(self, ) -> torch.nn.Module:
        """Criterion built from ``yaml_cfg['criterion']`` on first access."""
        if self._criterion is None and 'criterion' in self.yaml_cfg:
            self._criterion = create(self.yaml_cfg['criterion'], self.global_cfg)
        return super().criterion

    @property
    def optimizer(self, ) -> optim.Optimizer:
        """Optimizer built on first access from the ``optimizer`` config entry.

        The parameter groups come from ``get_optim_params``; reading
        ``self.model`` here builds the model first if needed.
        """
        if self._optimizer is None and 'optimizer' in self.yaml_cfg:
            params = self.get_optim_params(self.yaml_cfg['optimizer'], self.model)
            self._optimizer = create('optimizer', self.global_cfg, params=params)
        return super().optimizer

    @property
    def lr_scheduler(self, ) -> optim.lr_scheduler.LRScheduler:
        """LR scheduler built on first access, bound to ``self.optimizer``."""
        if self._lr_scheduler is None and 'lr_scheduler' in self.yaml_cfg:
            self._lr_scheduler = create('lr_scheduler', self.global_cfg, optimizer=self.optimizer)
            print(f'Initial lr: {self._lr_scheduler.get_last_lr()}')
        return super().lr_scheduler

    @property
    def lr_warmup_scheduler(self, ) -> optim.lr_scheduler.LRScheduler:
        """Warmup scheduler built on first access, wrapping ``self.lr_scheduler``."""
        if self._lr_warmup_scheduler is None and 'lr_warmup_scheduler' in self.yaml_cfg :
            self._lr_warmup_scheduler = create('lr_warmup_scheduler', self.global_cfg, lr_scheduler=self.lr_scheduler)
        return super().lr_warmup_scheduler

    @property
    def train_dataloader(self, ) -> DataLoader:
        """Training dataloader built through ``build_dataloader`` on first access."""
        if self._train_dataloader is None and 'train_dataloader' in self.yaml_cfg:
            self._train_dataloader = self.build_dataloader('train_dataloader')
        return super().train_dataloader

    @property
    def val_dataloader(self, ) -> DataLoader:
        """Validation dataloader built through ``build_dataloader`` on first access."""
        if self._val_dataloader is None and 'val_dataloader' in self.yaml_cfg:
            self._val_dataloader = self.build_dataloader('val_dataloader')
        return super().val_dataloader

    @property
    def ema(self, ) -> torch.nn.Module:
        """EMA wrapper built on first access when ``use_ema`` is truthy in the YAML config."""
        if self._ema is None and self.yaml_cfg.get('use_ema', False):
            self._ema = create('ema', self.global_cfg, model=self.model)
        return super().ema

    @property
    def scaler(self, ):
        """Grad scaler built on first access when ``use_amp`` is truthy in the YAML config."""
        if self._scaler is None and self.yaml_cfg.get('use_amp', False):
            self._scaler = create('scaler', self.global_cfg)
        return super().scaler

    @property
    def evaluator(self, ):
        """Evaluator built on first access from the ``evaluator`` config entry.

        Raises:
            NotImplementedError: the configured evaluator ``type`` is not
                ``'CocoEvaluator'``.
        """
        if self._evaluator is None and 'evaluator' in self.yaml_cfg:
            if self.yaml_cfg['evaluator']['type'] == 'CocoEvaluator':
                # Imported lazily inside the branch; the ground-truth API object
                # is derived from the validation dataset.
                from ..data import get_coco_api_from_dataset
                base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset)
                self._evaluator = create('evaluator', self.global_cfg, coco_gt=base_ds)
            else:
                raise NotImplementedError(f"{self.yaml_cfg['evaluator']['type']}")
        return super().evaluator

    @staticmethod
    def get_optim_params(cfg: dict, model: nn.Module):
        """Build optimizer parameter groups from regex patterns on parameter names.

        Pattern examples:
            ^(?=.*a)(?=.*b).*$ means including a and b
            ^(?=.*(?:a|b)).*$ means including a or b
            ^(?=.*a)(?!.*b).*$ means including a, but not b

        Args:
            cfg: optimizer config; must contain ``type``. An optional
                ``params`` list holds group dicts whose ``params`` value is a
                regex matched against parameter names.
            model: module whose trainable parameters are grouped.

        Returns:
            ``model.parameters()`` when ``cfg`` has no ``params`` key,
            otherwise a list of group dicts. Trainable parameters matched by
            no pattern are collected into one extra group without options.

        Raises:
            AssertionError: ``type`` is missing, ``params`` is not a list, or
                the number of grouped names differs from the number of
                trainable parameters (e.g. a name matched by several patterns).
        """
        assert 'type' in cfg, ''
        # Work on a copy: the group dicts below are edited in place.
        cfg = copy.deepcopy(cfg)

        if 'params' not in cfg:
            return model.parameters()

        assert isinstance(cfg['params'], list), ''

        param_groups = []
        visited = []
        for pg in cfg['params']:
            pattern = pg['params']
            params = {k: v for k, v in model.named_parameters() if v.requires_grad and len(re.findall(pattern, k)) > 0}
            # Swap the regex for the matched parameters; other keys of `pg` are kept as group options.
            pg['params'] = params.values()
            param_groups.append(pg)
            visited.extend(list(params.keys()))
            # print(params.keys())

        names = [k for k, v in model.named_parameters() if v.requires_grad]

        # Leftover trainable parameters go into a final group with no extra options.
        if len(visited) < len(names):
            unseen = set(names) - set(visited)
            params = {k: v for k, v in model.named_parameters() if v.requires_grad and k in unseen}
            param_groups.append({'params': params.values()})
            visited.extend(list(params.keys()))
            # print(params.keys())

        assert len(visited) == len(names), ''

        return param_groups

    @staticmethod
    def get_rank_batch_size(cfg):
        """Return the per-rank batch size for a dataloader config.

        Exactly one of ``batch_size`` and ``total_batch_size`` must be given.
        ``total_batch_size`` is divided by the world size.

        Raises:
            AssertionError: both or neither key is present, or
                ``total_batch_size`` is not divisible by the world size.
        """
        assert ('total_batch_size' in cfg or 'batch_size' in cfg) \
            and not ('total_batch_size' in cfg and 'batch_size' in cfg), \
                '`batch_size` or `total_batch_size` should be choosed one'

        total_batch_size = cfg.get('total_batch_size', None)
        if total_batch_size is None:
            bs = cfg.get('batch_size')
        else:
            from ..misc import dist_utils
            assert total_batch_size % dist_utils.get_world_size() == 0, \
                'total_batch_size should be divisible by world size'
            bs = total_batch_size // dist_utils.get_world_size()
        return bs

    def build_dataloader(self, name: str):
        """Create the dataloader configured under ``name`` with the per-rank batch size.

        Args:
            name: key of the dataloader entry in the YAML config.

        Returns:
            The created loader, with ``shuffle`` set from the config
            (False when the config has no ``shuffle`` key).
        """
        bs = self.get_rank_batch_size(self.yaml_cfg[name])
        # One merged snapshot is reused, so the pop below affects the config handed to `create`.
        global_cfg = self.global_cfg
        if 'total_batch_size' in global_cfg[name]:
            # pop unexpected key for dataloader init
            _ = global_cfg[name].pop('total_batch_size')
        print(f'building {name} with batch_size={bs}...')
        loader = create(name, global_cfg, batch_size=bs)
        loader.shuffle = self.yaml_cfg[name].get('shuffle', False)
        return loader
def load_config(file_path, cfg=None):
    """Load a YAML config file, resolving its ``__include__`` list first.

    Included files are loaded recursively and merged into ``cfg`` before the
    content of ``file_path`` itself, so values in ``file_path`` override
    values coming from its includes.

    Args:
        file_path: path to a ``.yml`` / ``.yaml`` file.
        cfg: dict to merge into. When omitted, a fresh dict is created for
            this call.

    Returns:
        The merged config dict (``cfg`` itself, updated in place), or an
        empty dict when ``file_path`` is an empty YAML document.

    Raises:
        AssertionError: the file extension is not ``.yml`` or ``.yaml``.
    """
    # A `dict()` default is created once and shared by every call, which lets
    # the keys of one loaded config leak into the next one loaded in-process.
    if cfg is None:
        cfg = {}

    _, ext = os.path.splitext(file_path)
    assert ext in ['.yml', '.yaml'], "only support yaml files"

    with open(file_path) as f:
        # NOTE(security): yaml.Loader can construct arbitrary Python objects
        # from tagged YAML; only load config files from trusted sources.
        file_cfg = yaml.load(f, Loader=yaml.Loader)
        if file_cfg is None:
            return {}

    if INCLUDE_KEY in file_cfg:
        base_yamls = list(file_cfg[INCLUDE_KEY])
        for base_yaml in base_yamls:
            if base_yaml.startswith('~'):
                base_yaml = os.path.expanduser(base_yaml)

            # Relative includes are resolved against the including file's directory.
            if not base_yaml.startswith('/'):
                base_yaml = os.path.join(os.path.dirname(file_path), base_yaml)

            # The recursive call opens the file itself; no extra handle is needed here.
            base_cfg = load_config(base_yaml, cfg)
            merge_dict(cfg, base_cfg)

    return merge_dict(cfg, file_cfg)
def merge_config(cfg, another_cfg=GLOBAL_CONFIG, inplace: bool=False, overwrite: bool=False):
    """Fill ``cfg`` with the entries of ``another_cfg`` and return the result.

    Keys missing from ``cfg`` are taken from ``another_cfg``. When both sides
    hold a dict under the same key, the merge recurses into it. Any other
    conflicting key keeps the value of ``cfg`` unless ``overwrite`` is True.

    Args:
        cfg: config to merge into.
        another_cfg: config providing the extra entries; defaults to the
            global registry.
        inplace: when False, a deep copy of ``cfg`` is merged and returned and
            ``cfg`` itself is left untouched.
        overwrite: whether conflicting non-dict values are replaced.

    Example:

        cfg1 = load_config('./rtdetrv2_r18vd_6x_coco.yml')
        cfg1 = merge_config(cfg1, inplace=True)

        cfg2 = load_config('./rtdetr_r50vd_6x_coco.yml')
        cfg2 = merge_config(cfg2, inplace=True)

        model1 = create(cfg1['model'], cfg1)
        model2 = create(cfg2['model'], cfg2)
    """
    merged = cfg if inplace else copy.deepcopy(cfg)

    def _fill(dst, src):
        for key in src:
            incoming = src[key]
            if key not in dst:
                # Stored by reference: nested values stay shared with `src`.
                dst[key] = incoming
                continue

            existing = dst[key]
            if isinstance(existing, dict) and isinstance(incoming, dict):
                _fill(existing, incoming)
            elif overwrite:
                dst[key] = incoming

    _fill(merged, another_cfg)
    return merged
def convert_to_tv_tensor(tensor: Tensor, key: str, box_format='xyxy', spatial_size=None) -> Tensor:
    """Wrap ``tensor`` in the torchvision tensor type selected by ``key``.

    Args:
        tensor (Tensor): input tensor
        key (str): ``'boxes'`` to build ``BoundingBoxes``, ``'masks'`` to build ``Mask``
        box_format (str): name of a ``BoundingBoxFormat`` member, case-insensitive;
            only used for boxes
        spatial_size: size passed under the second keyword of ``_boxes_keys``
            (which keyword depends on the torchvision version); only used for boxes

    Return:
        The wrapped tensor.
    """
    assert key in ('boxes', 'masks', ), "Only support 'boxes' and 'masks'"

    if key == 'masks':
        return Mask(tensor)

    if key == 'boxes':
        fmt = getattr(BoundingBoxFormat, box_format.upper())
        # Pair the version-dependent keyword names with their values.
        box_kwargs = {kw: value for kw, value in zip(_boxes_keys, (fmt, spatial_size))}
        return BoundingBoxes(tensor, **box_kwargs)
@register()
class BatchImageCollateFunction(BaseCollateFunction):
    """Collate ``(image, target)`` items into an image batch and a target list.

    While ``scales`` is set and the current epoch is below ``stop_epoch``,
    the whole batch is resized to a size drawn at random from ``scales``.
    """

    def __init__(self, scales=None, stop_epoch=None) -> None:
        super().__init__()
        self.scales = scales
        # Without an explicit stop epoch, use a limit large enough to never be reached.
        self.stop_epoch = 100000000 if stop_epoch is None else stop_epoch

    def __call__(self, items):
        images = torch.cat([sample[0][None] for sample in items], dim=0)
        targets = [sample[1] for sample in items]

        resize_active = self.scales is not None and self.epoch < self.stop_epoch
        if not resize_active:
            return images, targets

        size = random.choice(self.scales)
        images = F.interpolate(images, size=size)
        if 'masks' in targets[0]:
            for target in targets:
                target['masks'] = F.interpolate(target['masks'], size=size, mode='nearest')
            # Resizing batches that carry masks is not supported.
            raise NotImplementedError('')

        return images, targets
class DetDataset(data.Dataset):
    """Base detection dataset: load a raw item, then apply optional transforms.

    Subclasses implement ``load_item`` and provide a ``transforms`` attribute
    (None disables the transform step).
    """

    def __getitem__(self, index):
        image, target = self.load_item(index)
        transforms = self.transforms
        if transforms is None:
            return image, target
        # The dataset itself is passed as the third argument; the third return value is dropped.
        image, target, _ = transforms(image, target, self)
        return image, target

    def load_item(self, index):
        """Return the ``(image, target)`` pair for ``index`` before any transform."""
        raise NotImplementedError("Please implement this function to return item before `transforms`.")

    def set_epoch(self, epoch) -> None:
        """Record the current epoch."""
        self._epoch = epoch

    @property
    def epoch(self):
        """Epoch last set through ``set_epoch``, or -1 when none was set."""
        return getattr(self, '_epoch', -1)
@register()
class CocoDetection(FasterCocoDetection, DetDataset):
    """COCO-style detection dataset yielding ``(image, target)`` pairs.

    Raw annotations are converted by ``ConvertCocoPolysToMask``; boxes and
    masks are then wrapped with ``convert_to_tv_tensor`` before the optional
    transforms run.
    """
    __inject__ = ['transforms', ]
    __share__ = ['remap_mscoco_category']

    def __init__(self, img_folder, ann_file, transforms, return_masks=False, remap_mscoco_category=False):
        # Initialisation starts from the class after FasterCocoDetection in the MRO.
        super(FasterCocoDetection, self).__init__(img_folder, ann_file)
        self.img_folder = img_folder
        self.ann_file = ann_file
        self.return_masks = return_masks
        self.remap_mscoco_category = remap_mscoco_category
        self._transforms = transforms
        self.prepare = ConvertCocoPolysToMask(return_masks)

    def __getitem__(self, idx):
        image, target = self.load_item(idx)
        transforms = self._transforms
        if transforms is None:
            return image, target
        image, target, _ = transforms(image, target, self)
        return image, target

    def load_item(self, idx):
        """Return the prepared ``(image, target)`` for ``idx`` before transforms."""
        image, annotations = super(FasterCocoDetection, self).__getitem__(idx)
        raw_target = {'image_id': self.ids[idx], 'annotations': annotations}

        # The MSCOCO category-to-label table is only passed when remapping is enabled.
        prepare_kwargs = {'category2label': mscoco_category2label} if self.remap_mscoco_category else {}
        image, target = self.prepare(image, raw_target, **prepare_kwargs)

        target['idx'] = torch.tensor([idx])

        if 'boxes' in target:
            # `image.size` reversed is handed over as the spatial size.
            reversed_size = image.size[::-1]
            target['boxes'] = convert_to_tv_tensor(target['boxes'], key='boxes', spatial_size=reversed_size)

        if 'masks' in target:
            target['masks'] = convert_to_tv_tensor(target['masks'], key='masks')

        return image, target

    def extra_repr(self) -> str:
        pieces = [
            f' img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n',
            f' return_masks: {self.return_masks}\n',
        ]
        if getattr(self, '_transforms', None) is not None:
            pieces.append(f' transforms:\n {repr(self._transforms)}')
        if getattr(self, '_preset', None) is not None:
            pieces.append(f' preset:\n {repr(self._preset)}')
        return ''.join(pieces)

    @property
    def categories(self, ):
        """Category records of the loaded annotation file."""
        return self.coco.dataset['categories']

    @property
    def category2name(self, ):
        """Map category id to category name."""
        return {entry['id']: entry['name'] for entry in self.categories}

    @property
    def category2label(self, ):
        """Map category id to its position in ``categories``."""
        return {entry['id']: position for position, entry in enumerate(self.categories)}

    @property
    def label2category(self, ):
        """Map position in ``categories`` back to the category id."""
        return {position: entry['id'] for position, entry in enumerate(self.categories)}
class ConvertCocoPolysToMask(object):
    """Convert raw COCO annotations into the tensor ``target`` dict.

    Crowd annotations are dropped, boxes are converted from ``xywh`` to
    ``xyxy`` and clamped to the image, and every per-object field is filtered
    down to the boxes that still have positive width and height.
    """

    def __init__(self, return_masks=False):
        self.return_masks = return_masks

    def __call__(self, image: Image.Image, target, **kwargs):
        """Return ``(image, converted_target)``.

        Args:
            image: object exposing ``size`` as ``(width, height)``.
            target: dict with ``"image_id"`` and ``"annotations"`` (COCO objects).
            **kwargs: optional ``category2label`` mapping applied to category ids.
        """
        width, height = image.size
        img_id = torch.tensor([target["image_id"]])

        # Keep non-crowd objects; a missing flag counts as non-crowd.
        objs = [o for o in target["annotations"] if 'iscrowd' not in o or o['iscrowd'] == 0]

        # reshape(-1, 4) keeps the shape valid when there are no objects at all
        xyxy = torch.as_tensor([o["bbox"] for o in objs], dtype=torch.float32).reshape(-1, 4)
        xyxy[:, 2:] += xyxy[:, :2]  # xywh -> xyxy
        xyxy[:, 0::2].clamp_(min=0, max=width)
        xyxy[:, 1::2].clamp_(min=0, max=height)

        remap = kwargs.get('category2label', None)
        if remap is None:
            class_ids = [o["category_id"] for o in objs]
        else:
            class_ids = [remap[o["category_id"]] for o in objs]
        class_ids = torch.tensor(class_ids, dtype=torch.int64)

        masks = None
        if self.return_masks:
            masks = convert_coco_poly_to_mask([o["segmentation"] for o in objs], height, width)

        kpts = None
        if objs and "keypoints" in objs[0]:
            kpts = torch.as_tensor([o["keypoints"] for o in objs], dtype=torch.float32)
            count = kpts.shape[0]
            if count:
                kpts = kpts.view(count, -1, 3)

        # Boxes that collapsed to zero width/height (e.g. after clamping) are removed.
        valid = (xyxy[:, 3] > xyxy[:, 1]) & (xyxy[:, 2] > xyxy[:, 0])

        out = {"boxes": xyxy[valid], "labels": class_ids[valid]}
        if self.return_masks:
            out["masks"] = masks[valid]
        out["image_id"] = img_id
        if kpts is not None:
            out["keypoints"] = kpts[valid]

        # for conversion to coco api
        out["area"] = torch.tensor([o["area"] for o in objs])[valid]
        out["iscrowd"] = torch.tensor([o["iscrowd"] if "iscrowd" in o else 0 for o in objs])[valid]

        out["orig_size"] = torch.as_tensor([int(width), int(height)])
        # target["size"] = torch.as_tensor([int(w), int(h)])

        return image, out
class ConvertCocoPolysToMask:
    """Convert raw COCO annotations into a tensor ``target`` dict (masks always included).

    Crowd annotations are dropped, boxes go from ``xywh`` to ``xyxy`` and are
    clamped to the image, and all per-object fields are restricted to boxes
    with positive width and height.
    """

    def __call__(self, image, target):
        """Return ``(image, converted_target)``.

        Args:
            image: object exposing ``size`` as ``(width, height)``.
            target: dict with ``"image_id"`` and ``"annotations"`` (COCO objects).
        """
        w, h = image.size

        image_id = target["image_id"]

        # A missing ``iscrowd`` flag is treated as non-crowd instead of raising
        # KeyError, matching the converter in coco_dataset.py.
        anno = [obj for obj in target["annotations"] if obj.get("iscrowd", 0) == 0]

        boxes = [obj["bbox"] for obj in anno]
        # guard against no boxes via resizing
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)

        classes = [obj["category_id"] for obj in anno]
        classes = torch.tensor(classes, dtype=torch.int64)

        segmentations = [obj["segmentation"] for obj in anno]
        masks = convert_coco_poly_to_mask(segmentations, h, w)

        keypoints = None
        if anno and "keypoints" in anno[0]:
            keypoints = [obj["keypoints"] for obj in anno]
            keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
            num_keypoints = keypoints.shape[0]
            if num_keypoints:
                keypoints = keypoints.view(num_keypoints, -1, 3)

        # Drop boxes that have no extent (e.g. collapsed by the clamping above).
        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        classes = classes[keep]
        masks = masks[keep]
        if keypoints is not None:
            keypoints = keypoints[keep]

        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["masks"] = masks
        target["image_id"] = image_id
        if keypoints is not None:
            target["keypoints"] = keypoints

        # for conversion to coco api
        area = torch.tensor([obj["area"] for obj in anno])
        iscrowd = torch.tensor([obj.get("iscrowd", 0) for obj in anno])
        # Filter with ``keep`` as well: otherwise these tensors stay one entry
        # per *input* object and no longer line up with boxes/labels, which
        # consumers index in lockstep.
        target["area"] = area[keep]
        target["iscrowd"] = iscrowd[keep]

        return image, target
def convert_to_coco_api(ds):
    """Build an in-memory ``COCO`` index from a dataset exposing ``load_item``.

    Every sample is read through ``ds.load_item`` (i.e. before transforms); its
    boxes are converted from ``xyxy`` to ``xywh`` and one annotation record is
    written per box. The category list is the sorted set of labels seen.
    """
    coco_ds = COCO()
    image_records = []
    annotation_records = []
    seen_labels = set()
    # annotation IDs need to start at 1, not 0, see torchvision issue #1530
    next_ann_id = 1

    for sample_idx in range(len(ds)):
        # TODO (by lyuwenyu), load image and targets before `transforms`
        img, tgt = ds.load_item(sample_idx)
        width, height = img.size

        image_id = tgt["image_id"].item()
        image_records.append({"id": image_id, "width": width, "height": height})

        xywh = tgt["boxes"].clone()
        xywh[:, 2:] -= xywh[:, :2]  # xyxy -> xywh
        xywh = xywh.tolist()
        labels = tgt["labels"].tolist()
        areas = tgt["area"].tolist()
        crowd_flags = tgt["iscrowd"].tolist()

        has_masks = "masks" in tgt
        has_keypoints = "keypoints" in tgt
        if has_masks:
            # make masks Fortran contiguous for coco_mask
            masks = tgt["masks"].permute(0, 2, 1).contiguous().permute(0, 2, 1)
        if has_keypoints:
            kpts = tgt["keypoints"]
            kpts = kpts.reshape(kpts.shape[0], -1).tolist()

        for obj_idx, box in enumerate(xywh):
            label = labels[obj_idx]
            record = {"image_id": image_id, "bbox": box, "category_id": label}
            seen_labels.add(label)
            record["area"] = areas[obj_idx]
            record["iscrowd"] = crowd_flags[obj_idx]
            record["id"] = next_ann_id
            if has_masks:
                record["segmentation"] = mask_util.encode(masks[obj_idx].numpy())
            if has_keypoints:
                record["keypoints"] = kpts[obj_idx]
                # every third value is read as a flag; non-zero ones are counted
                record["num_keypoints"] = sum(k != 0 for k in kpts[obj_idx][2::3])
            annotation_records.append(record)
            next_ann_id += 1

    coco_ds.dataset = {
        "images": image_records,
        "categories": [{"id": label} for label in sorted(seen_labels)],
        "annotations": annotation_records,
    }
    coco_ds.createIndex()
    return coco_ds
@register()
class VOCDetection(torchvision.datasets.VOCDetection, DetDataset):
    """Pascal-VOC style detection dataset driven by plain-text list files.

    Args:
        root: dataset root; every path in the list files is relative to it.
        ann_file: text file with one ``<image path> <xml path>`` pair per line,
            separated by a single space.
        label_file: text file with one class name per line; the line index is
            the integer label.
        transforms: callable invoked as ``transforms(image, target, dataset)``
            and returning a triple, or ``None``.
    """
    __inject__ = ['transforms', ]

    def __init__(self, root: str, ann_file: str = "trainval.txt", label_file: str = "label_list.txt", transforms: Optional[Callable] = None):
        # NOTE: the torchvision parent constructor is not called; only the
        # attributes assigned below exist on the instance.
        with open(os.path.join(root, ann_file), 'r') as f:
            lines = [x.strip() for x in f]
        # Blank lines (e.g. a trailing newline) would otherwise fail on lin[1].
        lines = [x.split(' ') for x in lines if x]

        self.images = [os.path.join(root, lin[0]) for lin in lines]
        self.targets = [os.path.join(root, lin[1]) for lin in lines]
        assert len(self.images) == len(self.targets)

        # Fixed: `root + label_file` glued the two strings together without a
        # path separator, so the file was only found when `root` ended in one.
        with open(os.path.join(root, label_file), 'r') as f:
            labels = [lab.strip() for lab in f]

        self.transforms = transforms
        self.labels_map = {lab: i for i, lab in enumerate(labels)}

    def __getitem__(self, index: int):
        """Load sample ``index`` and run it through ``transforms``, if set."""
        image, target = self.load_item(index)
        if self.transforms is not None:
            image, target, _ = self.transforms(image, target, self)
        # target["orig_size"] = torch.tensor(TVF.get_image_size(image))
        return image, target

    def load_item(self, index: int):
        """Return the untransformed ``(image, target)`` pair for ``index``.

        ``target`` holds ``image_id``, ``boxes`` (xyxy), ``labels``, ``area``,
        ``iscrowd`` and ``orig_size`` (``[w, h]``).
        """
        # Close the file handle deterministically; convert() returns a loaded copy.
        with Image.open(self.images[index]) as raw:
            image = raw.convert("RGB")
        # Read the XML path from `self.targets`, which this class populates
        # itself, rather than relying on an inherited `annotations` alias.
        target = self.parse_voc_xml(ET_parse(self.targets[index]).getroot())

        output = {}
        output["image_id"] = torch.tensor([index])
        for k in ['area', 'boxes', 'labels', 'iscrowd']:
            output[k] = []

        for blob in target['annotation']['object']:
            bndbox = blob['bndbox']
            # Read coordinates by name: relying on the child order of <bndbox>
            # silently produces wrong boxes if a file lists them differently.
            box = [float(bndbox[k]) for k in ('xmin', 'ymin', 'xmax', 'ymax')]
            output["boxes"].append(box)
            output["labels"].append(blob['name'])
            output["area"].append((box[2] - box[0]) * (box[3] - box[1]))
            output["iscrowd"].append(0)

        w, h = image.size
        boxes = torch.tensor(output["boxes"]) if len(output["boxes"]) > 0 else torch.zeros(0, 4)
        output['boxes'] = convert_to_tv_tensor(boxes, 'boxes', box_format='xyxy', spatial_size=[h, w])
        # Explicit integer dtype: an empty list would otherwise yield a float tensor.
        output['labels'] = torch.tensor([self.labels_map[lab] for lab in output["labels"]], dtype=torch.int64)
        output['area'] = torch.tensor(output['area'])
        output["iscrowd"] = torch.tensor(output["iscrowd"], dtype=torch.int64)
        output["orig_size"] = torch.tensor([w, h])

        return image, output
@register()
class PadToSize(T.Pad):
    """Pad inputs on two sides so they reach a fixed target ``size``.

    The padding amounts are recomputed for every call from the first flattened
    input and cached on ``self.padding``; ``__call__`` additionally writes them
    into the target dict under ``'padding'``.
    """
    # Input types this transform is applied to.
    _transformed_types = (
        PIL.Image.Image,
        Image,
        Video,
        Mask,
        BoundingBoxes,
    )
    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        """Compute the padding needed to grow the first input to ``self.size``."""
        sp = F.get_spatial_size(flat_inputs[0])
        # presumably sp is (height, width), which would make self.size
        # (width, height) — TODO confirm against F.get_spatial_size.
        h, w = self.size[1] - sp[0], self.size[0] - sp[1]
        # NOTE(review): nothing here prevents negative values when the input
        # is already larger than `size` — confirm callers resize first.
        # presumably ordered [left, top, right, bottom], i.e. only the
        # right/bottom sides grow — verify against F.pad.
        self.padding = [0, 0, w, h]
        return dict(padding=self.padding)

    def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        """Alias of :meth:`_get_params` (presumably the hook name used by newer torchvision)."""
        return self._get_params(flat_inputs)

    def __init__(self, size, fill=0, padding_mode='constant') -> None:
        # An int is expanded to a square target size.
        if isinstance(size, int):
            size = (size, size)
        self.size = size
        # The parent is initialised with a placeholder padding of 0; the real
        # value is computed per call in _get_params.
        super().__init__(0, fill, padding_mode)

    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        """Pad a single input with the per-type fill value and the computed padding."""
        fill = self._fill[type(inpt)]
        padding = params['padding']
        return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type]

    def transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        """Alias of :meth:`_transform` (presumably the hook name used by newer torchvision)."""
        return self._transform(inpt, params)

    def __call__(self, *inputs: Any) -> Any:
        """Run the transform and record the applied padding in the target dict."""
        # Calls the parent's forward() directly rather than going through the
        # parent's __call__.
        outputs = super().forward(*inputs)
        # When the second output is a dict (the target), expose the padding to
        # downstream consumers.
        if len(outputs) > 1 and isinstance(outputs[1], dict):
            outputs[1]['padding'] = torch.tensor(self.padding)
        return outputs
@register()
class Compose(T.Compose):
    """Transform pipeline that can disable selected ops according to a policy.

    Args:
        ops: list whose entries are either config dicts (``{'type': <registered
            name>, **kwargs}``) or ready ``nn.Module`` instances. ``None``
            yields a pipeline holding a single ``EmptyTransform``.
        policy: dict with a ``'name'`` key selecting the forward strategy:
            ``'default'`` applies every transform; ``'stop_epoch'`` skips the
            transforms named in ``policy['ops']`` once the dataset epoch reaches
            ``policy['epoch']``; ``'stop_sample'`` skips them once
            ``policy['sample']`` samples have been processed by this instance.
            Defaults to ``{'name': 'default'}``.

    Raises:
        ValueError: if an entry of ``ops`` is neither a dict nor an ``nn.Module``.
    """

    def __init__(self, ops, policy=None) -> None:
        transforms = []
        if ops is not None:
            for op in ops:
                if isinstance(op, dict):
                    # Build the kwargs from a filtered copy instead of popping
                    # 'type' from the caller's dict: the config stays untouched
                    # even when the constructor below raises.
                    name = op['type']
                    kwargs = {k: v for k, v in op.items() if k != 'type'}
                    entry = GLOBAL_CONFIG[name]
                    transform_cls = getattr(entry['_pymodule'], entry['_name'])
                    transforms.append(transform_cls(**kwargs))

                elif isinstance(op, nn.Module):
                    transforms.append(op)

                else:
                    raise ValueError(
                        f'Each op must be a config dict or an nn.Module, got {type(op).__name__}'
                    )
        else:
            transforms = [EmptyTransform(), ]

        super().__init__(transforms=transforms)

        if policy is None:
            policy = {'name': 'default'}

        self.policy = policy
        # Number of samples seen so far; only advanced by stop_sample_forward.
        self.global_samples = 0

    def forward(self, *inputs: Any) -> Any:
        """Dispatch to the forward strategy selected by ``policy['name']``."""
        return self.get_forward(self.policy['name'])(*inputs)

    def get_forward(self, name):
        """Return the bound forward strategy for ``name`` (``KeyError`` if unknown)."""
        forwards = {
            'default': self.default_forward,
            'stop_epoch': self.stop_epoch_forward,
            'stop_sample': self.stop_sample_forward,
        }
        return forwards[name]

    def default_forward(self, *inputs: Any) -> Any:
        """Apply every transform in order."""
        sample = inputs if len(inputs) > 1 else inputs[0]
        for transform in self.transforms:
            sample = transform(sample)
        return sample

    def stop_epoch_forward(self, *inputs: Any):
        """Apply the transforms, skipping the policy ops once the stop epoch is reached."""
        sample = inputs if len(inputs) > 1 else inputs[0]
        # The dataset travels as the last element of the sample.
        dataset = sample[-1]

        cur_epoch = dataset.epoch
        policy_ops = self.policy['ops']
        policy_epoch = self.policy['epoch']

        for transform in self.transforms:
            if type(transform).__name__ in policy_ops and cur_epoch >= policy_epoch:
                continue
            sample = transform(sample)

        return sample

    def stop_sample_forward(self, *inputs: Any):
        """Apply the transforms, skipping the policy ops once enough samples were seen."""
        sample = inputs if len(inputs) > 1 else inputs[0]

        policy_ops = self.policy['ops']
        policy_sample = self.policy['sample']

        for transform in self.transforms:
            if type(transform).__name__ in policy_ops and self.global_samples >= policy_sample:
                continue
            sample = transform(sample)

        self.global_samples += 1

        return sample
def crop(image, target, region):
    """Crop ``image`` and update ``target`` to match.

    Args:
        image: image accepted by ``F.crop``.
        target: dict of per-object tensors; a shallow copy is modified and returned.
        region: ``(i, j, h, w)`` as passed on to ``F.crop``.

    Returns:
        ``(cropped_image, target)``. Boxes are shifted into the crop's frame
        and clipped to it, masks are sliced, and objects whose box (or, without
        boxes, whose mask) is empty after cropping are removed from every
        per-object field that is present.
    """
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area", "iscrowd"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        # shift into the crop's coordinate frame, then clip to [0, (w, h)]
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            # Targets are not guaranteed to carry every optional field
            # (e.g. no "iscrowd", or no "area" for mask-only targets);
            # filter only what is actually there instead of raising KeyError.
            if field in target:
                target[field] = target[field][keep]

    return cropped_image, target
def pad(image, target, padding):
    """Pad ``image`` by ``padding[0]`` / ``padding[1]`` and update ``target``.

    Returns ``(padded_image, target)``; ``target`` is a shallow copy whose
    ``"size"`` is the reversed ``padded_image.size`` and whose masks, when
    present, are padded by the same amounts. A ``None`` target is passed
    through as ``None``.
    """
    extra_x, extra_y = padding[0], padding[1]
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, extra_x, extra_y))
    if target is None:
        return padded_image, None

    updated = target.copy()
    # should we do something wrt the original size?
    updated["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in updated:
        # torch pads the last dimension first: (0, extra_x) applies to the last
        # axis and (0, extra_y) to the one before it.
        updated['masks'] = torch.nn.functional.pad(updated['masks'], (0, extra_x, 0, extra_y))
    return padded_image, updated
@register()
class Mosaic(T.Transform):
    """Mosaic augmentation: paste several dataset images onto one 2x2 canvas.

    Images are drawn at random from the dataset passed in with the sample,
    resized, pasted into a canvas twice the size of the first drawn image,
    then randomly affine-transformed and cropped.

    Args:
        size: forwarded to ``T.Resize``; also the crop size when ``max_size``
            is falsy.
        max_size: forwarded to ``T.Resize``; used as the crop size when given.
    """

    def __init__(self, size, max_size=None, ) -> None:
        super().__init__()
        self.resize = T.Resize(size=size, max_size=max_size)
        self.crop = T.RandomCrop(size=max_size if max_size else size)

        # TODO add arg `output_size` for affine`
        # self.random_perspective = T.RandomPerspective(distortion_scale=0.5, p=1., )
        self.random_affine = T.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.5, 1.5), fill=114)

    def forward(self, *inputs):
        """Return ``(image, target, dataset)`` with image/target replaced by a mosaic."""
        inputs = inputs if len(inputs) > 1 else inputs[0]
        image, target, dataset = inputs

        images = []
        targets = []
        # NOTE(review): the incoming `image`/`target` are overwritten in the loop
        # below and never used, and only three tiles are drawn although four
        # offsets exist — so the fourth quadrant keeps the canvas colour.
        # Confirm whether the incoming sample was meant to be one of the tiles.
        indices = random.choices(range(len(dataset)), k=3)
        for i in indices:
            image, target = dataset.load_item(i)
            image, target = self.resize(image, target)
            images.append(image)
            targets.append(target)

        # The canvas layout is derived from the first tile only.
        # NOTE(review): tiles of a different size would overlap or leave gaps —
        # confirm the resize above yields a uniform size.
        h, w = F.get_spatial_size(images[0])
        offset = [[0, 0], [w, 0], [0, h], [w, h]]
        image = Image.new(mode=images[0].mode, size=(w * 2, h * 2), color=0)
        for i, im in enumerate(images):
            image.paste(im, offset[i])

        # Each (x, y) offset is repeated to (x, y, x, y) so it can be added to
        # 4-value boxes.
        offset = torch.tensor([[0, 0], [w, 0], [0, h], [w, h]]).repeat(1, 2)
        target = {}
        for k in targets[0]:
            if k == 'boxes':
                v = [t[k] + offset[i] for i, t in enumerate(targets)]
            else:
                v = [t[k] for t in targets]

            # Tensor fields are concatenated across tiles; anything else stays
            # a list with one entry per tile.
            if isinstance(v[0], torch.Tensor):
                v = torch.cat(v, dim=0)

            target[k] = v

        if 'boxes' in target:
            # target['boxes'] = target['boxes'].clamp(0, 640 * 2 - 1)
            w, h = image.size
            target['boxes'] = convert_to_tv_tensor(target['boxes'], 'boxes', box_format='xyxy', spatial_size=[h, w])

        if 'masks' in target:
            # NOTE(review): masks are concatenated as-is, without being placed
            # on the enlarged canvas — confirm masks are supported here.
            target['masks'] = convert_to_tv_tensor(target['masks'], 'masks')

        image, target = self.random_affine(image, target)
        # image, target = self.resize(image, target)
        image, target = self.crop(image, target)

        return image, target, dataset
# ---- rtdetrv2_pytorch/src/misc/box_ops.py ----

def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """Pairwise GIoU via ``torchvision.ops.generalized_box_iou``.

    Both inputs must be well-formed ``(x1, y1, x2, y2)`` boxes
    (``x2 >= x1`` and ``y2 >= y1``); otherwise an AssertionError is raised.
    """
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    return torchvision.ops.generalized_box_iou(boxes1, boxes2)


# elementwise
def elementwise_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    """Row-by-row IoU of two equally sized box sets.

    Args:
        boxes1, [N, 4]
        boxes2, [N, 4]
    Returns:
        iou, [N, ]
        union, [N, ]

    Note: a pair whose union area is zero yields NaN (0 / 0).
    """
    area1 = torchvision.ops.box_area(boxes1)  # [N, ]
    area2 = torchvision.ops.box_area(boxes2)  # [N, ]
    top_left = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N, 2]
    bottom_right = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N, 2]
    wh = (bottom_right - top_left).clamp(min=0)  # [N, 2]
    inter = wh[:, 0] * wh[:, 1]  # [N, ]
    union = area1 + area2 - inter
    return inter / union, union


def elementwise_generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """Row-by-row generalized IoU.

    Args:
        boxes1, [N, 4] with [x1, y1, x2, y2]
        boxes2, [N, 4] with [x1, y1, x2, y2]
    Returns:
        giou, [N, ]
    """
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = elementwise_box_iou(boxes1, boxes2)
    # Smallest axis-aligned box enclosing each pair.
    top_left = torch.min(boxes1[:, :2], boxes2[:, :2])  # [N, 2]
    bottom_right = torch.max(boxes1[:, 2:], boxes2[:, 2:])  # [N, 2]
    wh = (bottom_right - top_left).clamp(min=0)  # [N, 2]
    enclosing_area = wh[:, 0] * wh[:, 1]
    return iou - (enclosing_area - union) / enclosing_area


def check_point_inside_box(points: Tensor, boxes: Tensor, eps=1e-9) -> Tensor:
    """Test every point against every box.

    Args:
        points, [K, 2], (x, y)
        boxes, [N, 4], (x1, y1, x2, y2)
        eps, margin a point must keep from every box edge
    Returns:
        Tensor (bool), [K, N]
    """
    px, py = [coord.unsqueeze(-1) for coord in points.unbind(-1)]  # [K, 1]
    x1, y1, x2, y2 = [edge.unsqueeze(0) for edge in boxes.unbind(-1)]  # [1, N]

    left = px - x1
    top = py - y1
    right = x2 - px
    bottom = y2 - py

    ltrb = torch.stack([left, top, right, bottom], dim=-1)
    # Inside means strictly positive distance to all four edges.
    return ltrb.min(dim=-1).values > eps


def point_box_distance(points: Tensor, boxes: Tensor) -> Tensor:
    """Distances from each point to the four edges of its box.

    Args:
        boxes, [N, 4], (x1, y1, x2, y2)
        points, [N, 2], (x, y)
    Returns:
        Tensor (N, 4), (l, t, r, b)
    """
    x1y1, x2y2 = torch.split(boxes, 2, dim=-1)
    lt = points - x1y1
    rb = x2y2 - points
    return torch.concat([lt, rb], dim=-1)


def point_distance_box(points: Tensor, distances: Tensor) -> Tensor:
    """Inverse of ``point_box_distance``: rebuild boxes from edge distances.

    Args:
        points (Tensor), [N, 2], (x, y)
        distances (Tensor), [N, 4], (l, t, r, b)
    Returns:
        boxes (Tensor), (N, 4), (x1, y1, x2, y2)
    """
    lt, rb = torch.split(distances, 2, dim=-1)
    x1y1 = points - lt
    x2y2 = points + rb
    return torch.concat([x1y1, x2y2], dim=-1)
# ---- rtdetrv2_pytorch/src/misc/dist_utils.py ----

def setup_distributed(print_rank: int=0, print_method: str='builtin', seed: int=None, ):
    """Try to initialise torch.distributed from the environment.

    args:
        print_rank, rank that is allowed to print
        print_method, (builtin, rich)
        seed, optional base seed passed to ``setup_seed``
    returns:
        True when the process group was initialised, False otherwise.
    """
    try:
        # https://pytorch.org/docs/stable/elastic/run.html
        local_rank = int(os.getenv('LOCAL_RANK', -1))

        # torch.distributed.init_process_group(backend=backend, init_method='env://')
        torch.distributed.init_process_group(init_method='env://')
        torch.distributed.barrier()

        rank = torch.distributed.get_rank()
        # The device index is per node, so prefer LOCAL_RANK; the global rank
        # is only a valid device index on a single node.
        torch.cuda.set_device(local_rank if local_rank >= 0 else rank)
        torch.cuda.empty_cache()
        enabled_dist = True
        print('Initialized distributed mode...')

    except Exception:
        # Best effort: fall back to single-process mode. KeyboardInterrupt /
        # SystemExit are deliberately no longer swallowed.
        enabled_dist = False
        print('Not init distributed mode.')

    setup_print(get_rank() == print_rank, method=print_method)
    if seed is not None:
        setup_seed(seed)

    return enabled_dist


def setup_print(is_main, method='builtin'):
    """Replace ``builtins.print`` so only the main process prints.

    A call can still print from any process by passing ``force=True``.
    """
    import builtins as __builtin__

    if method == 'builtin':
        builtin_print = __builtin__.print

    elif method == 'rich':
        import rich
        builtin_print = rich.print

    else:
        raise AttributeError(f"unknown print method '{method}', expected 'builtin' or 'rich'")

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if is_main or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_available_and_initialized():
    """Return True only if torch.distributed is available and initialised."""
    return torch.distributed.is_available() and torch.distributed.is_initialized()


@atexit.register
def cleanup():
    """cleanup distributed environment
    """
    if is_dist_available_and_initialized():
        torch.distributed.barrier()
        torch.distributed.destroy_process_group()


def get_rank():
    """Global rank, or 0 when not running distributed."""
    if not is_dist_available_and_initialized():
        return 0
    return torch.distributed.get_rank()


def get_world_size():
    """World size, or 1 when not running distributed."""
    if not is_dist_available_and_initialized():
        return 1
    return torch.distributed.get_world_size()


def is_main_process():
    return get_rank() == 0


def save_on_master(*args, **kwargs):
    """``torch.save`` that is a no-op on every rank except 0."""
    if is_main_process():
        torch.save(*args, **kwargs)


def warp_model(
    model: torch.nn.Module,
    sync_bn: bool=False,
    dist_mode: str='ddp',
    find_unused_parameters: bool=False,
    compile: bool=False,
    compile_mode: str='reduce-overhead',
    **kwargs
):
    """Wrap ``model`` in DP/DDP when distributed, then optionally compile it."""
    if is_dist_available_and_initialized():
        rank = get_rank()
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model) if sync_bn else model
        if dist_mode == 'dp':
            model = DP(model, device_ids=[rank], output_device=rank)
        elif dist_mode == 'ddp':
            model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=find_unused_parameters)
        else:
            raise AttributeError(f"unknown dist_mode '{dist_mode}', expected 'dp' or 'ddp'")

    if compile:
        model = torch.compile(model, mode=compile_mode)

    return model


def de_model(model):
    """Strip both the torch.compile and the DP/DDP wrappers."""
    return de_parallel(de_complie(model))


def warp_loader(loader, shuffle=False):
    """Rebuild ``loader`` with a DistributedSampler when distributed."""
    if is_dist_available_and_initialized():
        sampler = DistributedSampler(loader.dataset, shuffle=shuffle)
        loader = DataLoader(loader.dataset,
                            loader.batch_size,
                            sampler=sampler,
                            drop_last=loader.drop_last,
                            collate_fn=loader.collate_fn,
                            pin_memory=loader.pin_memory,
                            num_workers=loader.num_workers, )
    return loader


def is_parallel(model) -> bool:
    # Returns True if model is of type DP or DDP (subclasses included)
    return isinstance(model, (DP, DDP))


def de_parallel(model) -> nn.Module:
    # De-parallelize a model: returns single-GPU model if model is of type DP or DDP
    return model.module if is_parallel(model) else model


def reduce_dict(data, avg=True):
    """Sum (or average) a dict of tensors across all processes.

    Args
        data dict: input, {k: v, ...}
        avg bool: true
    """
    world_size = get_world_size()
    if world_size < 2:
        return data

    with torch.no_grad():
        # Sorted keys give every rank the same stacking order.
        keys = sorted(data.keys())
        values = torch.stack([data[k] for k in keys], dim=0)
        torch.distributed.all_reduce(values)

        if avg is True:
            values /= world_size

        return {k: v for k, v in zip(keys, values)}


def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]
    data_list = [None] * world_size
    torch.distributed.all_gather_object(data_list, data)
    return data_list


import time
def sync_time():
    """Wall-clock time taken after waiting for pending CUDA work."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    return time.time()


def setup_seed(seed: int, deterministic=False):
    """setup_seed for reproducibility
    torch.manual_seed(3407) is all you need. https://arxiv.org/abs/2109.08203
    """
    # Offset by rank so each process gets a different random stream.
    seed = seed + get_rank()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # memory will be large when setting deterministic to True
    if torch.backends.cudnn.is_available() and deterministic:
        torch.backends.cudnn.deterministic = True


# for torch.compile
def check_compile():
    """Warn unless the current GPU has compute capability 7.0, 8.0 or 9.0."""
    import torch
    import warnings
    gpu_ok = False
    if torch.cuda.is_available():
        device_cap = torch.cuda.get_device_capability()
        if device_cap in ((7, 0), (8, 0), (9, 0)):
            gpu_ok = True
    if not gpu_ok:
        warnings.warn(
            "GPU is not NVIDIA V100, A100, or H100. Speedup numbers may be lower "
            "than expected."
        )
    return gpu_ok


def is_compile(model):
    """True if ``model`` is a torch.compile wrapper."""
    import torch._dynamo
    return isinstance(model, torch._dynamo.OptimizedModule)


def de_complie(model):
    """Return the original module behind a torch.compile wrapper."""
    return model._orig_mod if is_compile(model) else model


# ---- rtdetrv2_pytorch/src/misc/lazy_loader.py ----
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/util/lazy_loader.py

class LazyLoader(types.ModuleType):
    """Lazily import a module, mainly to avoid pulling in large dependencies.

    `paddle`, and `ffmpeg` are examples of modules that are large and not always
    needed, and this allows them to only be loaded when they are used.
    """

    # The lint error here is incorrect.
    def __init__(self, local_name, parent_module_globals, name, warning=None):
        self._local_name = local_name
        self._parent_module_globals = parent_module_globals
        self._warning = warning

        # These members allows doctest correctly process this module member without
        # triggering self._load(). self._load() mutates parent_module_globals and
        # triggers a dict mutated during iteration error from doctest.py.
        # - for from_module()
        self.__module__ = name.rsplit(".", 1)[0]
        # - for is_routine()
        self.__wrapped__ = None

        super(LazyLoader, self).__init__(name)

    def _load(self):
        """Load the module and insert it into the parent's globals."""
        # Import the target module and insert it into the parent's namespace
        module = importlib.import_module(self.__name__)
        self._parent_module_globals[self._local_name] = module

        # Emit a warning if one was specified (previously it was silently dropped).
        if self._warning:
            import warnings
            warnings.warn(self._warning)
            # Make sure to only warn once.
            self._warning = None

        # Update this object's dict so that if someone keeps a reference to the
        # LazyLoader, lookups are efficient (__getattr__ is only called on lookups
        # that fail).
        self.__dict__.update(module.__dict__)

        return module

    def __getattr__(self, item):
        module = self._load()
        return getattr(module, item)

    def __repr__(self):
        # Carefully to not trigger _load, since repr may be called in very
        # sensitive places.
        return f"<LazyLoader {self.__name__} as {self._local_name}>"

    def __dir__(self):
        module = self._load()
        return dir(module)


# import paddle.nn as nn
# nn = LazyLoader("nn", globals(), "paddle.nn")

# class M(nn.Layer):
#     def __init__(self) -> None:
#         super().__init__()


# ---- rtdetrv2_pytorch/src/misc/logger.py ----
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# https://github.com/facebookresearch/detr/blob/main/util/misc.py
# Mostly copy-paste from torchvision references.

class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """

    def __init__(self, window_size=20, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt

    def update(self, value, n=1):
        """Record ``value`` observed ``n`` times."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        if not is_dist_available_and_initialized():
            return
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        tdist.barrier()
        tdist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]

    @property
    def median(self):
        d = torch.tensor(list(self.deque))
        return d.median().item()

    @property
    def avg(self):
        d = torch.tensor(list(self.deque), dtype=torch.float32)
        return d.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)


def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = tensor.numel()  # plain int, so the padding size below is an int too
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    tdist.all_gather(size_list, torch.tensor([local_size], device="cuda"))
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = [torch.empty((max_size,), dtype=torch.uint8, device="cuda") for _ in size_list]
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    tdist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        # NOTE(security): pickle.loads executes arbitrary code; this is only
        # acceptable because the bytes come from peer ranks of the same job.
        data_list.append(pickle.loads(buffer))

    return data_list


def reduce_dict(input_dict, average=True) -> Dict[str, torch.Tensor]:
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        # sort the keys so that they are consistent across processes
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names], dim=0)
        tdist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict


class MetricLogger(object):
    """Collection of named ``SmoothedValue`` meters with periodic printing."""

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Push one value into each named meter; tensors are converted via ``.item()``."""
        for k, v in kwargs.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # Go through __dict__ so a missing `meters` (e.g. on an instance whose
        # __init__ has not run) raises AttributeError instead of recursing.
        meters = self.__dict__.get('meters')
        if meters is not None and attr in meters:
            return meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield items of ``iterable``, printing progress every ``print_freq`` steps.

        ``iterable`` must support ``len()``. A summary line with the total
        time is printed once the iterable is exhausted.
        """
        i = 0
        if not header:
            header = ''
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        total_iters = len(iterable)
        space_fmt = ':' + str(len(str(total_iters))) + 'd'
        use_cuda = torch.cuda.is_available()

        fields = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if use_cuda:
            fields.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(fields)

        MB = 1024.0 * 1024.0
        for obj in iterable:
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == total_iters - 1:
                eta_seconds = iter_time.global_avg * (total_iters - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                extra = {}
                if use_cuda:
                    extra['memory'] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(
                    i, total_iters, eta=eta_string,
                    meters=str(self),
                    time=str(iter_time), data=str(data_time),
                    **extra))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(..., 1) keeps the summary from dividing by zero on an empty iterable.
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(total_iters, 1)))
# ---- rtdetrv2_pytorch/src/misc/profiler_utils.py ----

def stats(
    model: nn.Module,
    data: Tensor=None,
    input_shape: List=[1, 3, 640, 640],
    device: str='cpu',
    verbose=False) -> dict:
    """Count trainable parameters and estimate FLOPs with ``torch.profiler``.

    Args:
        model: module to profile; it is moved to ``device`` and run in eval
            mode, then put back in train mode if it was training.
        data: optional input tensor; when None a random tensor of
            ``input_shape`` is created on ``device``.
        input_shape: shape of the random input (only read, never mutated).
        device: device used for the model and the random input.
        verbose: print the parameter and FLOP counts.

    Returns:
        dict with ``n_parameters`` (int), ``n_flops`` (float, summed from the
        last column of the profiler table and divided by the number of active
        profiling steps) and ``info`` (the profiler table as text).
    """
    was_training = model.training
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    model.eval()
    model = model.to(device)

    if data is None:
        data = torch.rand(*input_shape, device=device)

    # Only ask for CUDA traces when CUDA exists, and sort by a column that is
    # actually populated.
    use_cuda = torch.cuda.is_available()
    activities = [torch.profiler.ProfilerActivity.CPU]
    if use_cuda:
        activities.append(torch.profiler.ProfilerActivity.CUDA)
    sort_key = "self_cuda_time_total" if use_cuda else "self_cpu_time_total"

    num_active = 2
    try:
        with torch.profiler.profile(
            activities=activities,
            schedule=torch.profiler.schedule(
                wait=1,
                warmup=1,
                active=num_active,
                repeat=1
            ),
            # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
            # with_modules=True,
            with_flops=True,
        ) as p:
            for _ in range(5):
                _ = model(data)
                p.step()
    finally:
        # Restore the caller's mode even if the forward pass raised.
        if was_training:
            model.train()

    info = p.key_averages().table(sort_by=sort_key, row_limit=-1)
    # Every number that ends a table row belongs to the last column, which is
    # the FLOPs column when `with_flops=True`.
    # NOTE(review): the unit of that column is chosen by the profiler; the
    # "M" in the verbose output below presumes MFLOPs - confirm.
    num_flops = sum(float(v.strip()) for v in re.findall(r'(\d+.?\d+ *\n)', info)) / num_active

    if verbose:
        # print(info)
        print(f'Total number of trainable parameters: {num_params}')
        print(f'Total number of flops: {int(num_flops)}M with {list(data.shape)}')

    return {'n_parameters': num_params, 'n_flops': num_flops, 'info': info}
# ---- rtdetrv2_pytorch/src/misc/visualizer.py ----

__all__ = ['show_sample']


def show_sample(sample):
    """Display one ``(image, target)`` sample with its boxes drawn in yellow.

    for coco dataset/dataloader

    Args:
        sample: ``(image, target)`` pair; ``image`` is a PIL image or an image
            tensor, ``target["boxes"]`` holds the boxes to draw.
    """
    import matplotlib.pyplot as plt
    from PIL import Image as PILImage
    from torchvision.transforms.v2 import functional as F
    from torchvision.utils import draw_bounding_boxes

    image, target = sample
    if isinstance(image, PILImage.Image):
        # `to_image_tensor` is the beta name; later torchvision releases call it `to_image`.
        to_image = getattr(F, 'to_image_tensor', None) or F.to_image
        image = to_image(image)

    # Same story for `convert_dtype` -> `to_dtype(..., scale=True)`.
    if hasattr(F, 'convert_dtype'):
        image = F.convert_dtype(image, torch.uint8)
    else:
        image = F.to_dtype(image, torch.uint8, scale=True)
    annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3)

    fig, ax = plt.subplots()
    ax.imshow(annotated_image.permute(1, 2, 0).numpy())
    ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
    fig.tight_layout()
    # plt.show() already displays the figure; the extra fig.show() was redundant.
    plt.show()
# ---- rtdetrv2_pytorch/src/nn/arch/classification.py ----

__all__ = ['Classification', 'ClassHead']


@register()
class Classification(torch.nn.Module):
    """Backbone followed by an optional head."""
    __inject__ = ['backbone', 'head']

    def __init__(self, backbone: nn.Module, head: nn.Module=None):
        super().__init__()

        self.backbone = backbone
        self.head = head

    def forward(self, x):
        """Return the head output, or the raw backbone output when there is no head."""
        features = self.backbone(x)
        if self.head is None:
            return features
        return self.head(features)


@register()
class ClassHead(nn.Module):
    """Global average pooling followed by a linear projection."""

    def __init__(self, hidden_dim, num_classes):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.proj = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # A list/tuple input is reduced to its first element.
        feature = x[0] if isinstance(x, (list, tuple)) else x
        pooled = self.pool(feature)
        flat = pooled.reshape(pooled.shape[0], -1)
        return self.proj(flat)


# ---- rtdetrv2_pytorch/src/nn/arch/yolo.py ----

__all__ = ['YOLO', ]


@register()
class YOLO(torch.nn.Module):
    """Backbone -> neck -> head pipeline."""
    __inject__ = ['backbone', 'neck', 'head', ]

    def __init__(self, backbone: torch.nn.Module, neck, head):
        super().__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    def forward(self, x, **kwargs):
        return self.head(self.neck(self.backbone(x)))

    def deploy(self, ):
        """Switch to eval mode and call ``deploy()`` on every submodule that defines it."""
        self.eval()
        for module in self.modules():
            if module is self or not hasattr(module, 'deploy'):
                continue
            module.deploy()
        return self
# ---- rtdetrv2_pytorch/src/nn/backbone/common.py ----

class FrozenBatchNorm2d(nn.Module):
    """copy and modified from https://github.com/facebookresearch/detr/blob/master/models/backbone.py
    BatchNorm2d where the batch statistics and the affine parameters are fixed.
    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """
    def __init__(self, num_features, eps=1e-5):
        super(FrozenBatchNorm2d, self).__init__()
        n = num_features
        # Buffers (not parameters): saved in the state dict but never trained.
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))
        self.eps = eps
        self.num_features = n

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        # A regular BatchNorm checkpoint carries `num_batches_tracked`, which
        # this module has no buffer for; drop it so strict loading succeeds.
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        scale = w * (rv + self.eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias

    def extra_repr(self):
        return (
            "{num_features}, eps={eps}".format(**self.__dict__)
        )


def freeze_batch_norm2d(module: nn.Module) -> nn.Module:
    """Recursively replace every ``nn.BatchNorm2d`` with ``FrozenBatchNorm2d``.

    The replacement keeps the original layer's eps and, where present, its
    affine parameters and running statistics (previously they were reset to
    the identity defaults). Children are replaced in place; the returned
    module is the frozen replacement when ``module`` itself is a BatchNorm2d.
    """
    if isinstance(module, nn.BatchNorm2d):
        frozen = FrozenBatchNorm2d(module.num_features, eps=module.eps)
        for name in ('weight', 'bias', 'running_mean', 'running_var'):
            source = getattr(module, name, None)
            # None when affine=False / track_running_stats=False: keep defaults.
            if source is not None:
                # Re-registering keeps the source tensor's device and dtype.
                frozen.register_buffer(name, source.detach().clone())
        return frozen

    for name, child in module.named_children():
        _child = freeze_batch_norm2d(child)
        if _child is not child:
            setattr(module, name, _child)
    return module


def get_activation(act: str, inplace: bool=True):
    """Build an activation module from its name.

    Args:
        act: None (-> ``nn.Identity``), an ``nn.Module`` (returned as is) or
            one of 'silu', 'swish', 'relu', 'leaky_relu', 'gelu',
            'hardsigmoid' (case-insensitive).
        inplace: assigned to the module's ``inplace`` attribute when it has one.

    Raises:
        RuntimeError: for an unknown activation name.
    """
    if act is None:
        return nn.Identity()

    elif isinstance(act, nn.Module):
        return act

    act = act.lower()

    factories = {
        'silu': nn.SiLU,
        'swish': nn.SiLU,
        'relu': nn.ReLU,
        'leaky_relu': nn.LeakyReLU,
        'gelu': nn.GELU,
        'hardsigmoid': nn.Hardsigmoid,
    }
    if act not in factories:
        raise RuntimeError(f"unsupported activation '{act}', expected one of {sorted(factories)}")

    m = factories[act]()

    if hasattr(m, 'inplace'):
        m.inplace = inplace

    return m


# ---- rtdetrv2_pytorch/src/nn/backbone/csp_darknet.py ----

def autopad(k, p=None):
    """Padding that keeps the spatial size for kernel ``k`` unless ``p`` is given."""
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p


def make_divisible(c, d):
    """Round ``c`` up to the next multiple of ``d``."""
    return math.ceil(c / d) * d


class Conv(nn.Module):
    """Conv2d (no bias) -> BatchNorm2d -> activation."""

    def __init__(self, cin, cout, k=1, s=1, p=None, g=1, act='silu') -> None:
        super().__init__()
        self.conv = nn.Conv2d(cin, cout, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(cout)
        self.act = get_activation(act, inplace=True)

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))


class Bottleneck(nn.Module):
    # Standard bottleneck
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, act='silu'):
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1, act=act)
        self.cv2 = Conv(c_, c2, 3, 1, g=g, act=act)
        # The residual is only used when input and output channels match.
        self.add = shortcut and c1 == c2

    def forward(self, x):
        y = self.cv2(self.cv1(x))
        return x + y if self.add else y


class C3(nn.Module):
    # CSP Bottleneck with 3 convolutions
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, act='silu'):  # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1, act=act)
        self.cv2 = Conv(c1, c_, 1, 1, act=act)
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0, act=act) for _ in range(n)))
        self.cv3 = Conv(2 * c_, c2, 1, act=act)

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))


class SPPF(nn.Module):
    # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher
    def __init__(self, c1, c2, k=5, act='silu'):  # equivalent to SPP(k=(5, 9, 13))
        super().__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1, act=act)
        self.cv2 = Conv(c_ * 4, c2, 1, 1, act=act)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        x = self.cv1(x)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
            y1 = self.m(x)
            y2 = self.m(y1)
            return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1))


@register()
class CSPDarkNet(nn.Module):
    """CSP-DarkNet backbone: stem, four (stride-2 Conv + C3) stages, then SPPF.

    ``forward`` returns the outputs of the layers selected by ``return_idx``
    (indices into ``self.layers``).
    """
    __share__ = ['depth_multi', 'width_multi']

    def __init__(self, in_channels=3, width_multi=1.0, depth_multi=1.0, return_idx=[2, 3, -1], act='silu', ) -> None:
        super().__init__()

        channels = [64, 128, 256, 512, 1024]
        channels = [make_divisible(c * width_multi, 8) for c in channels]

        depths = [3, 6, 9, 3]
        depths = [max(round(d * depth_multi), 1) for d in depths]

        self.layers = nn.ModuleList([Conv(in_channels, channels[0], 6, 2, 2, act=act)])
        for i, (c, d) in enumerate(zip(channels, depths), 1):
            layer = nn.Sequential(*[Conv(c, channels[i], 3, 2, act=act), C3(channels[i], channels[i], n=d, act=act)])
            self.layers.append(layer)

        self.layers.append(SPPF(channels[-1], channels[-1], k=5, act=act))

        # Copy so the shared default list is never exposed through the instance.
        self.return_idx = list(return_idx)
        self.out_channels = [channels[i] for i in self.return_idx]
        self.strides = [[2, 4, 8, 16, 32][i] for i in self.return_idx]
        self.depths = depths
        self.act = act

    def forward(self, x):
        outputs = []
        for m in self.layers:
            x = m(x)
            outputs.append(x)

        return [outputs[i] for i in self.return_idx]


@register()
class CSPPAN(nn.Module):
    """
    P5 ---> 1x1  ---------------------------------> concat --> c3 --> det
             | up                                     | conv /2
    P4 ---> concat ---> c3 ---> 1x1  -->  concat ---> c3 -----------> det
                                 | up       | conv /2
    P3 -----------------------> concat ---> c3 ---------------------> det
    """
    __share__ = ['depth_multi', ]

    def __init__(self, in_channels=[256, 512, 1024], depth_multi=1., act='silu') -> None:
        super().__init__()
        depth = max(round(3 * depth_multi), 1)

        # Copy so the shared default list is never exposed through the instance.
        self.out_channels = list(in_channels)
        top_down = in_channels[::-1]
        self.fpn_stems = nn.ModuleList([Conv(cin, cout, 1, 1, act=act) for cin, cout in zip(top_down, top_down[1:])])
        self.fpn_csps = nn.ModuleList([C3(cin, cout, depth, False, act=act) for cin, cout in zip(top_down, top_down[1:])])

        self.pan_stems = nn.ModuleList([Conv(c, c, 3, 2, act=act) for c in in_channels[:-1]])
        self.pan_csps = nn.ModuleList([C3(c, c, depth, False, act=act) for c in in_channels[1:]])

    def forward(self, feats):
        # Top-down pass: start from the last feature map and upsample x2 each step.
        fpn_feats = []
        for i, feat in enumerate(feats[::-1]):
            if i == 0:
                feat = self.fpn_stems[i](feat)
                fpn_feats.append(feat)
            else:
                _feat = F.interpolate(fpn_feats[-1], scale_factor=2, mode='nearest')
                feat = torch.concat([_feat, feat], dim=1)
                feat = self.fpn_csps[i-1](feat)
                if i < len(self.fpn_stems):
                    feat = self.fpn_stems[i](feat)
                fpn_feats.append(feat)

        # Bottom-up pass: walk the top-down results in reverse, downsampling with stride-2 convs.
        pan_feats = []
        for i, feat in enumerate(fpn_feats[::-1]):
            if i == 0:
                pan_feats.append(feat)
            else:
                _feat = self.pan_stems[i-1](pan_feats[-1])
                feat = torch.concat([_feat, feat], dim=1)
                feat = self.pan_csps[i-1](feat)
                pan_feats.append(feat)

        return pan_feats


if __name__ == '__main__':

    data = torch.rand(1, 3, 320, 640)

    width_multi = 0.75
    depth_multi = 0.33

    m = CSPDarkNet(3, width_multi=width_multi, depth_multi=depth_multi, act='silu')
    outputs = m(data)
    print([o.shape for o in outputs])

    m = CSPPAN(in_channels=m.out_channels, depth_multi=depth_multi, act='silu')
    outputs = m(outputs)
    print([o.shape for o in outputs])
class RepVggBlock(nn.Module):
    """RepVGG-style block: parallel 3x3 and 1x1 conv+BN branches, summed.

    In training form the output is ``act(conv1(x) + alpha * conv2(x))``
    (``alpha`` is omitted when disabled). ``convert_to_deploy`` folds both
    branches and their BatchNorm statistics into a single 3x3 convolution
    stored as ``self.conv``; once that attribute exists ``forward`` uses it
    instead of the two branches.

    Args:
        ch_in: number of input channels.
        ch_out: number of output channels.
        act: activation spec passed to ``get_activation``.
        alpha: when true, a learnable scalar scales the 1x1 branch.
    """

    def __init__(self, ch_in, ch_out, act='relu', alpha: bool = False):
        super().__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=None)
        self.conv2 = ConvBNLayer(
            ch_in, ch_out, 1, stride=1, padding=0, act=None)
        self.act = get_activation(act)

        if alpha:
            self.alpha = nn.Parameter(torch.ones(1, ))
        else:
            self.alpha = None

    def forward(self, x):
        if hasattr(self, 'conv'):
            # Deploy form: a single fused 3x3 convolution.
            y = self.conv(x)
        elif self.alpha is not None:
            # Test for presence, not truthiness: a learned alpha of exactly 0
            # must still scale the 1x1 branch rather than fall through to the
            # unscaled sum.
            y = self.conv1(x) + self.alpha * self.conv2(x)
        else:
            y = self.conv1(x) + self.conv2(x)
        return self.act(y)

    def convert_to_deploy(self):
        """Create (if needed) and fill ``self.conv`` with the fused weights."""
        if not hasattr(self, 'conv'):
            self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1)

        kernel, bias = self.get_equivalent_kernel_bias()
        # Detach so the fused parameters do not carry the autograd history of
        # the branch weights / alpha.
        self.conv.weight.data = kernel.detach()
        self.conv.bias.data = bias.detach()

    def get_equivalent_kernel_bias(self):
        """Return ``(kernel, bias)`` of the 3x3 conv equivalent to both branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        kernel1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)

        if self.alpha is not None:
            return (kernel3x3 + self.alpha * kernel1x1,
                    bias3x3 + self.alpha * bias1x1)
        return kernel3x3 + kernel1x1, bias3x3 + bias1x1

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3; non-tensor input (None / 0) yields 0."""
        if not torch.is_tensor(kernel1x1):
            return 0
        return F.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch: "ConvBNLayer"):
        """Fold a branch's BatchNorm running statistics into its conv weights.

        Returns ``(kernel, bias)`` such that a biased convolution with them
        equals ``bn(conv(x))`` computed with the running statistics.
        ``(0, 0)`` is returned for a missing branch.
        """
        if branch is None:
            return 0, 0
        # ConvBNLayer in this file names its BatchNorm `bn`; `norm` is accepted
        # as a fallback for layers that use that attribute name instead.
        bn = branch.bn if hasattr(branch, 'bn') else branch.norm
        kernel = branch.conv.weight
        std = (bn.running_var + bn.eps).sqrt()
        scale = bn.weight / std
        return kernel * scale.reshape(-1, 1, 1, 1), bn.bias - bn.running_mean * scale
class CSPResStage(nn.Module):
    """One cross-stage-partial stage built from ``ConvBNLayer`` pieces.

    Data flow: optional stride-2 3x3 downsample -> two parallel 1x1 convs
    that each emit ``ch_mid // 2`` channels -> one of them runs through ``n``
    blocks -> both are concatenated on dim 1 -> optional
    ``EffectiveSELayer`` -> 1x1 conv to ``ch_out``.

    Args:
        block_fn: callable building one block; invoked as
            ``block_fn(c, c, act=..., shortcut=True, use_alpha=...)``.
        ch_in: input channels.
        ch_out: output channels.
        n: number of blocks in the deep branch.
        stride: the downsample conv is created only when this equals 2.
        act: activation spec forwarded to every ``ConvBNLayer`` / block.
        attn: only its truthiness is used; any truthy value enables the
            ``EffectiveSELayer``.
        use_alpha: forwarded to ``block_fn``.
    """

    def __init__(self,
                 block_fn,
                 ch_in,
                 ch_out,
                 n,
                 stride,
                 act='relu',
                 attn='eca',
                 use_alpha=False):
        super().__init__()
        ch_mid = (ch_in + ch_out) // 2
        half = ch_mid // 2

        self.conv_down = (
            ConvBNLayer(ch_in, ch_mid, 3, stride=2, padding=1, act=act)
            if stride == 2 else None)
        self.conv1 = ConvBNLayer(ch_mid, half, 1, act=act)
        self.conv2 = ConvBNLayer(ch_mid, half, 1, act=act)

        stacked = []
        for _ in range(n):
            stacked.append(
                block_fn(half, half, act=act, shortcut=True, use_alpha=use_alpha))
        self.blocks = nn.Sequential(*stacked)

        self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') if attn else None
        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)

    def forward(self, x):
        if self.conv_down is not None:
            x = self.conv_down(x)

        shallow = self.conv1(x)
        deep = self.blocks(self.conv2(x))
        merged = torch.concat([shallow, deep], dim=1)

        if self.attn is not None:
            merged = self.attn(merged)
        return self.conv3(merged)
act = get_activation(act) + + if use_large_stem: + self.stem = nn.Sequential(OrderedDict([ + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0] // 2, + 3, + stride=1, + padding=1, + act=act)), ('conv3', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))])) + else: + self.stem = nn.Sequential(OrderedDict([ + ('conv1', ConvBNLayer( + 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), + ('conv2', ConvBNLayer( + channels[0] // 2, + channels[0], + 3, + stride=1, + padding=1, + act=act))])) + + n = len(channels) - 1 + self.stages = nn.Sequential(OrderedDict([(str(i), CSPResStage( + BasicBlock, + channels[i], + channels[i + 1], + layers[i], + 2, + act=act, + use_alpha=use_alpha)) for i in range(n)])) + + self._out_channels = channels[1:] + self._out_strides = [4 * 2**i for i in range(n)] + self.return_idx = return_idx + + if pretrained: + if isinstance(pretrained, bool) or 'http' in pretrained: + state = torch.hub.load_state_dict_from_url(donwload_url[name], map_location='cpu') + else: + state = torch.load(pretrained, map_location='cpu') + self.load_state_dict(state) + print(f'Load CSPResNet_{name} state_dict') + + def forward(self, x): + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + + return outs diff --git a/rtdetrv2_pytorch/src/nn/backbone/hgnetv2.py b/rtdetrv2_pytorch/src/nn/backbone/hgnetv2.py new file mode 100644 index 0000000..31cabbb --- /dev/null +++ b/rtdetrv2_pytorch/src/nn/backbone/hgnetv2.py @@ -0,0 +1,428 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
class LearnableAffineBlock(nn.Module):
    """Element-wise affine transform with one learnable scale and one bias.

    Both parameters are single-element tensors, so they broadcast over the
    whole input: ``forward(x) = scale * x + bias``.

    Args:
        scale_value: initial value of the ``scale`` parameter.
        bias_value: initial value of the ``bias`` parameter.
    """

    def __init__(self, scale_value=1.0, bias_value=0.0):
        super().__init__()
        initial_scale = torch.tensor([scale_value])
        initial_bias = torch.tensor([bias_value])
        self.scale = nn.Parameter(initial_scale)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x: Tensor) -> Tensor:
        scaled = torch.mul(self.scale, x)
        return torch.add(scaled, self.bias)
class HG_Block(nn.Module):
    """Chain of conv layers whose outputs are all concatenated and fused.

    ``layer_num`` layers run in sequence; the block input and every layer
    output are concatenated on dim 1 (``in_channels + layer_num *
    mid_channels`` channels in total), then reduced by a 1x1 "squeeze" conv
    to ``out_channels // 2`` and expanded by a 1x1 "excitation" conv to
    ``out_channels``.

    Args:
        in_channels: channels of the block input.
        mid_channels: output channels of every inner layer.
        out_channels: channels of the block output.
        kernel_size: kernel size of the inner layers.
        layer_num: number of inner layers.
        identity: when true the block input is added to the output (the
            caller must ensure matching shapes).
        light_block: choose ``LightConvBNAct`` (true) or ``ConvBNAct`` (false)
            for the inner layers.
        use_lab: forwarded to every conv layer.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 layer_num=6,
                 identity=False,
                 light_block=True,
                 use_lab=False):
        super().__init__()
        self.identity = identity

        # Select the layer class directly instead of eval()-ing its name:
        # same resolution, no dynamic code evaluation.
        block_cls = LightConvBNAct if light_block else ConvBNAct

        self.layers = nn.ModuleList()
        for i in range(layer_num):
            self.layers.append(
                block_cls(
                    # Only the first layer sees the block input width.
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab))

        # feature aggregation
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab)
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab)

    def forward(self, x):
        identity = x
        # Collect the input plus every intermediate output for aggregation.
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = torch.concat(output, dim=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x = x + identity
        return x
+ """ + + arch_configs = { + 'L': { + 'stem_channels': [3, 32, 48], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + }, + 'url': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/PPHGNetV2_L_ssld_pretrained_from_paddle.pth', + + }, + 'X': { + 'stem_channels': [3, 32, 64], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + }, + 'url': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/PPHGNetV2_X_ssld_pretrained_from_paddle.pth', + + }, + 'H': { + 'stem_channels': [3, 48, 96], + 'stage_config': { + # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num + "stage1": [96, 96, 192, 2, False, False, 3, 6], + "stage2": [192, 192, 512, 3, True, False, 3, 6], + "stage3": [512, 384, 1024, 6, True, True, 5, 6], + "stage4": [1024, 768, 2048, 3, True, True, 5, 6], + }, + 'url': 'https://github.com/lyuwenyu/storage/releases/download/v0.1/PPHGNetV2_H_ssld_pretrained_from_paddle.pth', + } + } + + def __init__(self, + name, + use_lab=False, + return_idx=[1, 2, 3], + freeze_at=-1, + freeze_norm=False, + pretrained=False): + super().__init__() + self.use_lab = use_lab + self.return_idx = return_idx + + stem_channels = self.arch_configs[name]['stem_channels'] + stage_config = self.arch_configs[name]['stage_config'] + download_url = self.arch_configs[name]['url'] + + self._out_strides = [4, 8, 16, 32] + self._out_channels = [stage_config[k][2] for k in stage_config] + + 
# stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab + ) + + # stages + self.stages = nn.ModuleList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HG_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample, + light_block, + kernel_size, + use_lab)) + + self._init_weights() + + if freeze_at >= 0: + self._freeze_parameters(self.stem) + for i in range(min(freeze_at, 4)): + self._freeze_parameters(self.stages[i]) + + if freeze_norm: + self._freeze_norm(self) + + if pretrained: + if isinstance(pretrained, bool) or 'http' in pretrained: + state = torch.hub.load_state_dict_from_url(download_url, map_location='cpu') + else: + state = torch.load(pretrained, map_location='cpu') + self.load_state_dict(state) + print(f'Load HGNetv2_{name} state_dict') + + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2d)): + init.constant_(m.weight, 1) + init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + init.constant_(m.bias, 0) + + def _freeze_parameters(self, m: nn.Module): + for p in m.parameters(): + p.requires_grad = False + + def _freeze_norm(self, m: nn.Module): + if isinstance(m, nn.BatchNorm2d): + m = FrozenBatchNorm2d(m.num_features) + else: + for name, child in m.named_children(): + _child = self._freeze_norm(child) + if _child is not child: + setattr(m, name, _child) + return m + + + def forward(self, x: Tensor) -> List[Tensor]: + x = self.stem(x) + outs = [] + for idx, stage in enumerate(self.stages): + x = stage(x) + if idx in self.return_idx: + outs.append(x) + return outs + + + +if __name__ == '__main__': + + m = HGNetv2(name='X', pretrained=False, freeze_at=-1, freeze_norm=False) + data = 
class ConvNormLayer(nn.Module):
    """``Conv2d`` -> ``BatchNorm2d`` -> activation.

    Args:
        ch_in: input channels.
        ch_out: output channels.
        kernel_size: convolution kernel size (an int; it is used in
            arithmetic when ``padding`` is not given).
        stride: convolution stride.
        padding: explicit padding; ``None`` selects ``(kernel_size - 1) // 2``.
        bias: whether the convolution has a bias term.
        act: activation spec resolved through ``get_activation``.
    """

    def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None):
        super().__init__()
        if padding is None:
            pad = (kernel_size - 1) // 2
        else:
            pad = padding
        self.conv = nn.Conv2d(ch_in, ch_out, kernel_size, stride, padding=pad, bias=bias)
        self.norm = nn.BatchNorm2d(ch_out)
        self.act = get_activation(act)

    def forward(self, x):
        features = self.conv(x)
        normalized = self.norm(features)
        return self.act(normalized)
class BottleNeck(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convs plus a shortcut.

    The main branch outputs ``ch_out * expansion`` channels. Variant ``'a'``
    puts the stride on the first 1x1 conv; every other variant puts it on the
    3x3 conv. When ``shortcut`` is false a projection is built for the skip
    path: for variant ``'d'`` with ``stride == 2`` it is a 2x2 average pool
    followed by a stride-1 1x1 ``ConvNormLayer``, otherwise a single 1x1
    ``ConvNormLayer`` carrying the stride.

    Args:
        ch_in: input channels.
        ch_out: bottleneck width; output has ``ch_out * expansion`` channels.
        stride: spatial stride of the block.
        shortcut: true to use the input itself as the skip path.
        act: activation spec; ``None`` makes the final activation an identity.
        variant: architecture variant selector (``'a'``, ``'d'``, or other).
    """
    expansion = 4

    def __init__(self, ch_in, ch_out, stride, shortcut, act='relu', variant='b'):
        super().__init__()

        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)
        width = ch_out
        expanded = ch_out * self.expansion

        self.branch2a = ConvNormLayer(ch_in, width, 1, stride1, act=act)
        self.branch2b = ConvNormLayer(width, width, 3, stride2, act=act)
        self.branch2c = ConvNormLayer(width, expanded, 1, 1)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                pooled_projection = OrderedDict()
                pooled_projection['pool'] = nn.AvgPool2d(2, 2, 0, ceil_mode=True)
                pooled_projection['conv'] = ConvNormLayer(ch_in, expanded, 1, 1)
                self.short = nn.Sequential(pooled_projection)
            else:
                self.short = ConvNormLayer(ch_in, expanded, 1, stride)

        if act is None:
            self.act = nn.Identity()
        else:
            self.act = get_activation(act)

    def forward(self, x):
        main = self.branch2c(self.branch2b(self.branch2a(x)))
        skip = x if self.shortcut else self.short(x)
        return self.act(main + skip)
ch_in = ch_out * block.expansion + + def forward(self, x): + out = x + for block in self.blocks: + out = block(out) + return out + + +@register() +class PResNet(nn.Module): + def __init__( + self, + depth, + variant='d', + num_stages=4, + return_idx=[0, 1, 2, 3], + act='relu', + freeze_at=-1, + freeze_norm=True, + pretrained=False): + super().__init__() + + block_nums = ResNet_cfg[depth] + ch_in = 64 + if variant in ['c', 'd']: + conv_def = [ + [3, ch_in // 2, 3, 2, "conv1_1"], + [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"], + [ch_in // 2, ch_in, 3, 1, "conv1_3"], + ] + else: + conv_def = [[3, ch_in, 7, 2, "conv1_1"]] + + self.conv1 = nn.Sequential(OrderedDict([ + (name, ConvNormLayer(cin, cout, k, s, act=act)) for cin, cout, k, s, name in conv_def + ])) + + ch_out_list = [64, 128, 256, 512] + block = BottleNeck if depth >= 50 else BasicBlock + + _out_channels = [block.expansion * v for v in ch_out_list] + _out_strides = [4, 8, 16, 32] + + self.res_layers = nn.ModuleList() + for i in range(num_stages): + stage_num = i + 2 + self.res_layers.append( + Blocks(block, ch_in, ch_out_list[i], block_nums[i], stage_num, act=act, variant=variant) + ) + ch_in = _out_channels[i] + + self.return_idx = return_idx + self.out_channels = [_out_channels[_i] for _i in return_idx] + self.out_strides = [_out_strides[_i] for _i in return_idx] + + if freeze_at >= 0: + self._freeze_parameters(self.conv1) + for i in range(min(freeze_at, num_stages)): + self._freeze_parameters(self.res_layers[i]) + + if freeze_norm: + self._freeze_norm(self) + + if pretrained: + if isinstance(pretrained, bool) or 'http' in pretrained: + state = torch.hub.load_state_dict_from_url(donwload_url[depth], map_location='cpu') + else: + state = torch.load(pretrained, map_location='cpu') + self.load_state_dict(state) + print(f'Load PResNet{depth} state_dict') + + def _freeze_parameters(self, m: nn.Module): + for p in m.parameters(): + p.requires_grad = False + + def _freeze_norm(self, m: nn.Module): + if isinstance(m, 
class BasicBlock(nn.Module):
    """Two 3x3 conv+BN layers with a residual connection and ReLU.

    The skip path is an empty ``nn.Sequential`` (identity) unless the stride
    is not 1 or the channel count changes, in which case it is a 1x1
    convolution with the same stride followed by BatchNorm.

    Args:
        in_planes: input channels.
        planes: base output channels (multiplied by ``expansion``).
        stride: stride of the first convolution and of the projection.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        projection = []
        if stride != 1 or in_planes != out_planes:
            # Shapes differ between input and main branch: project the input.
            projection.append(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False))
            projection.append(nn.BatchNorm2d(out_planes))
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))
        return F.relu(residual + self.shortcut(x))
self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + + self.linear = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +@register() +class MResNet(nn.Module): + def __init__(self, num_classes=10, num_blocks=[2, 2, 2, 2]) -> None: + super().__init__() + self.model = _ResNet(BasicBlock, num_blocks, num_classes) + + def forward(self, x): + return self.model(x) + diff --git a/rtdetrv2_pytorch/src/nn/backbone/timm_model.py b/rtdetrv2_pytorch/src/nn/backbone/timm_model.py new file mode 100644 index 0000000..7fa19c0 --- /dev/null +++ b/rtdetrv2_pytorch/src/nn/backbone/timm_model.py @@ -0,0 +1,70 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+ +https://towardsdatascience.com/getting-started-with-pytorch-image-models-timm-a-practitioners-guide-4e77b4bf9055#0583 +""" + +import torch +from torchvision.models.feature_extraction import get_graph_node_names, create_feature_extractor + +from .utils import IntermediateLayerGetter +from ...core import register + + +@register() +class TimmModel(torch.nn.Module): + def __init__(self, \ + name, + return_layers, + pretrained=False, + exportable=True, + features_only=True, + **kwargs) -> None: + + super().__init__() + + import timm + model = timm.create_model( + name, + pretrained=pretrained, + exportable=exportable, + features_only=features_only, + **kwargs + ) + # nodes, _ = get_graph_node_names(model) + # print(nodes) + # features = {'': ''} + # model = create_feature_extractor(model, return_nodes=features) + + assert set(return_layers).issubset(model.feature_info.module_name()), \ + f'return_layers should be a subset of {model.feature_info.module_name()}' + + # self.model = model + self.model = IntermediateLayerGetter(model, return_layers) + + return_idx = [model.feature_info.module_name().index(name) for name in return_layers] + self.strides = [model.feature_info.reduction()[i] for i in return_idx] + self.channels = [model.feature_info.channels()[i] for i in return_idx] + self.return_idx = return_idx + self.return_layers = return_layers + + def forward(self, x: torch.Tensor): + outputs = self.model(x) + # outputs = [outputs[i] for i in self.return_idx] + return outputs + + +if __name__ == '__main__': + + model = TimmModel(name='resnet34', return_layers=['layer2', 'layer3']) + data = torch.rand(1, 3, 640, 640) + outputs = model(data) + + for output in outputs: + print(output.shape) + + """ + model: + type: TimmModel + name: resnet34 + return_layers: ['layer2', 'layer4'] + """ diff --git a/rtdetrv2_pytorch/src/nn/backbone/torchvision_model.py b/rtdetrv2_pytorch/src/nn/backbone/torchvision_model.py new file mode 100644 index 0000000..de3294f --- /dev/null +++ 
class IntermediateLayerGetter(nn.ModuleDict):
    """
    Wrap a model and return the outputs of selected direct children.

    The wrapper keeps the model's direct children in registration order, up
    to and including the last requested one, and runs them one after another
    in ``forward``. It therefore assumes that the children were registered in
    the order they are used and that no child is used twice.

    Only direct children can be requested: ``model.feature1`` works,
    ``model.feature1.layer2`` does not.

    Args:
        model: module whose direct children are wrapped.
        return_layers: names of the children whose outputs are returned.

    Raises:
        ValueError: if a requested name is not a direct child of ``model``.
    """

    _version = 3

    def __init__(self, model: nn.Module, return_layers: List[str]) -> None:
        child_names = [name for name, _ in model.named_children()]
        if not set(return_layers).issubset(child_names):
            raise ValueError("return_layers are not present in model. {}"\
                    .format(child_names))

        # Names still to be reached; children after the last one are dropped.
        pending = {str(k) for k in return_layers}
        kept = OrderedDict()
        for name, module in model.named_children():
            kept[name] = module
            pending.discard(name)
            if not pending:
                break

        super().__init__(kept)
        self.return_layers = return_layers

    def forward(self, x):
        collected = []
        for name, module in self.items():
            x = module(x)
            if name in self.return_layers:
                collected.append(x)

        return collected
# ---- File: rtdetrv2_pytorch/src/nn/criterion/det_criterion.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import torch
import torch.nn.functional as F
import torch.distributed
import torchvision

from ...misc import box_ops
from ...misc import dist_utils
from ...core import register


@register()
class DetCriterion(torch.nn.Module):
    """Default Detection Criterion

    Matches predictions to targets with the injected ``matcher`` and then
    evaluates every loss named in ``losses``, scaling each term by its entry
    in ``weight_dict``.
    """
    __share__ = ['num_classes']
    __inject__ = ['matcher']

    def __init__(self,
                 losses,
                 weight_dict,
                 num_classes=80,
                 alpha=0.75,
                 gamma=2.0,
                 box_fmt='cxcywh',
                 matcher=None):
        """
        Args:
            losses (list[str]): requested losses, support ['boxes', 'giou', 'vfl', 'focal']
            weight_dict (dict[str, float)]: corresponding losses weight, including
                ['loss_bbox', 'loss_giou', 'loss_vfl', 'loss_focal']
            num_classes (int): number of foreground classes; this value is also used
                as the fill index for unmatched queries.
            alpha (float): alpha passed to the focal loss / used in the VFL weight.
            gamma (float): gamma passed to the focal loss / used in the VFL weight.
            box_fmt (str): in box format, 'cxcywh' or 'xyxy'
            matcher (Matcher): matcher used to match source to target
        """
        super().__init__()
        self.losses = losses
        self.weight_dict = weight_dict
        self.alpha = alpha
        self.gamma = gamma
        self.num_classes = num_classes
        self.box_fmt = box_fmt
        assert matcher is not None, 'DetCriterion requires a matcher'
        self.matcher = matcher

    def forward(self, outputs, targets, **kwargs):
        """
        Args:
            outputs: Dict[Tensor], 'pred_boxes', 'pred_logits', 'meta'.
            targets, List[Dict[str, Tensor]], len(targets) == batch_size.
            kwargs, store other information such as current epoch id.
        Return:
            losses, Dict[str, Tensor]; only keys present in ``weight_dict`` are kept,
            each already multiplied by its weight.
        """
        matched = self.matcher(outputs, targets)
        indices = matched['indices']
        num_boxes = self._get_positive_nums(indices)

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes)
            l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict}
            losses.update(l_dict)
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def _get_positive_nums(self, indices):
        # number of positive samples, averaged over processes and floored at 1
        # so it can be used safely as a divisor
        num_pos = sum(len(i) for (i, _) in indices)
        num_pos = torch.as_tensor([num_pos], dtype=torch.float32, device=indices[0][0].device)
        if dist_utils.is_dist_available_and_initialized():
            torch.distributed.all_reduce(num_pos)
        num_pos = torch.clamp(num_pos / dist_utils.get_world_size(), min=1).item()
        return num_pos

    def _build_target_classes(self, src_logits, targets, indices, idx):
        """Per-query class indices: matched label, or ``num_classes`` when unmatched."""
        target_classes_o = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
                                    dtype=torch.int64, device=src_logits.device)
        target_classes[idx] = target_classes_o
        return target_classes

    def _gather_matched_boxes(self, outputs, targets, indices):
        """Return (idx, matched predicted boxes, matched target boxes) in ``box_fmt``."""
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([t['boxes'][j] for t, (_, j) in zip(targets, indices)], dim=0)
        return idx, src_boxes, target_boxes

    def _to_xyxy(self, boxes):
        """Convert boxes from ``self.box_fmt`` to 'xyxy'."""
        return torchvision.ops.box_convert(boxes, in_fmt=self.box_fmt, out_fmt='xyxy')

    def _giou_loss(self, src_boxes, target_boxes, num_boxes):
        """Sum of (1 - GIoU) over matched pairs, divided by ``num_boxes``."""
        loss_giou = 1 - box_ops.elementwise_generalized_box_iou(
            self._to_xyxy(src_boxes), self._to_xyxy(target_boxes))
        return loss_giou.sum() / num_boxes

    def loss_labels_focal(self, outputs, targets, indices, num_boxes):
        """Sigmoid focal classification loss -> ``{'loss_focal': Tensor}``."""
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']

        idx = self._get_src_permutation_idx(indices)
        target_classes = self._build_target_classes(src_logits, targets, indices, idx)

        # one-hot over num_classes + 1, then drop the extra "unmatched" column
        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1].to(src_logits.dtype)
        loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none')
        loss = loss.sum() / num_boxes
        return {'loss_focal': loss}

    def loss_labels_vfl(self, outputs, targets, indices, num_boxes):
        """IoU-weighted classification loss -> ``{'loss_vfl': Tensor}``.

        The target score of a matched query is the IoU between its predicted box
        (detached) and its target box; all other entries are zero.
        """
        assert 'pred_boxes' in outputs
        idx, src_boxes, target_boxes = self._gather_matched_boxes(outputs, targets, indices)
        iou, _ = box_ops.elementwise_box_iou(self._to_xyxy(src_boxes).detach(), self._to_xyxy(target_boxes))

        src_logits: torch.Tensor = outputs['pred_logits']
        target_classes = self._build_target_classes(src_logits, targets, indices, idx)
        target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1]

        target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype)
        target_score_o[idx] = iou.to(src_logits.dtype)
        target_score = target_score_o.unsqueeze(-1) * target

        # negatives are weighted by alpha * p^gamma, positives by their IoU target
        src_score = torch.sigmoid(src_logits.detach())
        weight = self.alpha * src_score.pow(self.gamma) * (1 - target) + target_score

        loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none')
        loss = loss.sum() / num_boxes
        return {'loss_vfl': loss}

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """L1 and GIoU box losses -> ``{'loss_bbox': Tensor, 'loss_giou': Tensor}``."""
        assert 'pred_boxes' in outputs
        _, src_boxes, target_boxes = self._gather_matched_boxes(outputs, targets, indices)

        losses = {}
        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes
        losses['loss_giou'] = self._giou_loss(src_boxes, target_boxes, num_boxes)
        return losses

    def loss_boxes_giou(self, outputs, targets, indices, num_boxes):
        """GIoU box loss only -> ``{'loss_giou': Tensor}``."""
        assert 'pred_boxes' in outputs
        _, src_boxes, target_boxes = self._gather_matched_boxes(outputs, targets, indices)
        return {'loss_giou': self._giou_loss(src_boxes, target_boxes, num_boxes)}

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        """Dispatch ``loss`` (one of 'boxes', 'giou', 'vfl', 'focal') to its method."""
        loss_map = {
            'boxes': self.loss_boxes,
            'giou': self.loss_boxes_giou,
            'vfl': self.loss_labels_vfl,
            'focal': self.loss_labels_focal,
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)


# ---- File: rtdetrv2_pytorch/src/nn/postprocessor/__init__.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""


from .nms_postprocessor import DetNMSPostProcessor

# (next file in this patch: rtdetrv2_pytorch/src/nn/postprocessor/box_revert.py)
# ---- File: rtdetrv2_pytorch/src/nn/postprocessor/box_revert.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import torch
import torchvision
from torch import Tensor
from enum import Enum


class BoxProcessFormat(Enum):
    """Box process format

    Available formats are
    * ``RESIZE``
    * ``RESIZE_KEEP_RATIO``
    * ``RESIZE_KEEP_RATIO_PADDING``
    """
    RESIZE = 1
    RESIZE_KEEP_RATIO = 2
    RESIZE_KEEP_RATIO_PADDING = 3


def box_revert(
    boxes: Tensor,
    orig_sizes: Tensor=None,
    eval_sizes: Tensor=None,
    inpt_sizes: Tensor=None,
    inpt_padding: Tensor=None,
    normalized: bool=True,
    in_fmt: str='cxcywh',
    out_fmt: str='xyxy',
    process_fmt=BoxProcessFormat.RESIZE,
) -> Tensor:
    """Map predicted boxes back towards original-image coordinates.

    The input tensor is never modified; a new tensor is returned.

    Args:
        boxes(Tensor), [N, :, 4], pred boxes laid out according to ``in_fmt``.
        orig_sizes(Tensor), [N, 2], (w, h). origin sizes.
        eval_sizes(Tensor), [N, 2], (w, h). When given and ``normalized`` is True,
            boxes are first multiplied by it.
        inpt_sizes(Tensor), [N, 2], (w, h). input sizes.
        inpt_padding (Tensor), [N, 2], (w_pad, h_pad, ...). Only the first two
            columns are used.
            (inpt_sizes + inpt_padding) == eval_sizes
        normalized(bool), whether ``boxes`` are normalized coordinates.
        in_fmt(str), 'cxcywh' or 'xyxy'.
        out_fmt(str), format passed to ``torchvision.ops.box_convert``.
        process_fmt(BoxProcessFormat), accepted for interface compatibility;
            currently not used by this function.

    Returns:
        Tensor, [N, :, 4], boxes in ``out_fmt``.
    """
    assert in_fmt in ('cxcywh', 'xyxy'), "in_fmt must be 'cxcywh' or 'xyxy'"

    if normalized and eval_sizes is not None:
        boxes = boxes * eval_sizes.repeat(1, 2).unsqueeze(1)

    if inpt_padding is not None:
        offset = inpt_padding[:, :2]
        if in_fmt == 'xyxy':
            # both corners are shifted by the padding
            shift = offset.repeat(1, 2)
        else:
            # 'cxcywh': only the centre moves, width/height are unaffected
            shift = torch.cat([offset, torch.zeros_like(offset)], dim=-1)
        # out-of-place so the caller's tensor is left untouched
        boxes = boxes - shift.unsqueeze(1)

    if orig_sizes is not None:
        orig_sizes = orig_sizes.repeat(1, 2).unsqueeze(1)
        if inpt_sizes is not None:
            inpt_sizes = inpt_sizes.repeat(1, 2).unsqueeze(1)
            boxes = boxes * (orig_sizes / inpt_sizes)
        else:
            boxes = boxes * orig_sizes

    boxes = torchvision.ops.box_convert(boxes, in_fmt=in_fmt, out_fmt=out_fmt)
    return boxes

# (next file in this patch: rtdetrv2_pytorch/src/nn/postprocessor/detr_postprocessor.py)
# ---- File: rtdetrv2_pytorch/src/nn/postprocessor/detr_postprocessor.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision


__all__ = ['DetDETRPostProcessor']

from .box_revert import box_revert
from .box_revert import BoxProcessFormat


def mod(a, b):
    """Remainder of ``a / b`` written with floor-division (kept for TensorRT export)."""
    out = a - a // b * b
    return out


class DetDETRPostProcessor(nn.Module):
    """Turn raw DETR-style outputs into per-image labels, boxes and scores.

    Args:
        num_classes: number of classes used to decode flattened top-k indices.
        use_focal_loss: if True scores are sigmoid(logits) and top-k runs over all
            (query, class) pairs; otherwise softmax is used and the last class
            column is discarded.
        num_top_queries: number of detections kept per image.
        box_process_format: forwarded to ``box_revert`` as ``process_fmt``.
    """

    def __init__(
        self,
        num_classes=80,
        use_focal_loss=True,
        num_top_queries=300,
        box_process_format=BoxProcessFormat.RESIZE,
    ) -> None:
        super().__init__()
        self.use_focal_loss = use_focal_loss
        self.num_top_queries = num_top_queries
        self.num_classes = int(num_classes)
        self.box_process_format = box_process_format
        self.deploy_mode = False

    def extra_repr(self) -> str:
        return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'

    def forward(self, outputs, **kwargs):
        """Decode ``outputs['pred_logits']`` / ``outputs['pred_boxes']``.

        Args:
            outputs: dict with 'pred_logits' and 'pred_boxes'.
            **kwargs: forwarded to ``box_revert`` (e.g. ``orig_sizes``).

        Returns:
            In deploy mode the tuple ``(labels, boxes, scores)``; otherwise a list
            with one ``dict(labels=..., boxes=..., scores=...)`` per image.
        """
        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']

        if self.use_focal_loss:
            scores = torch.sigmoid(logits)
            # top-k over the flattened (query, class) axis; the flat index encodes both
            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
            labels = index % self.num_classes
            # labels = mod(index, self.num_classes) # for tensorrt
            index = index // self.num_classes
            boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))

        else:
            # softmax must run over the class axis; without an explicit dim a 3-D
            # input is normalised over dim 0 instead
            scores = F.softmax(logits, dim=-1)[:, :, :-1]
            scores, labels = scores.max(dim=-1)
            if scores.shape[1] > self.num_top_queries:
                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
                labels = torch.gather(labels, dim=1, index=index)
                boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))

        # ``kwargs`` is always a dict, so the boxes are always passed through
        # box_revert (which at minimum converts cxcywh -> xyxy).
        boxes = box_revert(
            boxes,
            in_fmt='cxcywh',
            out_fmt='xyxy',
            process_fmt=self.box_process_format,
            normalized=True,
            **kwargs,
        )

        # TODO for onnx export
        if self.deploy_mode:
            return labels, boxes, scores

        return [dict(labels=lab, boxes=box, scores=sco)
                for lab, box, sco in zip(labels, boxes, scores)]

    def deploy(self, ):
        """Switch to eval + deploy mode (tuple outputs) and return ``self``."""
        self.eval()
        self.deploy_mode = True
        return self


# ---- File: rtdetrv2_pytorch/src/nn/postprocessor/nms_postprocessor.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import torch
import torch.nn.functional as F
import torch.distributed
import torchvision
from torch import Tensor

from ...core import register

from typing import Dict


__all__ = ['DetNMSPostProcessor', ]


@register()
class DetNMSPostProcessor(torch.nn.Module):
    """Score-threshold + class-aware NMS post-processing.

    Args:
        iou_threshold: IoU threshold passed to ``torchvision.ops.batched_nms``.
        score_threshold: detections with a score not above this value are dropped.
        keep_topk: maximum number of detections kept per image after NMS.
        box_fmt: format of the predicted boxes (lower-cased).
        logit_fmt: name of a ``torch.nn.functional`` function applied to the
            per-query maximum logit; if no such function exists the raw value
            is used as the score.
    """

    def __init__(self, \
                iou_threshold=0.7,
                score_threshold=0.01,
                keep_topk=300,
                box_fmt='cxcywh',
                logit_fmt='sigmoid') -> None:
        super().__init__()
        self.iou_threshold = iou_threshold
        self.score_threshold = score_threshold
        self.keep_topk = keep_topk
        self.box_fmt = box_fmt.lower()
        self.logit_fmt = logit_fmt.lower()
        self.logit_func = getattr(F, self.logit_fmt, None)
        self.deploy_mode = False

    def forward(self, outputs: Dict[str, Tensor], orig_target_sizes: Tensor):
        """Return per-image detections (or raw blobs in deploy mode).

        Args:
            outputs: dict with 'pred_logits' and 'pred_boxes'.
            orig_target_sizes: per-image sizes used to scale the boxes; repeated
                twice along the last axis so it lines up with the 4 box coordinates.
        """
        logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
        pred_boxes = torchvision.ops.box_convert(boxes, in_fmt=self.box_fmt, out_fmt='xyxy')
        pred_boxes *= orig_target_sizes.repeat(1, 2).unsqueeze(1)

        values, pred_labels = torch.max(logits, dim=-1)

        if self.logit_func:
            pred_scores = self.logit_func(values)
        else:
            pred_scores = values

        # TODO for onnx export
        if self.deploy_mode:
            blobs = {
                'pred_labels': pred_labels,
                'pred_boxes': pred_boxes,
                'pred_scores': pred_scores
            }
            return blobs

        results = []
        for i in range(logits.shape[0]):
            score_keep = pred_scores[i] > self.score_threshold
            pred_box = pred_boxes[i][score_keep]
            pred_label = pred_labels[i][score_keep]
            pred_score = pred_scores[i][score_keep]

            keep = torchvision.ops.batched_nms(pred_box, pred_score, pred_label, self.iou_threshold)
            keep = keep[:self.keep_topk]

            blob = {
                'labels': pred_label[keep],
                'boxes': pred_box[keep],
                'scores': pred_score[keep],
            }

            results.append(blob)

        return results

    def deploy(self, ):
        """Switch to eval + deploy mode (dict-of-blobs output) and return ``self``."""
        self.eval()
        self.deploy_mode = True
        return self


# ---- File: rtdetrv2_pytorch/src/optim/__init__.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

from .ema import *
from .optim import *
from .amp import *
from .warmup import *


# ---- File: rtdetrv2_pytorch/src/optim/amp.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""


import torch.cuda.amp as amp

from ..core import register


__all__ = ['GradScaler']

# Expose torch's gradient scaler through the project registry.
GradScaler = register()(amp.grad_scaler.GradScaler)


# ---- File: rtdetrv2_pytorch/src/optim/ema.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""


import torch
import torch.nn as nn

import math
from copy import deepcopy

from ..core import register
from ..misc import dist_utils

__all__ = ['ModelEMA']


@register()
class ModelEMA(object):
    """
    Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
    Keep a moving average of everything in the model state_dict (parameters and buffers).
    This is intended to allow functionality like
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.

    Args:
        model: model to track; a deep copy of the de-parallelised model is kept.
        decay: asymptotic decay factor.
        warmups: time constant of the exponential ramp applied to ``decay``.
    """
    def __init__(self, model: nn.Module, decay: float=0.9999, warmups: int=2000, ):
        super().__init__()

        self.module = deepcopy(dist_utils.de_parallel(model)).eval()
        # if next(model.parameters()).device.type != 'cpu':
        #     self.module.half()  # FP16 EMA

        self.decay = decay
        self.warmups = warmups
        self.updates = 0  # number of EMA updates
        self.decay_fn = lambda x: decay * (1 - math.exp(-x / warmups))  # decay exponential ramp (to help early epochs)

        for p in self.module.parameters():
            p.requires_grad_(False)

    def update(self, model: nn.Module):
        """Blend the current weights of ``model`` into the averaged copy."""
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            d = self.decay_fn(self.updates)
            msd = dist_utils.de_parallel(model).state_dict()
            for k, v in self.module.state_dict().items():
                # only floating-point entries are averaged; others keep their value
                if v.dtype.is_floating_point:
                    v *= d
                    v += (1 - d) * msd[k].detach()

    def to(self, *args, **kwargs):
        """Move the averaged module (same arguments as ``nn.Module.to``)."""
        self.module = self.module.to(*args, **kwargs)
        return self

    def state_dict(self, ):
        return dict(module=self.module.state_dict(), updates=self.updates)

    def load_state_dict(self, state, strict=True):
        self.module.load_state_dict(state['module'], strict=strict)
        if 'updates' in state:
            self.updates = state['updates']

    def forward(self, ):
        """The EMA wrapper is not callable as a model; use ``self.module``."""
        raise RuntimeError('ema...')

    # Backward-compatible alias for the historical misspelling.
    forwad = forward

    def extra_repr(self) -> str:
        return f'decay={self.decay}, warmups={self.warmups}'


class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
    """Maintains moving averages of model parameters using an exponential decay.
    ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``
    ``torch.optim.swa_utils.AveragedModel`` is used to compute the EMA.

    Args:
        model: model to average.
        decay: asymptotic decay factor.
        device: device handed to ``AveragedModel``.
        use_buffers: handed to ``AveragedModel``.
        warmups: time constant of the exponential ramp applied to ``decay``
            (default 2000, the value that used to be hard-coded).
    """
    def __init__(self, model, decay, device="cpu", use_buffers=True, warmups=2000):

        def decay_fn(x):
            return decay * (1 - math.exp(-x / warmups))

        def ema_avg(avg_model_param, model_param, num_averaged):
            d = decay_fn(num_averaged)
            return d * avg_model_param + (1 - d) * model_param

        super().__init__(model, device, ema_avg, use_buffers=use_buffers)
        # set only after nn.Module.__init__ has run
        self.decay_fn = decay_fn


# ---- File: rtdetrv2_pytorch/src/optim/optim.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""


import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

from ..core import register


__all__ = ['AdamW', 'SGD', 'Adam', 'MultiStepLR', 'CosineAnnealingLR', 'OneCycleLR', 'LambdaLR']


# Optimizers exposed through the project registry.
SGD = register()(optim.SGD)
Adam = register()(optim.Adam)
AdamW = register()(optim.AdamW)


# Learning-rate schedulers exposed through the project registry.
MultiStepLR = register()(lr_scheduler.MultiStepLR)
CosineAnnealingLR = register()(lr_scheduler.CosineAnnealingLR)
OneCycleLR = register()(lr_scheduler.OneCycleLR)
LambdaLR = register()(lr_scheduler.LambdaLR)

# (next file in this patch: rtdetrv2_pytorch/src/optim/warmup.py)
# ---- File: rtdetrv2_pytorch/src/optim/warmup.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

from torch.optim.lr_scheduler import LRScheduler

from ..core import register


class Warmup(object):
    """Base class that scales the optimizer learning rates during warm-up.

    The learning rates found on ``lr_scheduler.optimizer`` at construction time
    are remembered as the warm-up end values; one ``step()`` is taken immediately.

    Args:
        lr_scheduler: scheduler whose optimizer's param groups are rewritten.
        warmup_duration: number of steps the warm-up lasts.
        last_step: index of the last step already taken (-1 for a fresh start).
    """

    def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int=-1) -> None:
        self.lr_scheduler = lr_scheduler
        self.warmup_end_values = [group['lr'] for group in lr_scheduler.optimizer.param_groups]
        self.last_step = last_step
        self.warmup_duration = warmup_duration
        self.step()

    def state_dict(self):
        """Return every attribute except the wrapped scheduler."""
        snapshot = dict(self.__dict__)
        snapshot.pop('lr_scheduler', None)
        return snapshot

    def load_state_dict(self, state_dict):
        """Overwrite attributes with the entries of ``state_dict``."""
        self.__dict__.update(state_dict)

    def get_warmup_factor(self, step, **kwargs):
        """Return the multiplier applied to the end values at ``step`` (subclass hook)."""
        raise NotImplementedError

    def step(self, ):
        """Advance one step; while warming up, rewrite each param group's lr."""
        self.last_step += 1
        if self.finished():
            return
        scale = self.get_warmup_factor(self.last_step)
        groups = self.lr_scheduler.optimizer.param_groups
        for position, group in enumerate(groups):
            group['lr'] = scale * self.warmup_end_values[position]

    def finished(self, ):
        """True once ``last_step`` has reached ``warmup_duration``."""
        return self.last_step >= self.warmup_duration


@register()
class LinearWarmup(Warmup):
    """Warm-up whose factor grows linearly to 1.0 over ``warmup_duration`` steps."""

    def __init__(self, lr_scheduler: LRScheduler, warmup_duration: int, last_step: int = -1) -> None:
        super().__init__(lr_scheduler, warmup_duration, last_step)

    def get_warmup_factor(self, step):
        ratio = (step + 1) / self.warmup_duration
        return min(1.0, ratio)

# (next file in this patch: rtdetrv2_pytorch/src/solver/__init__.py)
# ---- File: rtdetrv2_pytorch/src/solver/__init__.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

from ._solver import BaseSolver
from .clas_solver import ClasSolver
from .det_solver import DetSolver



from typing import Dict, Type

# Maps a task name to the solver *class* that handles it.
TASKS :Dict[str, Type[BaseSolver]] = {
    'classification': ClasSolver,
    'detection': DetSolver,
}


# ---- File: rtdetrv2_pytorch/src/solver/_solver.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import torch
import torch.nn as nn

from datetime import datetime
from pathlib import Path
from typing import Dict
import atexit

from ..misc import dist_utils
from ..core import BaseConfig


def to(m: nn.Module, device: str):
    """Move ``m`` to ``device``; ``None`` is passed through unchanged."""
    if m is None:
        return None
    return m.to(device)


class BaseSolver(object):
    """Common setup, checkpoint and state handling shared by the task solvers.

    Sub-classes implement ``fit`` and ``val``.
    """

    def __init__(self, cfg: BaseConfig) -> None:
        self.cfg = cfg

    def _setup(self, ):
        """Avoid instantiating unnecessary classes
        """
        cfg = self.cfg
        if cfg.device:
            device = torch.device(cfg.device)
        else:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = cfg.model

        # NOTE (lyuwenyu): must load_tuning_state before ema instance building
        if self.cfg.tuning:
            print(f'tuning checkpoint from {self.cfg.tuning}')
            self.load_tuning_state(self.cfg.tuning)

        self.model = dist_utils.warp_model(self.model.to(device), sync_bn=cfg.sync_bn, \
            find_unused_parameters=cfg.find_unused_parameters)

        self.criterion = to(cfg.criterion, device)
        self.postprocessor = to(cfg.postprocessor, device)

        self.ema = to(cfg.ema, device)
        self.scaler = cfg.scaler

        self.device = device
        self.last_epoch = self.cfg.last_epoch

        self.output_dir = Path(cfg.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.writer = cfg.writer

        if self.writer:
            # make sure the writer is closed at interpreter exit
            atexit.register(self.writer.close)
            if dist_utils.is_main_process():
                self.writer.add_text('config', '{:s}'.format(repr(cfg)), 0)

    def cleanup(self, ):
        """Close the summary writer, if one was set up."""
        writer = getattr(self, 'writer', None)
        if writer:
            writer.close()

    def train(self, ):
        """Prepare everything needed for training (model, optimizer, loaders, resume)."""
        self._setup()
        self.optimizer = self.cfg.optimizer
        self.lr_scheduler = self.cfg.lr_scheduler
        self.lr_warmup_scheduler = self.cfg.lr_warmup_scheduler

        self.train_dataloader = dist_utils.warp_loader(self.cfg.train_dataloader, \
            shuffle=self.cfg.train_dataloader.shuffle)
        self.val_dataloader = dist_utils.warp_loader(self.cfg.val_dataloader, \
            shuffle=self.cfg.val_dataloader.shuffle)

        self.evaluator = self.cfg.evaluator

        # NOTE instantiating order
        if self.cfg.resume:
            print(f'Resume checkpoint from {self.cfg.resume}')
            self.load_resume_state(self.cfg.resume)

    def eval(self, ):
        """Prepare everything needed for evaluation only."""
        self._setup()

        self.val_dataloader = dist_utils.warp_loader(self.cfg.val_dataloader, \
            shuffle=self.cfg.val_dataloader.shuffle)

        self.evaluator = self.cfg.evaluator

        if self.cfg.resume:
            print(f'Resume checkpoint from {self.cfg.resume}')
            self.load_resume_state(self.cfg.resume)

    def to(self, device):
        """Call ``.to(device)`` on every attribute that has such a method.

        The return value is discarded, so only objects that move themselves in
        place are affected.
        """
        for k, v in self.__dict__.items():
            if hasattr(v, 'to'):
                v.to(device)

    def state_dict(self):
        """state dict, train/eval

        Collects ``state_dict()`` of every attribute that offers one, plus the
        current date and ``last_epoch``.
        """
        state = {}
        state['date'] = datetime.now().isoformat()

        # TODO for resume
        state['last_epoch'] = self.last_epoch

        for k, v in self.__dict__.items():
            if hasattr(v, 'state_dict'):
                v = dist_utils.de_parallel(v)
                state[k] = v.state_dict()

        return state

    def load_state_dict(self, state):
        """load state dict, train/eval

        Restores ``last_epoch`` and every attribute with a ``load_state_dict``
        method for which ``state`` has an entry of the same name.
        """
        # TODO
        if 'last_epoch' in state:
            self.last_epoch = state['last_epoch']
            print('Load last_epoch')

        for k, v in self.__dict__.items():
            if not hasattr(v, 'load_state_dict'):
                continue
            if k in state:
                v = dist_utils.de_parallel(v)
                v.load_state_dict(state[k])
                print(f'Load {k}.state_dict')
            else:
                print(f'Not load {k}.state_dict')

    @staticmethod
    def _load_checkpoint(path):
        """Load a checkpoint onto the CPU from a URL ('http…') or a local file."""
        path = str(path)  # also accept pathlib.Path
        if path.startswith('http'):
            return torch.hub.load_state_dict_from_url(path, map_location='cpu')
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from sources you trust.
        return torch.load(path, map_location='cpu')

    def load_resume_state(self, path: str):
        """load resume
        """
        # for cuda:0 memory
        state = self._load_checkpoint(path)
        self.load_state_dict(state)

    def load_tuning_state(self, path: str,):
        """only load model for tuning and skip missed/dismatched keys
        """
        state = self._load_checkpoint(path)

        module = dist_utils.de_parallel(self.model)

        # TODO hard code
        # prefer the EMA weights when the checkpoint carries them
        if 'ema' in state:
            stat, infos = self._matched_state(module.state_dict(), state['ema']['module'])
        else:
            stat, infos = self._matched_state(module.state_dict(), state['model'])

        module.load_state_dict(stat, strict=False)
        print(f'Load model.state_dict, {infos}')

    @staticmethod
    def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tensor]):
        """Select the entries of ``params`` whose key and shape match ``state``.

        Returns:
            (matched_state, info) where ``info`` lists the keys of ``state`` that
            are missing from ``params`` ('missed') or differ in shape ('unmatched').
        """
        missed_list = []
        unmatched_list = []
        matched_state = {}
        for k, v in state.items():
            if k in params:
                if v.shape == params[k].shape:
                    matched_state[k] = params[k]
                else:
                    unmatched_list.append(k)
            else:
                missed_list.append(k)

        return matched_state, {'missed': missed_list, 'unmatched': unmatched_list}

    def fit(self, ):
        raise NotImplementedError('')

    def val(self, ):
        raise NotImplementedError('')

# (next file in this patch: rtdetrv2_pytorch/src/solver/clas_engine.py)
# ---- File: rtdetrv2_pytorch/src/solver/clas_engine.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import torch
import torch.nn as nn

from ..misc import (MetricLogger, SmoothedValue, reduce_dict)


def train_one_epoch(model: nn.Module, criterion: nn.Module, dataloader, optimizer, ema, epoch, device):
    """Train a classifier for one epoch.

    Args:
        model: network to train.
        criterion: called as ``criterion(preds, labels)`` and must return a tensor.
        dataloader: yields ``(imgs, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        ema: optional EMA wrapper updated after every optimizer step.
        epoch: epoch index, used for the log header only.
        device: device the batches are moved to.

    Returns:
        dict mapping each meter name to its global average.
    """
    model.train()

    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
    print_freq = 100
    header = 'Epoch: [{}]'.format(epoch)

    for imgs, labels in metric_logger.log_every(dataloader, print_freq, header):
        imgs = imgs.to(device)
        labels = labels.to(device)

        preds = model(imgs)
        loss: torch.Tensor = criterion(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if ema is not None:
            ema.update(model)

        loss_reduced_values = {k: v.item() for k, v in reduce_dict({'loss': loss}).items()}
        metric_logger.update(**loss_reduced_values)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)

    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    return stats


@torch.no_grad()
def evaluate(model, criterion, dataloader, device):
    """Evaluate a classifier: reports batch accuracy and loss.

    Returns:
        dict mapping each meter name ('acc', 'loss') to its global average.
    """
    model.eval()

    metric_logger = MetricLogger(delimiter=" ")
    # metric_logger.add_meter('acc', SmoothedValue(window_size=1, fmt='{global_avg:.4f}'))
    # metric_logger.add_meter('loss', SmoothedValue(window_size=1, fmt='{value:.2f}'))
    metric_logger.add_meter('acc', SmoothedValue(window_size=1))
    metric_logger.add_meter('loss', SmoothedValue(window_size=1))

    header = 'Test:'
    for imgs, labels in metric_logger.log_every(dataloader, 10, header):
        imgs, labels = imgs.to(device), labels.to(device)
        preds = model(imgs)

        # fraction of samples in the batch whose arg-max prediction is correct
        acc = (preds.argmax(dim=-1) == labels).sum() / preds.shape[0]
        loss = criterion(preds, labels)

        dict_reduced = reduce_dict({'acc': acc, 'loss': loss})
        reduced_values = {k: v.item() for k, v in dict_reduced.items()}
        metric_logger.update(**reduced_values)

    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)

    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    return stats


# ---- File: rtdetrv2_pytorch/src/solver/clas_solver.py ----
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import time
import json
import datetime
from pathlib import Path

import torch
import torch.nn as nn

from ..misc import dist_utils
from ._solver import BaseSolver
from .clas_engine import train_one_epoch, evaluate


class ClasSolver(BaseSolver):
    """Solver for the classification task."""

    def fit(self, ):
        """Run the full train / checkpoint / evaluate loop.

        Writes ``checkpoint.pth`` after every epoch (plus a numbered copy every
        ``checkpoint_freq`` epochs) and appends one JSON line per epoch to
        ``log.txt`` in the output directory.
        """
        print("Start training")
        self.train()
        args = self.cfg

        n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print('Number of params:', n_parameters)

        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        start_time = time.time()
        start_epoch = self.last_epoch + 1
        for epoch in range(start_epoch, args.epoches):

            if dist_utils.is_dist_available_and_initialized():
                self.train_dataloader.sampler.set_epoch(epoch)

            train_stats = train_one_epoch(self.model,
                                        self.criterion,
                                        self.train_dataloader,
                                        self.optimizer,
                                        self.ema,
                                        epoch=epoch,
                                        device=self.device)
            self.lr_scheduler.step()
            self.last_epoch += 1

            if output_dir:
                checkpoint_paths = [output_dir / 'checkpoint.pth']
                # extra checkpoint before LR drop and every 100 epochs
                if (epoch + 1) % args.checkpoint_freq == 0:
                    checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
                for checkpoint_path in checkpoint_paths:
                    # state_dict() takes no arguments; the epoch is stored via
                    # self.last_epoch, which was advanced just above.
                    dist_utils.save_on_master(self.state_dict(), checkpoint_path)

            # evaluate the EMA weights when an EMA is configured
            module = self.ema.module if self.ema else self.model
            test_stats = evaluate(module, self.criterion, self.val_dataloader, self.device)

            log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                        **{f'test_{k}': v for k, v in test_stats.items()},
                        'epoch': epoch,
                        'n_parameters': n_parameters}

            if output_dir and dist_utils.is_main_process():
                with (output_dir / "log.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")

        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print('Training time {}'.format(total_time_str))


# ---- File: rtdetrv2_pytorch/src/solver/det_engine.py ----
"""
Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
https://github.com/facebookresearch/detr/blob/main/engine.py

Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import sys
import math
from typing import Iterable

import torch
import torch.amp
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp.grad_scaler import GradScaler

from ..optim import ModelEMA, Warmup
from ..data import CocoEvaluator
from ..misc import MetricLogger, SmoothedValue, dist_utils


def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_norm: float = 0, **kwargs):
    """Train a detector for one epoch.

    Args:
        model: called as ``model(samples, targets=targets)``.
        criterion: called as ``criterion(outputs, targets, epoch=, step=, global_step=)``
            and must return a dict of loss tensors.
        data_loader: yields ``(samples, targets)``; must support ``len()``.
        optimizer: optimizer stepped once per batch.
        device: device the batch is moved to.
        epoch: current epoch index.
        max_norm: gradient-norm clipping threshold; clipping is skipped when <= 0.
        **kwargs: optional ``print_freq`` (default 10), ``writer``, ``ema``,
            ``scaler`` (enables mixed precision when not None) and
            ``lr_warmup_scheduler``.

    Returns:
        dict mapping each meter name to its global average.
    """
    model.train()
    criterion.train()
    metric_logger = MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    print_freq = kwargs.get('print_freq', 10)
    writer :SummaryWriter = kwargs.get('writer', None)

    ema :ModelEMA = kwargs.get('ema', None)
    scaler :GradScaler = kwargs.get('scaler', None)
    lr_warmup_scheduler :Warmup = kwargs.get('lr_warmup_scheduler', None)

    # autocast wants the bare device type ('cuda'), not an indexed name ('cuda:0')
    device_type = torch.device(device).type

    for i, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        global_step = epoch * len(data_loader) + i
        metas = dict(epoch=epoch, step=i, global_step=global_step)

        if scaler is not None:
            with torch.autocast(device_type=device_type, cache_enabled=True):
                outputs = model(samples, targets=targets)

            # the criterion is evaluated with autocast switched off
            with torch.autocast(device_type=device_type, enabled=False):
                loss_dict = criterion(outputs, targets, **metas)

            loss = sum(loss_dict.values())
            scaler.scale(loss).backward()

            if max_norm > 0:
                # gradients must be unscaled before their norm is clipped
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        else:
            outputs = model(samples, targets=targets)
            loss_dict = criterion(outputs, targets, **metas)

            loss : torch.Tensor = sum(loss_dict.values())
            optimizer.zero_grad()
            loss.backward()

            if max_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            optimizer.step()

        # ema
        if ema is not None:
            ema.update(model)

        if lr_warmup_scheduler is not None:
            lr_warmup_scheduler.step()

        loss_dict_reduced = dist_utils.reduce_dict(loss_dict)
        loss_value = sum(loss_dict_reduced.values())

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        metric_logger.update(loss=loss_value, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if writer and dist_utils.is_main_process():
            writer.add_scalar('Loss/total', loss_value.item(), global_step)
            for j, pg in enumerate(optimizer.param_groups):
                writer.add_scalar(f'Lr/pg_{j}', pg['lr'], global_step)
            for k, v in loss_dict_reduced.items():
                writer.add_scalar(f'Loss/{k}', v.item(), global_step)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}


+@torch.no_grad() +def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessor, data_loader, coco_evaluator: CocoEvaluator, device): + model.eval() + criterion.eval() + coco_evaluator.cleanup() + iou_types = coco_evaluator.iou_types + + metric_logger = MetricLogger(delimiter=" ") + header = 'Test:' + + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + outputs = model(samples) + + # TODO (lyuwenyu), fix dataset converted using `convert_to_coco_api`? + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + + results = postprocessor(outputs, orig_target_sizes) + + # if 'segm' in postprocessor.keys(): + # target_sizes = torch.stack([t["size"] for t in targets], dim=0) + # results = postprocessor['segm'](results, outputs, orig_target_sizes, target_sizes) + + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + if coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + if coco_evaluator is not None: + coco_evaluator.accumulate() + coco_evaluator.summarize() + + stats = {} + # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + if coco_evaluator is not None: + if 'bbox' in iou_types: + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + if 'segm' in iou_types: + stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + + return stats, coco_evaluator + + + diff --git a/rtdetrv2_pytorch/src/solver/det_solver.py b/rtdetrv2_pytorch/src/solver/det_solver.py new file mode 100644 index 0000000..af81989 --- /dev/null +++ 
b/rtdetrv2_pytorch/src/solver/det_solver.py @@ -0,0 +1,131 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import time +import json +import datetime + +import torch + +from ..misc import dist_utils, profiler_utils + +from ._solver import BaseSolver +from .det_engine import train_one_epoch, evaluate + + +class DetSolver(BaseSolver): + + def fit(self, ): + print("Start training") + self.train() + args = self.cfg + + n_parameters = sum([p.numel() for p in self.model.parameters() if p.requires_grad]) + print(f'number of trainable parameters: {n_parameters}') + + best_stat = {'epoch': -1, } + + start_time = time.time() + start_epcoch = self.last_epoch + 1 + + for epoch in range(start_epcoch, args.epoches): + + self.train_dataloader.set_epoch(epoch) + # self.train_dataloader.dataset.set_epoch(epoch) + if dist_utils.is_dist_available_and_initialized(): + self.train_dataloader.sampler.set_epoch(epoch) + + train_stats = train_one_epoch( + self.model, + self.criterion, + self.train_dataloader, + self.optimizer, + self.device, + epoch, + max_norm=args.clip_max_norm, + print_freq=args.print_freq, + ema=self.ema, + scaler=self.scaler, + lr_warmup_scheduler=self.lr_warmup_scheduler, + writer=self.writer + ) + + if self.lr_warmup_scheduler is None or self.lr_warmup_scheduler.finished(): + self.lr_scheduler.step() + + self.last_epoch += 1 + + if self.output_dir: + checkpoint_paths = [self.output_dir / 'last.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.checkpoint_freq == 0: + checkpoint_paths.append(self.output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + dist_utils.save_on_master(self.state_dict(), checkpoint_path) + + module = self.ema.module if self.ema else self.model + test_stats, coco_evaluator = evaluate( + module, + self.criterion, + self.postprocessor, + self.val_dataloader, + self.evaluator, + self.device + ) + + # TODO + for k in test_stats: + if self.writer and 
dist_utils.is_main_process(): + for i, v in enumerate(test_stats[k]): + self.writer.add_scalar(f'Test/{k}_{i}'.format(k), v, epoch) + + if k in best_stat: + best_stat['epoch'] = epoch if test_stats[k][0] > best_stat[k] else best_stat['epoch'] + best_stat[k] = max(best_stat[k], test_stats[k][0]) + else: + best_stat['epoch'] = epoch + best_stat[k] = test_stats[k][0] + + if best_stat['epoch'] == epoch and self.output_dir: + dist_utils.save_on_master(self.state_dict(), self.output_dir / 'best.pth') + + print(f'best_stat: {best_stat}') + + log_stats = { + **{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters + } + + if self.output_dir and dist_utils.is_main_process(): + with (self.output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + # for evaluation logs + if coco_evaluator is not None: + (self.output_dir / 'eval').mkdir(exist_ok=True) + if "bbox" in coco_evaluator.coco_eval: + filenames = ['latest.pth'] + if epoch % 50 == 0: + filenames.append(f'{epoch:03}.pth') + for name in filenames: + torch.save(coco_evaluator.coco_eval["bbox"].eval, + self.output_dir / "eval" / name) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + + def val(self, ): + self.eval() + + module = self.ema.module if self.ema else self.model + test_stats, coco_evaluator = evaluate(module, self.criterion, self.postprocessor, + self.val_dataloader, self.evaluator, self.device) + + if self.output_dir: + dist_utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, self.output_dir / "eval.pth") + + return diff --git a/rtdetrv2_pytorch/src/zoo/__init__.py b/rtdetrv2_pytorch/src/zoo/__init__.py new file mode 100644 index 0000000..b1bf6c5 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/__init__.py @@ -0,0 +1,5 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + + +from . import rtdetr diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/__init__.py b/rtdetrv2_pytorch/src/zoo/rtdetr/__init__.py new file mode 100644 index 0000000..6addf4f --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/__init__.py @@ -0,0 +1,14 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + + +from .rtdetr import RTDETR +from .matcher import HungarianMatcher +from .hybrid_encoder import HybridEncoder +from .rtdetr_decoder import RTDETRTransformer +from .rtdetr_criterion import RTDETRCriterion +from .rtdetr_postprocessor import RTDETRPostProcessor + +# v2 +from .rtdetrv2_decoder import RTDETRTransformerv2 +from .rtdetrv2_criterion import RTDETRCriterionv2 \ No newline at end of file diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/box_ops.py b/rtdetrv2_pytorch/src/zoo/rtdetr/box_ops.py new file mode 100644 index 0000000..9c52c2b --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/box_ops.py @@ -0,0 +1,90 @@ +""" +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +https://github.com/facebookresearch/detr/blob/main/util/box_ops.py +""" + +import torch +from torch import Tensor +from torchvision.ops.boxes import box_area + + +def box_cxcywh_to_xyxy(x: Tensor) -> Tensor: + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def box_xyxy_to_cxcywh(x: Tensor) -> Tensor: + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, + (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +# modified from torchvision to also return the union +def box_iou(boxes1: Tensor, boxes2: Tensor): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, 
union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + + The boxes should be in [x0, y0, x1, y1] format + + Returns a [N, M] pairwise matrix, where N = len(boxes1) + and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) \ No newline at end of file diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/conver_params.py b/rtdetrv2_pytorch/src/zoo/rtdetr/conver_params.py new file mode 100644 index 0000000..e93366a --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/conver_params.py @@ -0,0 +1,72 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch + +def main(args) -> None: + import cvperception + from cvperception.core import load_config, merge_config, create + cfg = load_config(args.config) + model: torch.nn.Module = create(cfg['model'], merge_config(cfg)) + + if args.version == 1: + state = model.state_dict() + keys = [k for k in state.keys() if 'num_batches_tracked' not in k] + + elif args.version == 2: + state = model.state_dict() + ignore_keys = ['anchors', 'valid_mask', 'num_points_scale'] + keys = [k for k in state.keys() if 'num_batches_tracked' not in k] + keys = [k for k in keys if not any([x in k for x in ignore_keys])] + + import paddle + p_state = paddle.load(args.pdparams) + pkeys = list(p_state.keys()) + + assert len(keys) == len(pkeys), f'{len(keys)}, {len(pkeys)}' + + new_state = {} + for i, k in enumerate(keys): + pp = p_state[pkeys[i]] + pp = torch.tensor(pp.numpy()) + + if 'denoising_class_embed' in k: + new_state[k] = torch.concat([pp, torch.zeros(1, pp.shape[-1])], dim=0) + continue + + tp = state[k] + if len(tp.shape) == 2: + new_state[k] = pp.T + elif len(tp.shape) == 1: + new_state[k] = pp + else: + assert tp.shape == pp.shape, f'{k}, {pp.shape}, {tp.shape}' + new_state[k] = pp + + assert len(new_state) == len(p_state), '' + + # checkpoint = {'ema': {'module': new_state, }} + # torch.save(checkpoint, args.output_file) + + model.load_state_dict(new_state, strict=False) + + checkpoint = {'ema': {'module': model.state_dict(), }} + torch.save(checkpoint, args.output_file) + + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, ) + parser.add_argument('-p', '--pdparams', type=str, ) + parser.add_argument('-o', '--output_file', type=str, ) + parser.add_argument('-v', '--version', type=int, default=1) + + args = parser.parse_args() + main(args) + + # python ./src/cvperception/zoo/rtdetr/conver_params.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -p rtdetr_r18vd_dec3_6x_coco.pdparams -o 
rtdetr_r18vd_dec3_6x_coco_new.pth + # python ./src/cvperception/zoo/rtdetr/conver_params.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -p rtdetr_r18vd_5x_coco_objects365.pdparams -o rtdetr_r18vd_5x_coco_objects365_new.pth + # python ./src/cvperception/zoo/rtdetr/conver_params.py -c configs/rtdetrv2/rtdetrv2_r50vd_120e_coco.yml -p rtdetr_r50vd_1x_objects365.pdparams -o rtdetrv2_r50vd_1x_objects365_new.pth -v 2 + diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/denoising.py b/rtdetrv2_pytorch/src/zoo/rtdetr/denoising.py new file mode 100644 index 0000000..4723b67 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/denoising.py @@ -0,0 +1,104 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch + +from .utils import inverse_sigmoid +from .box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh + + + +def get_contrastive_denoising_training_group(targets, + num_classes, + num_queries, + class_embed, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0,): + """cnd""" + if num_denoising <= 0: + return None, None, None, None + + num_gts = [len(t['labels']) for t in targets] + device = targets[0]['labels'].device + + max_gt_num = max(num_gts) + if max_gt_num == 0: + return None, None, None, None + + num_group = num_denoising // max_gt_num + num_group = 1 if num_group == 0 else num_group + # pad gt to max_num of a batch + bs = len(num_gts) + + input_query_class = torch.full([bs, max_gt_num], num_classes, dtype=torch.int32, device=device) + input_query_bbox = torch.zeros([bs, max_gt_num, 4], device=device) + pad_gt_mask = torch.zeros([bs, max_gt_num], dtype=torch.bool, device=device) + + for i in range(bs): + num_gt = num_gts[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets[i]['labels'] + input_query_bbox[i, :num_gt] = targets[i]['boxes'] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. 
+ input_query_class = input_query_class.tile([1, 2 * num_group]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) + # positive and negative mask + negative_gt_mask = torch.zeros([bs, max_gt_num * 2, 1], device=device) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + dn_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] + dn_positive_idx = torch.split(dn_positive_idx, [n * num_group for n in num_gts]) + # total denoising queries + num_denoising = int(max_gt_num * 2 * num_group) + + if label_noise_ratio > 0: + mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) + # randomly put a new one here + new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) + input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) + + if box_noise_scale > 0: + known_bbox = box_cxcywh_to_xyxy(input_query_bbox) + diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale + rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = torch.rand_like(input_query_bbox) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) + known_bbox += (rand_sign * rand_part * diff) + known_bbox = torch.clip(known_bbox, min=0.0, max=1.0) + input_query_bbox = box_xyxy_to_cxcywh(known_bbox) + input_query_bbox_unact = inverse_sigmoid(input_query_bbox) + + input_query_logits = class_embed(input_query_class) + + tgt_size = num_denoising + num_queries + attn_mask = torch.full([tgt_size, tgt_size], False, dtype=torch.bool, device=device) + # match query cannot see the reconstruction + attn_mask[num_denoising:, :num_denoising] = True + + # reconstruct cannot see each other + for 
i in range(num_group): + if i == 0: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True + if i == num_group - 1: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * i * 2] = True + else: + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), max_gt_num * 2 * (i + 1): num_denoising] = True + attn_mask[max_gt_num * 2 * i: max_gt_num * 2 * (i + 1), :max_gt_num * 2 * i] = True + + dn_meta = { + "dn_positive_idx": dn_positive_idx, + "dn_num_group": num_group, + "dn_num_split": [num_denoising, num_queries] + } + + # print(input_query_class.shape) # torch.Size([4, 196, 256]) + # print(input_query_bbox.shape) # torch.Size([4, 196, 4]) + # print(attn_mask.shape) # torch.Size([496, 496]) + + return input_query_logits, input_query_bbox_unact, attn_mask, dn_meta diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/hybrid_encoder.py b/rtdetrv2_pytorch/src/zoo/rtdetr/hybrid_encoder.py new file mode 100644 index 0000000..e8c22cc --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/hybrid_encoder.py @@ -0,0 +1,330 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import copy +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .utils import get_activation + +from ...core import register + + +__all__ = ['HybridEncoder'] + + + +class ConvNormLayer(nn.Module): + def __init__(self, ch_in, ch_out, kernel_size, stride, padding=None, bias=False, act=None): + super().__init__() + self.conv = nn.Conv2d( + ch_in, + ch_out, + kernel_size, + stride, + padding=(kernel_size-1)//2 if padding is None else padding, + bias=bias) + self.norm = nn.BatchNorm2d(ch_out) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + return self.act(self.norm(self.conv(x))) + + +class RepVggBlock(nn.Module): + def __init__(self, ch_in, ch_out, act='relu'): + super().__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.conv1 = ConvNormLayer(ch_in, ch_out, 3, 1, padding=1, act=None) + self.conv2 = ConvNormLayer(ch_in, ch_out, 1, 1, padding=0, act=None) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + if hasattr(self, 'conv'): + y = self.conv(x) + else: + y = self.conv1(x) + self.conv2(x) + + return self.act(y) + + def convert_to_deploy(self): + if not hasattr(self, 'conv'): + self.conv = nn.Conv2d(self.ch_in, self.ch_out, 3, 1, padding=1) + + kernel, bias = self.get_equivalent_kernel_bias() + self.conv.weight.data = kernel + self.conv.bias.data = bias + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1), bias3x3 + bias1x1 + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return F.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: ConvNormLayer): + if branch is None: + return 0, 0 + kernel = branch.conv.weight + running_mean = branch.norm.running_mean + running_var = 
branch.norm.running_var + gamma = branch.norm.weight + beta = branch.norm.bias + eps = branch.norm.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class CSPRepLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + num_blocks=3, + expansion=1.0, + bias=None, + act="silu"): + super(CSPRepLayer, self).__init__() + hidden_channels = int(out_channels * expansion) + self.conv1 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act) + self.conv2 = ConvNormLayer(in_channels, hidden_channels, 1, 1, bias=bias, act=act) + self.bottlenecks = nn.Sequential(*[ + RepVggBlock(hidden_channels, hidden_channels, act=act) for _ in range(num_blocks) + ]) + if hidden_channels != out_channels: + self.conv3 = ConvNormLayer(hidden_channels, out_channels, 1, 1, bias=bias, act=act) + else: + self.conv3 = nn.Identity() + + def forward(self, x): + x_1 = self.conv1(x) + x_1 = self.bottlenecks(x_1) + x_2 = self.conv2(x) + return self.conv3(x_1 + x_2) + + +# transformer +class TransformerEncoderLayer(nn.Module): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False): + super().__init__() + self.normalize_before = normalize_before + + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout, batch_first=True) + + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.activation = get_activation(activation) + + @staticmethod + def with_pos_embed(tensor, pos_embed): + return tensor if pos_embed is None else tensor + pos_embed + + def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + residual = src + if self.normalize_before: + src = 
self.norm1(src) + q = k = self.with_pos_embed(src, pos_embed) + src, _ = self.self_attn(q, k, value=src, attn_mask=src_mask) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, pos_embed=None) -> torch.Tensor: + output = src + for layer in self.layers: + output = layer(output, src_mask=src_mask, pos_embed=pos_embed) + + if self.norm is not None: + output = self.norm(output) + + return output + + +@register() +class HybridEncoder(nn.Module): + __share__ = ['eval_spatial_size', ] + + def __init__(self, + in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + hidden_dim=256, + nhead=8, + dim_feedforward = 1024, + dropout=0.0, + enc_act='gelu', + use_encoder_idx=[2], + num_encoder_layers=1, + pe_temperature=10000, + expansion=1.0, + depth_mult=1.0, + act='silu', + eval_spatial_size=None, + version='v2'): + super().__init__() + self.in_channels = in_channels + self.feat_strides = feat_strides + self.hidden_dim = hidden_dim + self.use_encoder_idx = use_encoder_idx + self.num_encoder_layers = num_encoder_layers + self.pe_temperature = pe_temperature + self.eval_spatial_size = eval_spatial_size + self.out_channels = [hidden_dim for _ in range(len(in_channels))] + self.out_strides = feat_strides + + # channel projection + self.input_proj = nn.ModuleList() + for in_channel in in_channels: + if version == 'v1': + proj = nn.Sequential( + nn.Conv2d(in_channel, 
hidden_dim, kernel_size=1, bias=False), + nn.BatchNorm2d(hidden_dim)) + elif version == 'v2': + proj = nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False)), + ('norm', nn.BatchNorm2d(hidden_dim)) + ])) + else: + raise AttributeError() + + self.input_proj.append(proj) + + # encoder transformer + encoder_layer = TransformerEncoderLayer( + hidden_dim, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=enc_act) + + self.encoder = nn.ModuleList([ + TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers) for _ in range(len(use_encoder_idx)) + ]) + + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1, 0, -1): + self.lateral_convs.append(ConvNormLayer(hidden_dim, hidden_dim, 1, 1, act=act)) + self.fpn_blocks.append( + CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion) + ) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(in_channels) - 1): + self.downsample_convs.append( + ConvNormLayer(hidden_dim, hidden_dim, 3, 2, act=act) + ) + self.pan_blocks.append( + CSPRepLayer(hidden_dim * 2, hidden_dim, round(3 * depth_mult), act=act, expansion=expansion) + ) + + self._reset_parameters() + + def _reset_parameters(self): + if self.eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pos_embed = self.build_2d_sincos_position_embedding( + self.eval_spatial_size[1] // stride, self.eval_spatial_size[0] // stride, + self.hidden_dim, self.pe_temperature) + setattr(self, f'pos_embed{idx}', pos_embed) + # self.register_buffer(f'pos_embed{idx}', pos_embed) + + @staticmethod + def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): + """ + """ + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + 
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij') + assert embed_dim % 4 == 0, \ + 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1. / (temperature ** omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward(self, feats): + assert len(feats) == len(self.in_channels) + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + + # encoder + if self.num_encoder_layers > 0: + for i, enc_ind in enumerate(self.use_encoder_idx): + h, w = proj_feats[enc_ind].shape[2:] + # flatten [B, C, H, W] to [B, HxW, C] + src_flatten = proj_feats[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_spatial_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + w, h, self.hidden_dim, self.pe_temperature).to(src_flatten.device) + else: + pos_embed = getattr(self, f'pos_embed{enc_ind}', None).to(src_flatten.device) + + memory :torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed) + proj_feats[enc_ind] = memory.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + # broadcasting and fusion + inner_outs = [proj_feats[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = proj_feats[idx - 1] + feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh) + inner_outs[0] = feat_heigh + upsample_feat = F.interpolate(feat_heigh, scale_factor=2., mode='nearest') + inner_out = self.fpn_blocks[len(self.in_channels)-1-idx](torch.concat([upsample_feat, feat_low], dim=1)) + inner_outs.insert(0, inner_out) + + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = 
self.downsample_convs[idx](feat_low) + out = self.pan_blocks[idx](torch.concat([downsample_feat, feat_height], dim=1)) + outs.append(out) + + return outs diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/matcher.py b/rtdetrv2_pytorch/src/zoo/rtdetr/matcher.py new file mode 100644 index 0000000..580ea91 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/matcher.py @@ -0,0 +1,111 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +Modules to compute the matching cost and solve the corresponding LSAP. + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from scipy.optimize import linear_sum_assignment +from typing import Dict + +from .box_ops import box_cxcywh_to_xyxy, generalized_box_iou + +from ...core import register + + +@register() +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). 
+ """ + + __share__ = ['use_focal_loss', ] + + def __init__(self, weight_dict, use_focal_loss=False, alpha=0.25, gamma=2.0): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = weight_dict['cost_class'] + self.cost_bbox = weight_dict['cost_bbox'] + self.cost_giou = weight_dict['cost_giou'] + + self.use_focal_loss = use_focal_loss + self.alpha = alpha + self.gamma = gamma + + assert self.cost_class != 0 or self.cost_bbox != 0 or self.cost_giou != 0, "all costs cant be 0" + + @torch.no_grad() + def forward(self, outputs: Dict[str, torch.Tensor], targets): + """ Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + if 
self.use_focal_loss: + out_prob = F.sigmoid(outputs["pred_logits"].flatten(0, 1)) + else: + out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + if self.use_focal_loss: + out_prob = out_prob[:, tgt_ids] + neg_cost_class = (1 - self.alpha) * (out_prob ** self.gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = self.alpha * ((1 - out_prob) ** self.gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class - neg_cost_class + else: + cost_class = -out_prob[:, tgt_ids] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost betwen boxes + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + + # Final cost matrix + C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = C.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + indices = [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + return {'indices': indices} + \ No newline at end of file diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr.py new file mode 100644 index 0000000..373f7bf --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr.py @@ -0,0 +1,44 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
+""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import random +import numpy as np +from typing import List + +from ...core import register + + +__all__ = ['RTDETR', ] + + +@register() +class RTDETR(nn.Module): + __inject__ = ['backbone', 'encoder', 'decoder', ] + + def __init__(self, \ + backbone: nn.Module, + encoder: nn.Module, + decoder: nn.Module, + ): + super().__init__() + self.backbone = backbone + self.decoder = decoder + self.encoder = encoder + + def forward(self, x, targets=None): + x = self.backbone(x) + x = self.encoder(x) + x = self.decoder(x, targets) + + return x + + def deploy(self, ): + self.eval() + for m in self.modules(): + if hasattr(m, 'convert_to_deploy'): + m.convert_to_deploy() + return self diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_criterion.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_criterion.py new file mode 100644 index 0000000..ab269e8 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_criterion.py @@ -0,0 +1,282 @@ +""" +reference: +https://github.com/facebookresearch/detr/blob/main/models/detr.py + +Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + + +import torch +import torch.nn as nn +import torch.distributed +import torch.nn.functional as F +import torchvision + +from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou +from ...misc.dist_utils import get_world_size, is_dist_available_and_initialized +from ...core import register + + + +@register() +class RTDETRCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + __share__ = ['num_classes', ] + __inject__ = ['matcher', ] + + def __init__(self, matcher, weight_dict, losses, alpha=0.2, gamma=2.0, eos_coef=1e-4, num_classes=80): + """ Create the criterion. 
+ Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. + """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.losses = losses + + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = eos_coef + self.register_buffer('empty_weight', empty_weight) + + self.alpha = alpha + self.gamma = gamma + + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + def loss_labels_focal(self, outputs, targets, indices, num_boxes, log=True): + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], 
self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + target = F.one_hot(target_classes, num_classes=self.num_classes+1)[..., :-1] + loss = torchvision.ops.sigmoid_focal_loss(src_logits, target, self.alpha, self.gamma, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + + return {'loss_focal': loss} + + def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True): + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + ious, _ = box_iou(box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes)) + ious = torch.diag(ious).detach() + + src_logits = outputs['pred_logits'] + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + target = F.one_hot(target_classes, num_classes=self.num_classes + 1)[..., :-1] + + target_score_o = torch.zeros_like(target_classes, dtype=src_logits.dtype) + target_score_o[idx] = ious.to(target_score_o.dtype) + target_score = target_score_o.unsqueeze(-1) * target + + pred_score = F.sigmoid(src_logits).detach() + weight = self.alpha * pred_score.pow(self.gamma) * (1 - target) + target_score + + loss = F.binary_cross_entropy_with_logits(src_logits, target_score, weight=weight, reduction='none') + loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes + return {'loss_vfl': loss} + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. 
It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + losses = {} + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(generalized_box_iou(\ + box_cxcywh_to_xyxy(src_boxes), box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': 
self.loss_labels, + 'boxes': self.loss_boxes, + 'cardinality': self.loss_cardinality, + 'focal': self.loss_labels_focal, + 'vfl': self.loss_labels_vfl, + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets, **kwargs): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k} + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_available_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets)['indices'] + + # Compute all the requested losses + losses = {} + for loss in self.losses: + l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + losses.update(l_dict) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets)['indices'] + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f'_aux_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + # In case of cdn auxiliary losses. For rtdetr + if 'dn_aux_outputs' in outputs: + assert 'dn_meta' in outputs, '' + indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets) + dn_num_boxes = num_boxes * outputs['dn_meta']['dn_num_group'] + for i, aux_outputs in enumerate(outputs['dn_aux_outputs']): + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. + continue + + l_dict = self.get_loss(loss, aux_outputs, targets, indices, dn_num_boxes, **kwargs) + l_dict = {k: l_dict[k] * self.weight_dict[k] for k in l_dict if k in self.weight_dict} + l_dict = {k + f'_dn_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + @staticmethod + def get_cdn_matched_indices(dn_meta, targets): + """get_cdn_matched_indices + """ + dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + num_gts = [len(t['labels']) for t in targets] + device = targets[0]['labels'].device + + dn_match_indices = [] + for i, num_gt in enumerate(num_gts): + if num_gt > 0: + gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) + gt_idx = gt_idx.tile(dn_num_group) + assert len(dn_positive_idx[i]) == len(gt_idx) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device), \ + torch.zeros(0, dtype=torch.int64, device=device))) + + return dn_match_indices + + + + + +@torch.no_grad() +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + if target.numel() == 0: + 
return [torch.zeros([], device=output.device)] + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + + + diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_decoder.py new file mode 100644 index 0000000..536fbf8 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -0,0 +1,583 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import math +import copy +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init + +from .denoising import get_contrastive_denoising_training_group +from .utils import deformable_attention_core_func, get_activation, inverse_sigmoid +from .utils import bias_init_with_prob + + +from ...core import register + + +__all__ = ['RTDETRTransformer'] + + + +class MLP(nn.Module): + def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.act = nn.Identity() if act is None else get_activation(act) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + + +class MSDeformableAttention(nn.Module): + def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4,): + """ + Multi-Scale Deformable Attention Module + """ + super(MSDeformableAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.total_points = num_heads * 
num_levels * num_points + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2,) + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + + self.ms_deformable_attn_core = deformable_attention_core_func + + self._reset_parameters() + + + def _reset_parameters(self): + # sampling_offsets + init.constant_(self.sampling_offsets.weight, 0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values + grid_init = grid_init.reshape(self.num_heads, 1, 1, 2).tile([1, self.num_levels, self.num_points, 1]) + scaling = torch.arange(1, self.num_points + 1, dtype=torch.float32).reshape(1, 1, -1, 1) + grid_init *= scaling + self.sampling_offsets.bias.data[...] = grid_init.flatten() + + # attention_weights + init.constant_(self.attention_weights.weight, 0) + init.constant_(self.attention_weights.bias, 0) + + # proj + init.xavier_uniform_(self.value_proj.weight) + init.constant_(self.value_proj.bias, 0) + init.xavier_uniform_(self.output_proj.weight) + init.constant_(self.output_proj.bias, 0) + + + def forward(self, + query, + reference_points, + value, + value_spatial_shapes, + value_mask=None): + """ + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] 
+ value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, Len_q = query.shape[:2] + Len_v = value.shape[1] + + value = self.value_proj(value) + if value_mask is not None: + value_mask = value_mask.astype(value.dtype).unsqueeze(-1) + value *= value_mask + value = value.reshape(bs, Len_v, self.num_heads, self.head_dim) + + sampling_offsets = self.sampling_offsets(query).reshape( + bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).reshape( + bs, Len_q, self.num_heads, self.num_levels * self.num_points) + attention_weights = F.softmax(attention_weights, dim=-1).reshape( + bs, Len_q, self.num_heads, self.num_levels, self.num_points) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(value_spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape( + 1, 1, 1, self.num_levels, 1, 2) + sampling_locations = reference_points.reshape( + bs, Len_q, 1, self.num_levels, 1, 2 + ) + sampling_offsets / offset_normalizer + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + sampling_offsets / + self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) + else: + raise ValueError( + "Last dim of reference_points must be 2 or 4, but get {} instead.". 
+ format(reference_points.shape[-1])) + + output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + return output + + +class TransformerDecoderLayer(nn.Module): + def __init__(self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0., + activation="relu", + n_levels=4, + n_points=4,): + super(TransformerDecoderLayer, self).__init__() + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # cross attention + self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, n_points) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.activation = getattr(F, activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + # self._reset_parameters() + + # def _reset_parameters(self): + # linear_init_(self.linear1) + # linear_init_(self.linear2) + # xavier_uniform_(self.linear1.weight) + # xavier_uniform_(self.linear2.weight) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + + def forward(self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask=None, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(tgt, query_pos_embed) + + # if attn_mask is not None: + # attn_mask = torch.where( + # attn_mask.to(torch.bool), + # torch.zeros_like(attn_mask), + # torch.full_like(attn_mask, float('-inf'), dtype=tgt.dtype)) + + tgt2, _ = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) + tgt = tgt + 
self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn(\ + self.with_pos_embed(tgt, query_pos_embed), + reference_points, + memory, + memory_spatial_shapes, + memory_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt2 = self.forward_ffn(tgt) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt.clamp(min=-65504, max=65504)) + + return tgt + + +class TransformerDecoder(nn.Module): + def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): + super(TransformerDecoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)]) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + + def forward(self, + tgt, + ref_points_unact, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + memory_mask=None): + output = tgt + dec_out_bboxes = [] + dec_out_logits = [] + ref_points_detach = F.sigmoid(ref_points_unact) + + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach) + + output = layer(output, ref_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) + + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + + elif i == self.eval_idx: + dec_out_logits.append(score_head[i](output)) + dec_out_bboxes.append(inter_ref_bbox) + break + + ref_points = inter_ref_bbox + ref_points_detach = inter_ref_bbox.detach( + ) if self.training else inter_ref_bbox + + return torch.stack(dec_out_bboxes), 
torch.stack(dec_out_logits) + + +@register() +class RTDETRTransformer(nn.Module): + __share__ = ['num_classes'] + def __init__(self, + num_classes=80, + hidden_dim=256, + num_queries=300, + position_embed_type='sine', + feat_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + num_levels=3, + num_points=4, + nhead=8, + num_layers=6, + dim_feedforward=1024, + dropout=0., + activation="relu", + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=False, + eval_spatial_size=None, + eval_idx=-1, + eps=1e-2, + aux_loss=True, + version='v1'): + + super(RTDETRTransformer, self).__init__() + assert position_embed_type in ['sine', 'learned'], \ + f'ValueError: position_embed_type not supported {position_embed_type}!' + assert len(feat_channels) <= num_levels + assert len(feat_strides) == len(feat_channels) + for _ in range(num_levels - len(feat_strides)): + feat_strides.append(feat_strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = feat_strides + self.num_levels = num_levels + self.num_classes = num_classes + self.num_queries = num_queries + self.eps = eps + self.num_layers = num_layers + self.eval_spatial_size = eval_spatial_size + self.aux_loss = aux_loss + + # backbone feature projection + self._build_input_proj_layer(feat_channels) + + # Transformer module + decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_points) + self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_layers, eval_idx) + + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + # denoising part + if num_denoising > 0: + # self.denoising_class_embed = nn.Embedding(num_classes, hidden_dim, padding_idx=num_classes-1) # TODO for load paddle weights + self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes) + 
init.normal_(self.denoising_class_embed.weight[:-1]) + + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) + + # encoder head + if version == 'v1': + self.enc_output = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm(hidden_dim,) + ) + else: + self.enc_output = nn.Sequential(OrderedDict([ + ('proj', nn.Linear(hidden_dim, hidden_dim)), + ('norm', nn.LayerNorm(hidden_dim,)), + ])) + + self.enc_score_head = nn.Linear(hidden_dim, num_classes) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + + # decoder head + self.dec_score_head = nn.ModuleList([ + nn.Linear(hidden_dim, num_classes) + for _ in range(num_layers) + ]) + self.dec_bbox_head = nn.ModuleList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for _ in range(num_layers) + ]) + + # init encoder output anchors and valid_mask + if self.eval_spatial_size: + self.anchors, self.valid_mask = self._generate_anchors() + + self._reset_parameters() + + def _reset_parameters(self): + bias = bias_init_with_prob(0.01) + + init.constant_(self.enc_score_head.bias, bias) + init.constant_(self.enc_bbox_head.layers[-1].weight, 0) + init.constant_(self.enc_bbox_head.layers[-1].bias, 0) + + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + init.constant_(cls_.bias, bias) + init.constant_(reg_.layers[-1].weight, 0) + init.constant_(reg_.layers[-1].bias, 0) + + # linear_init_(self.enc_output[0]) + init.xavier_uniform_(self.enc_output[0].weight) + if self.learnt_init_query: + init.xavier_uniform_(self.tgt_embed.weight) + init.xavier_uniform_(self.query_pos_head.layers[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[1].weight) + + + def _build_input_proj_layer(self, feat_channels): + self.input_proj = nn.ModuleList() + for in_channels in feat_channels: + self.input_proj.append( + 
nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim,))]) + ) + ) + + in_channels = feat_channels[-1] + + for _ in range(self.num_levels - len(feat_channels)): + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim))]) + ) + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + level_start_index = [0, ] + for i, feat in enumerate(proj_feats): + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + # [l], start index of each level + level_start_index.append(h * w + level_start_index[-1]) + + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + level_start_index.pop() + return (feat_flatten, spatial_shapes, level_start_index) + + def _generate_anchors(self, + spatial_shapes=None, + grid_size=0.05, + dtype=torch.float32, + device='cpu'): + if spatial_shapes is None: + spatial_shapes = [[int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)] + for s in self.feat_strides + ] + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(\ + torch.arange(end=h, dtype=dtype), \ + torch.arange(end=w, dtype=dtype), indexing='ij') + grid_xy = torch.stack([grid_x, grid_y], -1) + valid_WH = torch.tensor([w, h]).to(dtype) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / 
valid_WH + wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, h * w, 4)) + + anchors = torch.concat(anchors, 1).to(device) + valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + # anchors = torch.where(valid_mask, anchors, float('inf')) + # anchors[valid_mask] = torch.inf # valid_mask [1, 8400, 1] + anchors = torch.where(valid_mask, anchors, torch.inf) + + return anchors, valid_mask + + + def _get_decoder_input(self, + memory, + spatial_shapes, + denoising_class=None, + denoising_bbox_unact=None): + bs, _, _ = memory.shape + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device) + + # memory = torch.where(valid_mask, memory, 0) + memory = valid_mask.to(memory.dtype) * memory # TODO fix type error for onnx export + + output_memory = self.enc_output(memory) + + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors + + _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1) + + reference_points_unact = enc_outputs_coord_unact.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_unact.shape[-1])) + + enc_topk_bboxes = F.sigmoid(reference_points_unact) + if denoising_bbox_unact is not None: + reference_points_unact = torch.concat( + [denoising_bbox_unact, reference_points_unact], 1) + + enc_topk_logits = enc_outputs_class.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])) + + # extract region features + if self.learnt_init_query: + target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) + else: + target = output_memory.gather(dim=1, \ + 
index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1])) + target = target.detach() + + if denoising_class is not None: + target = torch.concat([denoising_class, target], 1) + + return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits + + + def forward(self, feats, targets=None): + + # input projection and embedding + (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(targets, \ + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, ) + else: + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact) + + # decoder + out_bboxes, out_logits = self.decoder( + target, + init_ref_points_unact, + memory, + spatial_shapes, + level_start_index, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + attn_mask=attn_mask) + + if self.training and dn_meta is not None: + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + + if self.training and self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) + out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) + + if self.training and dn_meta is not None: + out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out['dn_meta'] = dn_meta + + return out + + + @torch.jit.unused + def 
_set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class, outputs_coord)] diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py new file mode 100644 index 0000000..efe58fd --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -0,0 +1,94 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import torchvision + +from ...core import register + + +__all__ = ['RTDETRPostProcessor'] + + +def mod(a, b): + out = a - a // b * b + return out + + +@register() +class RTDETRPostProcessor(nn.Module): + __share__ = [ + 'num_classes', + 'use_focal_loss', + 'num_top_queries', + 'remap_mscoco_category' + ] + + def __init__( + self, + num_classes=80, + use_focal_loss=True, + num_top_queries=300, + remap_mscoco_category=False + ) -> None: + super().__init__() + self.use_focal_loss = use_focal_loss + self.num_top_queries = num_top_queries + self.num_classes = int(num_classes) + self.remap_mscoco_category = remap_mscoco_category + self.deploy_mode = False + + def extra_repr(self) -> str: + return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}' + + # def forward(self, outputs, orig_target_sizes): + def forward(self, outputs, orig_target_sizes: torch.Tensor): + logits, boxes = outputs['pred_logits'], outputs['pred_boxes'] + # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + + bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') + bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1) + + if self.use_focal_loss: + scores = F.sigmoid(logits) + 
scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1) + # TODO for older tensorrt + # labels = index % self.num_classes + labels = mod(index, self.num_classes) + index = index // self.num_classes + boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1])) + + else: + scores = F.softmax(logits, dim=-1)[:, :, :-1] + scores, labels = scores.max(dim=-1) + boxes = bbox_pred + if scores.shape[1] > self.num_top_queries: + scores, index = torch.topk(scores, self.num_top_queries, dim=-1) + labels = torch.gather(labels, dim=1, index=index) + boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) + + # TODO for onnx export + if self.deploy_mode: + return labels, boxes, scores + + # TODO + if self.remap_mscoco_category: + from ...data.dataset import mscoco_label2category + labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\ + .to(boxes.device).reshape(labels.shape) + + results = [] + for lab, box, sco in zip(labels, boxes, scores): + result = dict(labels=lab, boxes=box, scores=sco) + results.append(result) + + return results + + + def deploy(self, ): + self.eval() + self.deploy_mode = True + return self diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_criterion.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_criterion.py new file mode 100644 index 0000000..c69e368 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_criterion.py @@ -0,0 +1,265 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import torch +import torch.nn as nn +import torch.distributed +import torch.nn.functional as F +import torchvision + +import copy + +from .box_ops import box_cxcywh_to_xyxy, box_iou, generalized_box_iou +from ...misc.dist_utils import get_world_size, is_dist_available_and_initialized +from ...core import register + + +@register() +class RTDETRCriterionv2(nn.Module): + """ This class computes the loss for DETR. 
def loss_labels_vfl(self, outputs, targets, indices, num_boxes, values=None):
    """Classification loss whose positive targets are per-match quality scores.

    Args:
        outputs: mapping with ``'pred_logits'`` and ``'pred_boxes'``.
        targets: per-image dicts; ``'labels'`` is always read, ``'boxes'``
            only when ``values`` is not supplied.
        indices: per-image ``(src, tgt)`` index pairs from the matcher.
        num_boxes: normaliser the summed loss is divided by.
        values: optional precomputed score per matched pair. When ``None``
            the IoU between each matched prediction and its target is used
            (detached).

    Returns:
        ``{'loss_vfl': loss}``.
    """
    assert 'pred_boxes' in outputs
    matched_idx = self._get_src_permutation_idx(indices)

    if values is not None:
        ious = values
    else:
        pred = outputs['pred_boxes'][matched_idx]
        gt = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)
        pairwise, _ = box_iou(box_cxcywh_to_xyxy(pred), box_cxcywh_to_xyxy(gt))
        # Only the diagonal pairs prediction k with its own matched target k.
        ious = torch.diag(pairwise).detach()

    logits = outputs['pred_logits']
    matched_labels = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])

    # Unmatched queries get index `num_classes`, which the one-hot slice removes.
    class_map = torch.full(logits.shape[:2], self.num_classes,
                           dtype=torch.int64, device=logits.device)
    class_map[matched_idx] = matched_labels
    one_hot = F.one_hot(class_map, num_classes=self.num_classes + 1)[..., :-1]

    quality = torch.zeros_like(class_map, dtype=logits.dtype)
    quality[matched_idx] = ious.to(quality.dtype)
    target_score = quality.unsqueeze(-1) * one_hot

    # Negatives are weighted by the detached prediction, positives by their target score.
    pred_score = F.sigmoid(logits).detach()
    weight = self.alpha * pred_score.pow(self.gamma) * (1 - one_hot) + target_score

    elementwise = F.binary_cross_entropy_with_logits(
        logits, target_score, weight=weight, reduction='none')
    loss = elementwise.mean(1).sum() * logits.shape[1] / num_boxes
    return {'loss_vfl': loss}
def _weighted_losses(self, outputs, targets, indices, num_boxes, suffix=''):
    """Compute every loss in ``self.losses`` for one output set.

    Each returned entry is multiplied by its ``self.weight_dict`` weight
    (entries without a weight are dropped) and its key gets ``suffix`` appended.
    """
    collected = {}
    for loss in self.losses:
        meta = self.get_loss_meta_info(loss, outputs, targets, indices)
        l_dict = self.get_loss(loss, outputs, targets, indices, num_boxes, **meta)
        for k, v in l_dict.items():
            if k in self.weight_dict:
                collected[k + suffix] = v * self.weight_dict[k]
    return collected


def forward(self, outputs, targets, **kwargs):
    """Compute the weighted training losses.

    Parameters:
        outputs: dict of tensors, see the output specification of the model for the format.
            Optional keys ``'aux_outputs'``, ``'dn_aux_outputs'`` (requires ``'dn_meta'``)
            and ``'enc_aux_outputs'`` (requires ``'enc_meta'``) add losses suffixed
            ``_aux_{i}``, ``_dn_{i}`` and ``_enc_{i}``.
        targets: list of dicts, such that len(targets) == batch_size.
            The expected keys in each dict depends on the losses applied, see each loss' doc

    Returns:
        dict mapping loss name to weighted loss value.
    """
    outputs_without_aux = {k: v for k, v in outputs.items() if 'aux' not in k}

    # Average number of target boxes across all nodes, for normalization purposes.
    num_boxes = sum(len(t["labels"]) for t in targets)
    num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
    if is_dist_available_and_initialized():
        torch.distributed.all_reduce(num_boxes)
    num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

    # Matching between the outputs of the last layer and the targets.
    indices = self.matcher(outputs_without_aux, targets)['indices']
    losses = self._weighted_losses(outputs, targets, indices, num_boxes)

    # Auxiliary losses: repeat the process with the output of each intermediate layer.
    # With share_matched_indices the last-layer matching above is reused.
    if 'aux_outputs' in outputs:
        for i, aux_outputs in enumerate(outputs['aux_outputs']):
            if not self.share_matched_indices:
                indices = self.matcher(aux_outputs, targets)['indices']
            losses.update(self._weighted_losses(
                aux_outputs, targets, indices, num_boxes, f'_aux_{i}'))

    # Denoising (cdn) auxiliary losses: matching is fixed by dn_meta and the
    # normaliser is scaled by the number of denoising groups.
    if 'dn_aux_outputs' in outputs:
        assert 'dn_meta' in outputs, "'dn_aux_outputs' requires 'dn_meta' in outputs"
        indices = self.get_cdn_matched_indices(outputs['dn_meta'], targets)
        dn_num_boxes = num_boxes * outputs['dn_meta']['dn_num_group']
        for i, aux_outputs in enumerate(outputs['dn_aux_outputs']):
            losses.update(self._weighted_losses(
                aux_outputs, targets, indices, dn_num_boxes, f'_dn_{i}'))

    # Encoder auxiliary losses (rtdetr v2).
    if 'enc_aux_outputs' in outputs:
        assert 'enc_meta' in outputs, "'enc_aux_outputs' requires 'enc_meta' in outputs"
        class_agnostic = outputs['enc_meta']['class_agnostic']
        if class_agnostic:
            # Single-class setting: every target label becomes 0.
            enc_targets = copy.deepcopy(targets)
            for t in enc_targets:
                t['labels'] = torch.zeros_like(t["labels"])
        else:
            enc_targets = targets

        orig_num_classes = self.num_classes
        if class_agnostic:
            self.num_classes = 1
        try:
            for i, aux_outputs in enumerate(outputs['enc_aux_outputs']):
                # Match against the same targets the losses are computed on, so the
                # labels agree with the number of encoder logits in class-agnostic mode.
                # NOTE(review): the matcher is not visible here; this assumes it reads
                # only 'labels'/'boxes' from targets -- confirm.
                indices = self.matcher(aux_outputs, enc_targets)['indices']
                losses.update(self._weighted_losses(
                    aux_outputs, enc_targets, indices, num_boxes, f'_enc_{i}'))
        finally:
            # Always undo the temporary override, even if a loss raised.
            self.num_classes = orig_num_classes

    return losses
box_iou(box_cxcywh_to_xyxy(src_boxes.detach()), box_cxcywh_to_xyxy(target_boxes)) + iou = torch.diag(iou) + elif self.boxes_weight_format == 'giou': + iou = torch.diag(generalized_box_iou(\ + box_cxcywh_to_xyxy(src_boxes.detach()), box_cxcywh_to_xyxy(target_boxes))) + else: + raise AttributeError() + + if loss in ('boxes', ): + meta = {'boxes_weight': iou} + elif loss in ('vfl', ): + meta = {'values': iou} + else: + meta = {} + + return meta + + @staticmethod + def get_cdn_matched_indices(dn_meta, targets): + """get_cdn_matched_indices + """ + dn_positive_idx, dn_num_group = dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + num_gts = [len(t['labels']) for t in targets] + device = targets[0]['labels'].device + + dn_match_indices = [] + for i, num_gt in enumerate(num_gts): + if num_gt > 0: + gt_idx = torch.arange(num_gt, dtype=torch.int64, device=device) + gt_idx = gt_idx.tile(dn_num_group) + assert len(dn_positive_idx[i]) == len(gt_idx) + dn_match_indices.append((dn_positive_idx[i], gt_idx)) + else: + dn_match_indices.append((torch.zeros(0, dtype=torch.int64, device=device), \ + torch.zeros(0, dtype=torch.int64, device=device))) + + return dn_match_indices diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_decoder.py b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_decoder.py new file mode 100644 index 0000000..e35a7c3 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/rtdetrv2_decoder.py @@ -0,0 +1,609 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
class MLP(nn.Module):
    """Stack of linear layers with an activation after every layer but the last."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'):
        """Create the linear layers.

        Args:
            input_dim: width of the first layer's input.
            hidden_dim: width of every intermediate layer.
            output_dim: width of the final layer's output.
            num_layers: number of linear layers.
            act: activation spec forwarded to ``get_activation``.
        """
        super().__init__()
        self.num_layers = num_layers
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )
        self.act = get_activation(act)

    def forward(self, x):
        """Apply the layers in order; the final layer's output is returned unactivated."""
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = self.act(x)
        return x
nn.Linear(embed_dim, self.total_points * 2) + self.attention_weights = nn.Linear(embed_dim, self.total_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + + self.ms_deformable_attn_core = functools.partial(deformable_attention_core_func_v2, method=self.method) + + self._reset_parameters() + + if method == 'discrete': + for p in self.sampling_offsets.parameters(): + p.requires_grad = False + + def _reset_parameters(self): + # sampling_offsets + init.constant_(self.sampling_offsets.weight, 0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values + grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + self.sampling_offsets.bias.data[...] 
def forward(self,
            query: torch.Tensor,
            reference_points: torch.Tensor,
            value: torch.Tensor,
            value_spatial_shapes: List[List[int]],
            value_mask: torch.Tensor = None):
    """Multi-scale deformable attention over ``value``.

    Args:
        query (Tensor): [bs, query_length, C]
        reference_points (Tensor): either [bs, query_length, n_levels, 2]
            (one point per level, range in [0, 1], top-left (0,0),
            bottom-right (1, 1), including padding area) or a tensor with last
            dim 4 that broadcasts as [bs, query_length, 1, 4]; the last two
            entries scale the sampling offsets.
        value (Tensor): [bs, value_length, C]
        value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

    Returns:
        output (Tensor): [bs, Length_{query}, C]

    Raises:
        ValueError: if the last dim of ``reference_points`` is neither 2 nor 4.
    """
    bs, Len_q = query.shape[:2]
    Len_v = value.shape[1]
    total_points = sum(self.num_points_list)

    value = self.value_proj(value)
    if value_mask is not None:
        # Zero out padded positions so they contribute nothing when sampled.
        value = value * value_mask.to(value.dtype).unsqueeze(-1)

    value = value.reshape(bs, Len_v, self.num_heads, self.head_dim)

    sampling_offsets: torch.Tensor = self.sampling_offsets(query)
    sampling_offsets = sampling_offsets.reshape(bs, Len_q, self.num_heads, total_points, 2)

    attention_weights = self.attention_weights(query).reshape(bs, Len_q, self.num_heads, total_points)
    attention_weights = F.softmax(attention_weights, dim=-1)

    if reference_points.shape[-1] == 2:
        # Sampling points are stored level-major along one flat axis, so expand the
        # per-level (w, h) normalizer and the per-level reference point to one entry
        # per sampling point. The earlier 6-D formulation did not broadcast against
        # the 5-D offsets.
        level_of_point = [lvl for lvl, n in enumerate(self.num_points_list) for _ in range(n)]
        offset_normalizer = torch.tensor(
            [[value_spatial_shapes[lvl][1], value_spatial_shapes[lvl][0]] for lvl in level_of_point],
            dtype=sampling_offsets.dtype, device=sampling_offsets.device)
        ref = reference_points.reshape(bs, Len_q, 1, self.num_levels, 2)[:, :, :, level_of_point]
        sampling_locations = ref + sampling_offsets / offset_normalizer
    elif reference_points.shape[-1] == 4:
        # reference_points [8, 480, None, 1, 4]
        # sampling_offsets [8, 480, 8, 12, 2]
        num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
        offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale
        sampling_locations = reference_points[:, :, None, :, :2] + offset
    else:
        raise ValueError(
            "Last dim of reference_points must be 2 or 4, but get {} instead.".
            format(reference_points.shape[-1]))

    output = self.ms_deformable_attn_core(
        value, value_spatial_shapes, sampling_locations, attention_weights, self.num_points_list)

    output = self.output_proj(output)

    return output
target, + reference_points, + memory, + memory_spatial_shapes, + attn_mask=None, + memory_mask=None, + query_pos_embed=None): + # self attention + q = k = self.with_pos_embed(target, query_pos_embed) + + target2, _ = self.self_attn(q, k, value=target, attn_mask=attn_mask) + target = target + self.dropout1(target2) + target = self.norm1(target) + + # cross attention + target2 = self.cross_attn(\ + self.with_pos_embed(target, query_pos_embed), + reference_points, + memory, + memory_spatial_shapes, + memory_mask) + target = target + self.dropout2(target2) + target = self.norm2(target) + + # ffn + target2 = self.forward_ffn(target) + target = target + self.dropout4(target2) + target = self.norm3(target) + + return target + + +class TransformerDecoder(nn.Module): + def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): + super(TransformerDecoder, self).__init__() + self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)]) + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + + def forward(self, + target, + ref_points_unact, + memory, + memory_spatial_shapes, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + memory_mask=None): + dec_out_bboxes = [] + dec_out_logits = [] + ref_points_detach = F.sigmoid(ref_points_unact) + + output = target + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach) + + output = layer(output, ref_points_input, memory, memory_spatial_shapes, attn_mask, memory_mask, query_pos_embed) + + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + + elif i == self.eval_idx: + 
def __init__(self,
             num_classes=80,
             hidden_dim=256,
             num_queries=300,
             feat_channels=[512, 1024, 2048],
             feat_strides=[8, 16, 32],
             num_levels=3,
             num_points=4,
             nhead=8,
             num_layers=6,
             dim_feedforward=1024,
             dropout=0.,
             activation="relu",
             num_denoising=100,
             label_noise_ratio=0.5,
             box_noise_scale=1.0,
             learn_query_content=False,
             eval_spatial_size=None,
             eval_idx=-1,
             eps=1e-2,
             aux_loss=True,
             cross_attn_method='default',
             query_select_method='default'):
    """Build the input projections, decoder stack and prediction heads.

    Args:
        num_classes: width of the decoder score heads (and of the encoder
            score head unless ``query_select_method == 'agnostic'``).
        hidden_dim: embedding width used throughout.
        num_queries: number of queries selected from the encoder output.
        feat_channels: input channels per backbone level; at most ``num_levels`` entries.
        feat_strides: stride per backbone level, same length as ``feat_channels``.
            Missing levels up to ``num_levels`` are filled in by doubling the last stride.
        num_levels: number of feature levels the decoder attends to.
        num_points: sampling points per level for the deformable cross attention.
        nhead, num_layers, dim_feedforward, dropout, activation: decoder layer settings.
        num_denoising, label_noise_ratio, box_noise_scale: denoising settings; the
            class embedding is only created when ``num_denoising > 0``.
        learn_query_content: when true a learned ``tgt_embed`` is created.
        eval_spatial_size: optional ``(h, w)``; when set, anchors and their valid
            mask are precomputed and registered as buffers.
        eval_idx: forwarded to ``TransformerDecoder``.
        eps: margin used when validating anchors.
        aux_loss: stored as ``self.aux_loss``.
        cross_attn_method: ``'default'`` or ``'discrete'``.
        query_select_method: ``'default'``, ``'one2many'`` or ``'agnostic'``.
    """
    super().__init__()

    # Work on private copies: the stride list is extended below, and extending the
    # argument in place would modify the shared default list (and any list the
    # caller passed), breaking the length check on the next construction.
    feat_channels = list(feat_channels)
    feat_strides = list(feat_strides)

    assert len(feat_channels) <= num_levels
    assert len(feat_strides) == len(feat_channels)

    for _ in range(num_levels - len(feat_strides)):
        feat_strides.append(feat_strides[-1] * 2)

    self.hidden_dim = hidden_dim
    self.nhead = nhead
    self.feat_strides = feat_strides
    self.num_levels = num_levels
    self.num_classes = num_classes
    self.num_queries = num_queries
    self.eps = eps
    self.num_layers = num_layers
    self.eval_spatial_size = eval_spatial_size
    self.aux_loss = aux_loss

    assert query_select_method in ('default', 'one2many', 'agnostic'), ''
    assert cross_attn_method in ('default', 'discrete'), ''
    self.cross_attn_method = cross_attn_method
    self.query_select_method = query_select_method

    # backbone feature projection
    self._build_input_proj_layer(feat_channels)

    # Transformer module
    decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout,
                                            activation, num_levels, num_points,
                                            cross_attn_method=cross_attn_method)
    self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_layers, eval_idx)

    # denoising
    self.num_denoising = num_denoising
    self.label_noise_ratio = label_noise_ratio
    self.box_noise_scale = box_noise_scale
    if num_denoising > 0:
        # The extra last row is the padding index and is excluded from the init.
        self.denoising_class_embed = nn.Embedding(num_classes + 1, hidden_dim, padding_idx=num_classes)
        init.normal_(self.denoising_class_embed.weight[:-1])

    # decoder embedding
    self.learn_query_content = learn_query_content
    if learn_query_content:
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
    self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2)

    # if num_select_queries != self.num_queries:
    #     layer = TransformerEncoderLayer(hidden_dim, nhead, dim_feedforward, activation='gelu')
    #     self.encoder = TransformerEncoder(layer, 1)

    self.enc_output = nn.Sequential(OrderedDict([
        ('proj', nn.Linear(hidden_dim, hidden_dim)),
        ('norm', nn.LayerNorm(hidden_dim,)),
    ]))

    # Class-agnostic query selection scores each location with a single logit.
    if query_select_method == 'agnostic':
        self.enc_score_head = nn.Linear(hidden_dim, 1)
    else:
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)

    self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3)

    # decoder head
    self.dec_score_head = nn.ModuleList([
        nn.Linear(hidden_dim, num_classes) for _ in range(num_layers)
    ])
    self.dec_bbox_head = nn.ModuleList([
        MLP(hidden_dim, hidden_dim, 4, 3) for _ in range(num_layers)
    ])

    # init encoder output anchors and valid_mask
    if self.eval_spatial_size:
        anchors, valid_mask = self._generate_anchors()
        self.register_buffer('anchors', anchors)
        self.register_buffer('valid_mask', valid_mask)

    self._reset_parameters()
self.learn_query_content: + init.xavier_uniform_(self.tgt_embed.weight) + init.xavier_uniform_(self.query_pos_head.layers[0].weight) + init.xavier_uniform_(self.query_pos_head.layers[1].weight) + for m in self.input_proj: + init.xavier_uniform_(m[0].weight) + + def _build_input_proj_layer(self, feat_channels): + self.input_proj = nn.ModuleList() + for in_channels in feat_channels: + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim,))]) + ) + ) + + in_channels = feat_channels[-1] + + for _ in range(self.num_levels - len(feat_channels)): + self.input_proj.append( + nn.Sequential(OrderedDict([ + ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), + ('norm', nn.BatchNorm2d(self.hidden_dim))]) + ) + ) + in_channels = self.hidden_dim + + def _get_encoder_input(self, feats: List[torch.Tensor]): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.num_levels > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.num_levels): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + for i, feat in enumerate(proj_feats): + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [num_levels, 2] + spatial_shapes.append([h, w]) + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + return feat_flatten, spatial_shapes + + def _generate_anchors(self, + spatial_shapes=None, + grid_size=0.05, + dtype=torch.float32, + device='cpu'): + if spatial_shapes is None: + spatial_shapes = [] + eval_h, eval_w = self.eval_spatial_size + for s in self.feat_strides: + spatial_shapes.append([int(eval_h / s), int(eval_w / s)]) + + anchors = [] + for lvl, (h, w) in 
enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') + grid_xy = torch.stack([grid_x, grid_y], dim=-1) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype) + wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4) + anchors.append(lvl_anchors) + + anchors = torch.concat(anchors, dim=1).to(device) + valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.inf) + + return anchors, valid_mask + + + def _get_decoder_input(self, + memory: torch.Tensor, + spatial_shapes, + denoising_logits=None, + denoising_bbox_unact=None): + + # prepare input for decoder + if self.training or self.eval_spatial_size is None: + anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + else: + anchors = self.anchors + valid_mask = self.valid_mask + + # memory = torch.where(valid_mask, memory, 0) + # TODO fix type error for onnx export + memory = valid_mask.to(memory.dtype) * memory + + output_memory :torch.Tensor = self.enc_output(memory) + enc_outputs_logits :torch.Tensor = self.enc_score_head(output_memory) + enc_outputs_coord_unact :torch.Tensor = self.enc_bbox_head(output_memory) + anchors + + enc_topk_bboxes_list, enc_topk_logits_list = [], [] + enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = \ + self._select_topk(output_memory, enc_outputs_logits, enc_outputs_coord_unact, self.num_queries) + + if self.training: + enc_topk_bboxes = F.sigmoid(enc_topk_bbox_unact) + enc_topk_bboxes_list.append(enc_topk_bboxes) + enc_topk_logits_list.append(enc_topk_logits) + + # if self.num_select_queries != self.num_queries: + # raise NotImplementedError('') + + if self.learn_query_content: + content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1]) + else: + content = 
enc_topk_memory.detach() + + enc_topk_bbox_unact = enc_topk_bbox_unact.detach() + + if denoising_bbox_unact is not None: + enc_topk_bbox_unact = torch.concat([denoising_bbox_unact, enc_topk_bbox_unact], dim=1) + content = torch.concat([denoising_logits, content], dim=1) + + return content, enc_topk_bbox_unact, enc_topk_bboxes_list, enc_topk_logits_list + + def _select_topk(self, memory: torch.Tensor, outputs_logits: torch.Tensor, outputs_coords_unact: torch.Tensor, topk: int): + if self.query_select_method == 'default': + _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1) + + elif self.query_select_method == 'one2many': + _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1) + topk_ind = topk_ind // self.num_classes + + elif self.query_select_method == 'agnostic': + _, topk_ind = torch.topk(outputs_logits.squeeze(-1), topk, dim=-1) + + topk_ind: torch.Tensor + + topk_coords = outputs_coords_unact.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1])) + + topk_logits = outputs_logits.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1])) + + topk_memory = memory.gather(dim=1, \ + index=topk_ind.unsqueeze(-1).repeat(1, 1, memory.shape[-1])) + + return topk_memory, topk_logits, topk_coords + + + def forward(self, feats, targets=None): + # input projection and embedding + memory, spatial_shapes = self._get_encoder_input(feats) + + # prepare denoising training + if self.training and self.num_denoising > 0: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(targets, \ + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, ) + else: + denoising_logits, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + + init_ref_contents, init_ref_points_unact, enc_topk_bboxes_list, 
enc_topk_logits_list = \ + self._get_decoder_input(memory, spatial_shapes, denoising_logits, denoising_bbox_unact) + + # decoder + out_bboxes, out_logits = self.decoder( + init_ref_contents, + init_ref_points_unact, + memory, + spatial_shapes, + self.dec_bbox_head, + self.dec_score_head, + self.query_pos_head, + attn_mask=attn_mask) + + if self.training and dn_meta is not None: + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + + if self.training and self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) + out['enc_aux_outputs'] = self._set_aux_loss(enc_topk_logits_list, enc_topk_bboxes_list) + out['enc_meta'] = {'class_agnostic': self.query_select_method == 'agnostic'} + + if dn_meta is not None: + out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out['dn_meta'] = dn_meta + + return out + + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class, outputs_coord)] diff --git a/rtdetrv2_pytorch/src/zoo/rtdetr/utils.py b/rtdetrv2_pytorch/src/zoo/rtdetr/utils.py new file mode 100644 index 0000000..1601dd3 --- /dev/null +++ b/rtdetrv2_pytorch/src/zoo/rtdetr/utils.py @@ -0,0 +1,172 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. +""" + +import math +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def inverse_sigmoid(x: torch.Tensor, eps: float=1e-5) -> torch.Tensor: + x = x.clip(min=0., max=1.) 
def deformable_attention_core_func(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Sample ``value`` bilinearly at the given locations and blend with the weights.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2]
        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]

    Returns:
        output (Tensor): [bs, Length_{query}, C]
    """
    bs, _, n_head, c = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_value = value.split(level_sizes, dim=1)
    # grid_sample expects coordinates in [-1, 1].
    grids = 2 * sampling_locations - 1

    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        feature_map = per_level_value[level].flatten(2).permute(0, 2, 1)
        feature_map = feature_map.reshape(bs * n_head, c, h, w)
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        level_grid = grids[:, :, :, level].permute(0, 2, 1, 3, 4).flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampled.append(F.grid_sample(
            feature_map,
            level_grid,
            mode='bilinear',
            padding_mode='zeros',
            align_corners=False))

    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
    weights = attention_weights.permute(0, 2, 1, 3, 4)
    weights = weights.reshape(bs * n_head, 1, Len_q, n_levels * n_points)

    stacked = torch.stack(sampled, dim=-2).flatten(-2)
    output = (stacked * weights).sum(-1).reshape(bs, n_head * c, Len_q)

    return output.permute(0, 2, 1)
def deformable_attention_core_func_v2(
    value: torch.Tensor,
    value_spatial_shapes,
    sampling_locations: torch.Tensor,
    attention_weights: torch.Tensor,
    num_points_list: List[int],
    method='default'):
    """Multi-scale deformable attention with a per-level number of points.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2], one (h, w) per level
        sampling_locations (Tensor): [bs, query_length, n_head, sum(num_points_list), 2],
            (x, y) pairs; the code scales them as fractions of the map size
        attention_weights (Tensor): [bs, query_length, n_head, sum(num_points_list)]
        num_points_list (List[int]): number of sampling points for each level
        method (str): 'default' samples bilinearly with ``F.grid_sample``;
            'discrete' rounds each location to a single pixel and gathers it.

    Returns:
        output (Tensor): [bs, query_length, n_head * c]

    Raises:
        ValueError: if ``method`` is neither 'default' nor 'discrete'.
    """
    if method not in ('default', 'discrete'):
        raise ValueError(
            f"Unknown sampling method {method!r}; expected 'default' or 'discrete'.")

    bs, _, n_head, c = value.shape
    _, Len_q, _, _, _ = sampling_locations.shape

    split_shape = [h * w for h, w in value_spatial_shapes]
    # [bs, len, n_head, c] -> [bs * n_head, c, len], then one chunk per level
    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)

    if method == 'default':
        # grid_sample expects coordinates in [-1, 1]
        sampling_grids = 2 * sampling_locations - 1
    else:
        sampling_grids = sampling_locations

    # [bs, Lq, n_head, P_total, 2] -> [bs * n_head, Lq, P_total, 2]
    sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1)
    sampling_locations_list = sampling_grids.split(num_points_list, dim=-2)

    sampling_value_list = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        value_l = value_list[level].reshape(bs * n_head, c, h, w)
        sampling_grid_l: torch.Tensor = sampling_locations_list[level]
        n_points = num_points_list[level]

        if method == 'default':
            sampling_value_l = F.grid_sample(
                value_l,
                sampling_grid_l,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False)

        else:
            # Scale to pixel units and round to the nearest index (x, y order).
            sampling_coord = (
                sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5
            ).to(torch.int64)

            # Clamp each axis against its own extent. Clamping both with
            # ``h - 1`` is only right for square maps: it truncates x on wide
            # maps and lets x run out of bounds on tall ones.
            coord_x = sampling_coord[..., 0].clamp(0, w - 1).reshape(bs * n_head, Len_q * n_points)
            coord_y = sampling_coord[..., 1].clamp(0, h - 1).reshape(bs * n_head, Len_q * n_points)

            s_idx = torch.arange(bs * n_head, device=value.device).unsqueeze(-1).repeat(1, Len_q * n_points)
            # advanced indexing yields [bs * n_head, Lq * P, c]
            sampling_value_l = value_l[s_idx, :, coord_y, coord_x]
            sampling_value_l = sampling_value_l.permute(0, 2, 1).reshape(bs * n_head, c, Len_q, n_points)

        sampling_value_list.append(sampling_value_l)

    # [bs, Lq, n_head, P_total] -> [bs * n_head, 1, Lq, P_total]
    attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(bs * n_head, 1, Len_q, sum(num_points_list))
    weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
    output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)

    return output.permute(0, 2, 1)


def get_activation(act: str, inpace: bool=True):
    """Build an activation module from its name.

    Args:
        act: ``None`` (returns ``nn.Identity``), an ``nn.Module`` (returned
            unchanged), or a case-insensitive name: 'silu', 'swish', 'relu',
            'leaky_relu', 'gelu' or 'hardsigmoid'.
        inpace: value assigned to the module's ``inplace`` attribute when it
            has one (the spelling is kept for backward compatibility).

    Raises:
        RuntimeError: if ``act`` is a name that is not supported.
    """
    if act is None:
        return nn.Identity()

    if isinstance(act, nn.Module):
        return act

    factories = {
        'silu': nn.SiLU,
        'swish': nn.SiLU,
        'relu': nn.ReLU,
        'leaky_relu': nn.LeakyReLU,
        'gelu': nn.GELU,
        'hardsigmoid': nn.Hardsigmoid,
    }

    name = act.lower()
    if name not in factories:
        raise RuntimeError(
            f"Unsupported activation {act!r}; expected one of {sorted(factories)}.")

    m = factories[name]()

    if hasattr(m, 'inplace'):
        m.inplace = inpace

    return m
This eliminates "it works on my machine" issues. + +* **Step 1.1: Build and Run the Container** + + From the project's root directory, run `docker compose`. This will build the image based on the `Dockerfile` and start the service in the background. + + ```bash + docker compose up --build -d + ``` + +* **Step 1.2: Verify the Container is Running** + + Check that the container is up and running. Note its name for the next step. + ```bash + docker ps + ``` + +--- + +#### **2. Training & Evaluation (Using `docker attach`)** + +This method directly attaches your terminal to the container's main process. It's simple but requires careful handling to avoid terminating your session. + +* **Step 2.1: Attach to the Container** + + Attach your terminal to the running container, using the name noted in Step 1.2. You will be dropped into a bash shell. + + ```bash + docker attach CONTAINER_NAME + ``` + +* **Step 2.2: Run the Training Command** + + Now, *inside the attached shell*, run your training command. `torchrun` will automatically use the GPUs assigned to the container. **Do not run it in the background (`&`)**. + + ```bash + # Example for 4 GPUs assigned to the container + torchrun --nproc_per_node=4 --master-port=8989 \ + tools/train.py -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + --use-amp + ``` + +* **Step 2.3: Detach from the Session (IMPORTANT!)** + + With your training running, you can safely detach and leave it running. + + **WARNING:** **DO NOT PRESS `Ctrl+C`**. This will kill the training process and potentially the entire container. + + To safely detach, press the sequence: **`Ctrl+P`**, followed immediately by **`Ctrl+Q`**. + + You will return to your local terminal, and the container will continue running the training in the background. + +* **Step 2.4: Re-attach to Your Session** + + To check on your training progress, simply run the `docker attach` command again. You will see the live output from your training command.
+ + ```bash + docker attach CONTAINER_NAME + ``` + (Remember to detach with `Ctrl+P`, `Ctrl+Q` when you're done.) + +--- + +#### **3. Exporting & Inference** + +For tasks like exporting or running inference, which don't need to run for days, it's safer to use `docker exec` to open a new, separate shell. + +* **Step 3.1: Open a New Shell in the Container** + ```bash + docker exec -it CONTAINER_NAME bash + ``` + +* **Step 3.2: Run Export or Inference Commands** + Now, inside this new shell, run your commands. + ```bash + # Export to ONNX + python tools/export_onnx.py \ + -c configs/rtdetr/rtdetr_r50vd_6x_coco.yml \ + -r path/to/trained_checkpoint.pth \ + --check + ``` + + ``` + # Convert to TensorRT + bash tools/onnx2trt.sh /path/to/your/model.onnx + ``` + + ``` + # RUN TRT Inference + python references/deploy/rtdetrv2_tensorrt.py \ + --engine /path/to/your/model.trt \ + --image /path/to/your/image.jpg \ + --output /path/to/save/output.jpg \ + --threshold 0.5 + ``` + +### Utilities & Tips + +* **Visualize training with TensorBoard:** + * Use the standard port `6006` to avoid conflicts with training. + * Ensure the port `6006` is exposed in your `docker-compose.yml`. + + ```bash + # Inside the container + tensorboard --logdir=path/to/summary/ --host=0.0.0.0 --port=6006 + ``` + +* **Managing the Container Lifecycle:** + * **To temporarily stop** the container without deleting it (e.g., to pause training and resume later): + ```bash + docker compose stop + ``` + You can restart it later with `docker compose start`. + + * **To stop and completely remove** the container and network (add `-v` to also remove volumes): + ```bash + docker compose down + ``` \ No newline at end of file diff --git a/rtdetrv2_pytorch/tools/export_onnx.py b/rtdetrv2_pytorch/tools/export_onnx.py new file mode 100644 index 0000000..1586319 --- /dev/null +++ b/rtdetrv2_pytorch/tools/export_onnx.py @@ -0,0 +1,100 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
+""" + +import os +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) + +import torch +import torch.nn as nn + +from src.core import YAMLConfig, yaml_utils + + +def main(args, ): + """main + """ + update_dict = yaml_utils.parse_cli(args.update) if args.update else {} + update_dict.update({k: v for k, v in args.__dict__.items() \ + if k not in ['update', ] and v is not None}) + cfg = YAMLConfig(args.config, **update_dict) + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + if 'ema' in checkpoint: + state = checkpoint['ema']['module'] + else: + state = checkpoint['model'] + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + + else: + # raise AttributeError('Only support resume to load model.state_dict by now.') + print('not load model.state_dict, use default init state dict...') + + class Model(nn.Module): + def __init__(self, ) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + outputs = self.postprocessor(outputs, orig_target_sizes) + return outputs + + model = Model() + + data = torch.rand(1, 3, args.input_size, args.input_size) + size = torch.tensor([[args.input_size, args.input_size]]) + _ = model(data, size) + + dynamic_axes = { + 'images': {0: 'N', }, + 'orig_target_sizes': {0: 'N'} + } + + torch.onnx.export( + model, + (data, size), + args.output_file, + input_names=['images', 'orig_target_sizes'], + output_names=['labels', 'boxes', 'scores'], + dynamic_axes=dynamic_axes, + opset_version=16, + verbose=False, + do_constant_folding=True, + ) + + if args.check: + import onnx + onnx_model = onnx.load(args.output_file) + onnx.checker.check_model(onnx_model) + print('Check export onnx model done...') + + if args.simplify: + import onnx + import onnxsim + dynamic = True + # input_shapes = {'images': [1, 3, 
def main(onnx_path, engine_path, max_batchsize, opt_batchsize, min_batchsize, use_fp16=True, verbose=False)->None:
    """Convert an ONNX model to a serialized TensorRT engine.

    Args:
        onnx_path (str): Path to the input ONNX model.
        engine_path (str): Path to save the output TensorRT engine.
        max_batchsize (int): Largest batch size in the optimization profile.
        opt_batchsize (int): Batch size the engine is tuned for.
        min_batchsize (int): Smallest batch size in the optimization profile.
        use_fp16 (bool): Whether to use FP16 precision.
        verbose (bool): Whether to enable verbose logging.

    Raises:
        FileNotFoundError: if ``onnx_path`` does not exist.
        ValueError: if the batch sizes are not ``1 <= min <= opt <= max``.
        RuntimeError: if the ONNX file cannot be parsed or the build fails.
    """
    # Validate the cheap things first, before any TensorRT object is created.
    if not os.path.isfile(onnx_path):
        raise FileNotFoundError(f"ONNX file not found: {onnx_path}")

    if not 1 <= min_batchsize <= opt_batchsize <= max_batchsize:
        raise ValueError(
            "Batch sizes must satisfy 1 <= min <= opt <= max, got "
            f"min={min_batchsize}, opt={opt_batchsize}, max={max_batchsize}")

    logger = trt.Logger(trt.Logger.VERBOSE if verbose else trt.Logger.INFO)

    builder = trt.Builder(logger)
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(network_flags)

    parser = trt.OnnxParser(network, logger)

    print(f"[INFO] Loading ONNX file from {onnx_path}")
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            raise RuntimeError("Failed to parse ONNX file")

    # NOTE(review): `max_workspace_size`, `build_engine` and this preview
    # feature are deprecated in newer TensorRT releases - confirm against
    # the TensorRT version this tool is pinned to.
    config = builder.create_builder_config()
    config.set_preview_feature(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805, True)
    config.max_workspace_size = 1 << 30  # 1GB

    if use_fp16:
        if builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)
            print("[INFO] FP16 optimization enabled.")
        else:
            print("[WARNING] FP16 not supported on this platform. Proceeding with FP32.")

    profile = builder.create_optimization_profile()
    profile.set_shape(
        "images",
        min=(min_batchsize, 3, 640, 640),
        opt=(opt_batchsize, 3, 640, 640),
        max=(max_batchsize, 3, 640, 640))
    # The exporter gives `orig_target_sizes` the same dynamic batch axis as
    # `images`, so let it grow with the batch. The minimum stays at 1 so a
    # single broadcast size (the only shape accepted before) is still valid.
    profile.set_shape(
        "orig_target_sizes",
        min=(1, 2),
        opt=(opt_batchsize, 2),
        max=(max_batchsize, 2))
    config.add_optimization_profile(profile)

    print("[INFO] Building TensorRT engine...")
    engine = builder.build_engine(network, config)

    if engine is None:
        raise RuntimeError("Failed to build the engine.")

    print(f"[INFO] Saving engine to {engine_path}")
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())
    print("[INFO] Engine export complete.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert ONNX to TensorRT Engine")
    parser.add_argument("--onnx", "-i", type=str, required=True, help="Path to input ONNX model file")
    parser.add_argument("--saveEngine", "-o", type=str, default="model.engine", help="Path to output TensorRT engine file")
    parser.add_argument("--maxBatchSize", "-Mb", type=int, default=32, help="Maximum batch size for inference")
    parser.add_argument("--optBatchSize", "-ob", type=int, default=16, help="Optimal batch size for inference")
    parser.add_argument("--minBatchSize", "-mb", type=int, default=1, help="Minimum batch size for inference")
    # FP16 stays on by default; `--fp16` is kept so existing command lines
    # still parse, and `--no-fp16` is the only way to switch it off.
    parser.add_argument("--fp16", dest="fp16", default=True, action="store_true", help="Enable FP16 precision mode")
    parser.add_argument("--no-fp16", dest="fp16", action="store_false", help="Disable FP16 and build in FP32")
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    main(
        onnx_path=args.onnx,
        engine_path=args.saveEngine,
        max_batchsize=args.maxBatchSize,
        opt_batchsize=args.optBatchSize,
        min_batchsize=args.minBatchSize,
        use_fp16=args.fp16,
        verbose=args.verbose
    )
convert an ONNX model to a TensorRT engine using trtexec. +# This script automatically sets the output engine path based on the input ONNX file. + +# Exit immediately if a command exits with a non-zero status. +set -e + +# Check if an input file is provided. +if [ -z "$1" ]; then + echo "Error: No ONNX file provided." + echo "Usage: $0 /path/to/your/model.onnx" + exit 1 +fi + +ONNX_FILE=$1 +# Replace the .onnx extension with .trt for the output file. +ENGINE_FILE="${ONNX_FILE%.onnx}.trt" + +echo "==> Converting ONNX to TensorRT Engine <==" +echo " - Input ONNX: $ONNX_FILE" +echo " - Output TRT: $ENGINE_FILE" +echo " - Precision: FP16" +echo "==========================================" + +# Run the trtexec command. +# --fp16 enables 16-bit floating-point precision for faster inference. +# --verbose provides detailed output during the conversion process. +trtexec --onnx="$ONNX_FILE" \ + --saveEngine="$ENGINE_FILE" \ + --fp16 \ + --verbose + +echo "==========================================" +echo "✅ Successfully created TensorRT engine: $ENGINE_FILE" \ No newline at end of file diff --git a/rtdetrv2_pytorch/tools/run_profile.py b/rtdetrv2_pytorch/tools/run_profile.py new file mode 100644 index 0000000..bdcf989 --- /dev/null +++ b/rtdetrv2_pytorch/tools/run_profile.py @@ -0,0 +1,110 @@ +"""Copyright(c) 2023 lyuwenyu. All Rights Reserved. 
def _auto_scale_flops(flops: float):
    """Pick a display scale for a FLOP count (adapted from torch.profiler).

    Returns a ``(multiplier, suffix)`` pair such that ``flops * multiplier``
    is the value to print in front of ``suffix`` ("", "K", "M", "G", "T"
    or "P"). A non-positive count is reported unscaled instead of failing.
    """
    flop_headers = [
        "",
        "K",
        "M",
        "G",
        "T",
        "P",
    ]
    if flops <= 0:
        # log10 is undefined here; a model with no counted FLOPs is still valid.
        return (1.0, flop_headers[0])
    log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
    return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])


def profile_stats(
    model: nn.Module,
    data: Optional[Tensor]=None,
    shape: List[int]=[1, 3, 640, 640],
    verbose: bool=False
) -> Dict[str, Any]:
    """Count trainable parameters and profile one forward pass of ``model``.

    Args:
        model: module to profile; it is run in eval mode and switched back
            to training mode afterwards if it was training on entry.
        data: input passed to ``model``. When ``None``, a random tensor of
            ``shape`` is created with the dtype and device of the model's
            first parameter.
        shape: shape of the random input used when ``data`` is ``None``
            (read only, never mutated).
        verbose: print the profiler table and the totals.

    Returns:
        dict with ``n_parameters`` (trainable parameter count), ``n_flops``
        (FLOPs summed over profiler events, per active step) and ``info``
        (the profiler table as a string).
    """
    is_training = model.training

    model.train()
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    model.eval()

    if data is None:
        first_param = next(model.parameters())
        data = torch.rand(*shape, dtype=first_param.dtype, device=first_param.device)
        if verbose:
            print(first_param.device)

    wait = 0
    warmup = 1
    active = 1
    repeat = 1
    skip_first = 0
    try:
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            schedule=torch.profiler.schedule(
                wait=wait,
                warmup=warmup,
                active=active,
                repeat=repeat,
                skip_first=skip_first,
            ),
            with_flops=True,
        ) as p:
            n_step = skip_first + (wait + warmup + active) * repeat
            for _ in range(n_step):
                _ = model(data)
                p.step()
    finally:
        # Restore the caller's mode even when the forward pass raises.
        if is_training:
            model.train()

    statistics = p.key_averages()
    info = statistics.table(sort_by='self_cuda_time_total', row_limit=-1)
    num_flops = sum(event.flops for event in statistics if event.flops > 0) / active
    (flops_scale, flops_header) = _auto_scale_flops(num_flops)

    if verbose:
        print(info)
        print(f'Total number of trainable parameters: {num_params}')
        # Report the shape that was actually profiled, not the default one.
        print(f'Total number of flops: {num_flops * flops_scale:.3f}{flops_header} with {list(data.shape)}')

    return {'n_parameters': num_params, 'n_flops': num_flops, 'info': info}


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, required=True)
    parser.add_argument('-d', '--device', type=str, default='cuda:0', help='device',)
    parser.add_argument('-u', '--update', nargs='+', help='Update yaml config from command line.')
    args = parser.parse_args()

    update_dict = yaml_utils.parse_cli(args.update) if args.update else {}
    update_dict.update({k: v for k, v in args.__dict__.items() \
        if k not in ['update', ] and v is not None})
    cfg = YAMLConfig(args.config, **update_dict)
    model = cfg.model.to(args.device)

    profile_stats(model, verbose=True)
def main(args, ) -> None:
    """Run training, or evaluation when ``args.test_only`` is set.

    Args:
        args: parsed command-line namespace. Every attribute other than
            ``update`` whose value is not ``None`` is forwarded to
            ``YAMLConfig`` as an override, on top of the ``--update`` pairs.

    Raises:
        ValueError: if both ``args.tuning`` and ``args.resume`` are given.
    """
    # Reject an invalid combination before distributed state is created, and
    # with a real exception: an `assert` disappears under `python -O`.
    if args.tuning and args.resume:
        raise ValueError('Only support from_scrach or resume or tuning at one time')

    dist_utils.setup_distributed(args.print_rank, args.print_method, seed=args.seed)

    # `--update` is optional; guard it the same way the export and profile
    # tools do instead of handing `None` to the parser.
    update_dict = yaml_utils.parse_cli(args.update) if args.update else {}
    update_dict.update({k: v for k, v in args.__dict__.items() \
        if k not in ['update', ] and v is not None})

    cfg = YAMLConfig(args.config, **update_dict)
    print('cfg: ', cfg.__dict__)

    solver = TASKS[cfg.yaml_cfg['task']](cfg)

    if args.test_only:
        solver.val()
    else:
        solver.fit()

    dist_utils.cleanup()


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # priority 0
    parser.add_argument('-c', '--config', type=str, required=True)
    parser.add_argument('-r', '--resume', type=str, help='resume from checkpoint')
    parser.add_argument('-t', '--tuning', type=str, help='tuning from checkpoint')
    parser.add_argument('-d', '--device', type=str, help='device',)
    parser.add_argument('--seed', type=int, help='exp reproducibility')
    parser.add_argument('--use-amp', action='store_true', help='auto mixed precision training')
    parser.add_argument('--output-dir', type=str, help='output directoy')
    parser.add_argument('--summary-dir', type=str, help='tensorboard summry')
    parser.add_argument('--test-only', action='store_true', default=False,)

    # priority 1
    parser.add_argument('-u', '--update', nargs='+', help='update yaml config')

    # env
    parser.add_argument('--print-method', type=str, default='builtin', help='print method')
    parser.add_argument('--print-rank', type=int, default=0, help='print rank id')

    parser.add_argument('--local-rank', type=int, help='local rank id')
    args = parser.parse_args()

    main(args)