diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile index d9cf8cc771..8c9030335b 100644 --- a/.circleci/docker/Dockerfile +++ b/.circleci/docker/Dockerfile @@ -1,6 +1,6 @@ -ARG PYTORCH="1.8.1" -ARG CUDA="10.2" -ARG CUDNN="7" +ARG PYTORCH="1.7.1" +ARG CUDA="11.1" +ARG CUDNN="8" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel diff --git a/.circleci/test.yml b/.circleci/test.yml index 4c16e6290c..fe6f79e9c1 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -83,10 +83,10 @@ jobs: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1"] + enum: ["11.1"] cudnn: type: integer - default: 7 + default: 8 machine: image: ubuntu-2004-cuda-11.4:202110-01 # docker_layer_caching: true @@ -150,15 +150,15 @@ workflows: - 1.x - build_cpu: name: minimum_version_cpu - torch: 1.6.0 - torchvision: 0.7.0 - python: 3.6.9 # The lowest python 3.6.x version available on CircleCI images + torch: 1.7.1 + torchvision: 0.8.2 + python: 3.7.4 requires: - lint - build_cpu: name: maximum_version_cpu - torch: 1.12.1 - torchvision: 0.13.1 + torch: 1.13.0 + torchvision: 0.14.0 python: 3.9.0 requires: - minimum_version_cpu @@ -168,10 +168,10 @@ workflows: - maximum_version_cpu - build_cuda: name: mainstream_version_gpu - torch: 1.8.1 + torch: 1.7.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "10.2" + cuda: "11.1" requires: - hold merge_stage_test: @@ -181,10 +181,10 @@ workflows: jobs: - build_cuda: name: minimum_version_gpu - torch: 1.6.0 + torch: 1.7.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "10.1" + cuda: "11.1" filters: branches: only: diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index b6e9ba0c6b..a9ff9715d9 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - python-version: [3.6, 3.8, 
3.9] + python-version: [3.8, 3.9] torch: [1.8.1] include: - torch: 1.8.1 @@ -64,7 +64,7 @@ jobs: strategy: matrix: python-version: [3.7] - torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1] + torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.1, 1.13.0] include: - torch: 1.6.0 torchvision: 0.7.0 @@ -80,6 +80,8 @@ jobs: torchvision: 0.12.0 - torch: 1.12.1 torchvision: 0.13.1 + - torch: 1.13.0 + torchvision: 0.14.0 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0aa94b3646..3cae132d8a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf diff --git a/README.md b/README.md index 8bb0705c0d..b542740019 100644 --- a/README.md +++ b/README.md @@ -31,10 +31,25 @@ [👀Model Zoo](https://mmpose.readthedocs.io/en/1.x/model_zoo.html) | [📜Papers](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/algorithms.html) | [🆕Update News](https://mmpose.readthedocs.io/en/1.x/notes/changelog.html) | -[🤔Reporting Issues](https://github.com/open-mmlab/mmpose/issues/new/choose) +[🤔Reporting Issues](https://github.com/open-mmlab/mmpose/issues/new/choose) | +[🔥RTMPose](/projects/rtmpose/) +
+ + + + + + + + + + + +
+ ## Introduction English | [简体中文](README_CN.md) @@ -46,7 +61,9 @@ The master branch works with **PyTorch 1.6+**. https://user-images.githubusercontent.com/15977946/124654387-0fd3c500-ded1-11eb-84f6-24eeddbf4d91.mp4 -
+
+ +
Major Features - **Support diverse tasks** @@ -74,21 +91,37 @@ https://user-images.githubusercontent.com/15977946/124654387-0fd3c500-ded1-11eb- ## What's New -- 2022-10-14: MMPose [v1.0.0rc0](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0rc0) is released. Major updates include: +- We are excited to release **RTMPose**, a real-time pose estimation framework including: + + - A family of lightweight pose estimation models with state-of-the-art performance + - Inference APIs for Python, C++, C#, Java, etc. Easy to integrate into your applications and empower real-time stable pose estimation + - Cross-platform deployment with various backends + - A step-by-step guide to training and deploying your own models + + Check out our [project page](/projects/rtmpose/) and [technical report](https://arxiv.org/abs/2303.07399) for more information! + +![rtmpose_intro](https://user-images.githubusercontent.com/13503330/219269619-935499e5-bdd9-49ea-8104-3c7796dbd862.png) + +- Welcome to [*projects of MMPose*](/projects/README.md), where you can access the latest features of MMPose, and share your ideas and code with the community at once. Contribution to MMPose will be simple and smooth: + + - Provide an easy and agile way to integrate algorithms, features and applications into MMPose + - Allow flexible code structure and style; only need a short code review process + - Build individual projects with full power of MMPose but not bound up with heavy frameworks + - Check out new projects: + - [RTMPose](/projects/rtmpose/) + - [YOLOX-Pose (coming soon)](<>) + - [MMPose4AIGC (coming soon)](<>) + - Become a contributor and make MMPose greater. Start your journey from the [example project](/projects/example_project/) + +
- - Support 4 light-weight pose estimation algorithms - - SimCC (ECCV'22): [paper](https://doi.org/10.48550/arxiv.2107.03332) | [models](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/simcc/README.md) - - Debias-IPR (ICCV'21): [paper](https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf) | [models](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/integral_regression/README.md) - - IPR (ECCV'18): [paper](https://arxiv.org/abs/1711.08229) | [models](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/integral_regression/README.md) - - DSNT (ArXiv'18): [paper](https://arxiv.org/abs/1801.07372v2) | [models](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/integral_regression/README.md) - - Add [Colab tutorial](https://github.com/open-mmlab/mmpose/blob/1.x/demo/MMPose_Tutorial.ipynb) for MMPose v1.0 +- 2023-03-15: MMPose [v1.0.0rc1](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0rc1) is released. Major updates include: -- 2022-09-01: MMPose [v1.0.0b0](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0b0) is released! + - Release [RTMPose](/projects/rtmpose/), a high-performance real-time pose estimation framework based on MMPose + - Support [ViTPose](/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md) (NeurIPS'22), [CID](/configs/body_2d_keypoint/cid/coco/hrnet_coco.md) (CVPR'22) and [DEKR](/configs/body_2d_keypoint/dekr/) (CVPR'21) + - Add [*Inferencer*](/docs/en/user_guides/inference.md#out-of-the-box-inferencer), a convenient interface for inference and visualization - - This release introduced major refactoring to MMPose towards better performance, extensibility and user-friendliness. - - Built upon a brand new and flexible training & test engine, which is still in progress. Welcome to try according to [the documentation](https://mmpose.readthedocs.io/en/1.x/). 
- - There are BC-breaking changes. Please check [the migration tutorial](https://mmpose.readthedocs.io/en/1.x/migration.html). - - The beta and release candidate versions will last until the end of 2022, and during the release candidate, we will develop on the `1.x` branch. And we will still maintain 0.x version still at least the end of 2023. + See the full [release note](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0rc1) for more exciting updates brought by MMPose v1.0.0rc1! ## Installation diff --git a/README_CN.md b/README_CN.md index 859eaf00c8..fe7d539980 100644 --- a/README_CN.md +++ b/README_CN.md @@ -31,7 +31,8 @@ [👀模型库](https://mmpose.readthedocs.io/zh_CN/1.x/model_zoo.html) | [📜论文库](https://mmpose.readthedocs.io/zh_CN/1.x/model_zoo_papers/algorithms.html) | [🆕更新日志](https://mmpose.readthedocs.io/zh_CN/1.x/notes/changelog.html) | -[🤔报告问题](https://github.com/open-mmlab/mmpose/issues/new/choose) +[🤔报告问题](https://github.com/open-mmlab/mmpose/issues/new/choose) | +[🔥RTMPose](/projects/rtmpose/) @@ -74,24 +75,37 @@ https://user-images.githubusercontent.com/15977946/124654387-0fd3c500-ded1-11eb- ## 最新进展 -- 2022-10-14: MMPose [v1.0.0rc0](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0rc0) 已经发布. 
主要更新包括: +- 我们发布了 **RTMPose**,一个高性能实时多人姿态检测框架。具体包括: - - 增加了 4 个轻量化姿态估计算法 - - SimCC (ECCV'22): [论文链接](https://doi.org/10.48550/arxiv.2107.03332) | [模型](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/simcc/README.md) - - Debias-IPR (ICCV'21): [论文链接](https://openaccess.thecvf.com/content/ICCV2021/papers/Gu_Removing_the_Bias_of_Integral_Pose_Regression_ICCV_2021_paper.pdf) | [模型](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/integral_regression/README.md) - - IPR (ECCV'18): [论文链接](https://arxiv.org/abs/1711.08229) | [模型](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/integral_regression/README.md) - - DSNT (ArXiv'18): [论文链接](https://arxiv.org/abs/1801.07372v2) | [模型](https://github.com/open-mmlab/mmpose/blob/1.x/configs/body_2d_keypoint/integral_regression/README.md) - - 增加 MMPose 1.0 [Colab 教程](https://github.com/open-mmlab/mmpose/blob/1.x/demo/MMPose_Tutorial.ipynb) + - 一组新的轻量化姿态估计模型,在不同算力条件下达到 SOTA 的精度性能 + - 支持多语言(Python, C++, C#, Java, etc)的模型推理接口,可以轻松集成到您的应用中以支持实时、稳定的姿态估计 + - 跨平台,多后端的模型部署支持 + - 提供极易上手的教程,帮助您训练和部署自己的模型 -- 2022-09-01: MMPose [v1.0.0b0](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0b0) 已经发布. 
主要更新包括: + 更多信息敬请参阅 RTMPose [项目主页](/projects/rtmpose/) 和 [技术报告](https://arxiv.org/abs/2303.07399) - - 对 MMPose 进行了重大重构,旨在提升算法库性能和可扩展性,并使其更容易上手。 - - 基于一个全新的,可扩展性强的训练和测试引擎,但目前仍在开发中。欢迎根据[文档](https://mmpose.readthedocs.io/zh_CN/1.x/)进行试用。 - - 新版本中存在一些与旧版本不兼容的修改。请查看[迁移文档](https://mmpose.readthedocs.io/zh_CN/1.x/migration.html)来详细了解这些变动。 - - 新版本的公测将持续到 2022 年末,在此期间,我们将基于 `1.x` 分支进行更新,不会合入到 `master` 分支。另外,至少 - 到 2023 年末,我们会保持对 0.x 版本的维护。 +![rtmpose_intro](https://user-images.githubusercontent.com/13503330/219269619-935499e5-bdd9-49ea-8104-3c7796dbd862.png) -发布历史和更新细节请参考 [更新日志](https://mmpose.readthedocs.io/zh_CN/1.x/notes/changelog.html) +- 欢迎使用 [*MMPose 项目*](/projects/README.md)。在这里,您可以发现 MMPose 中的最新功能和算法,并且可以通过最快的方式与社区分享自己的创意和代码实现。向 MMPose 中添加新功能从此变得简单丝滑: + + - 提供了一种简单迅捷的方式,将新的算法、功能和应用添加到 MMPose 中 + - 更灵活的代码结构和风格,更少的限制,更简短的代码审核流程 + - 通过独立项目的形式,利用 MMPose 的强大功能,同时不被代码框架所束缚 + - 最新添加的项目包括: + - [RTMPose](/projects/rtmpose/) + - [YOLOX-Pose (coming soon)](<>) + - [MMPose4AIGC (coming soon)](<>) + - 从简单的 [示例项目](/projects/example_project/) 开启您的 MMPose 代码贡献者之旅吧,让我们共同打造更好用的 MMPose! + +
+ +- 2023-03-15: MMPose [v1.0.0rc1](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0rc1) 正式发布了,主要更新包括: + + - 发布了 [RTMPose](/projects/rtmpose/),一个高性能实时多人姿态估计算法框架 + - 支持了多个新算法: [ViTPose](/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md) (NeurIPS'22), [CID](/configs/body_2d_keypoint/cid/coco/hrnet_coco.md) (CVPR'22) 和 [DEKR](/configs/body_2d_keypoint/dekr/) (CVPR'21) + - 增加了 [*Inferencer*](/docs/en/user_guides/inference.md#out-of-the-box-inferencer),一个非常便捷的模型推理接口,通过 1 行代码完成模型选择、权重加载、模型推理和结果可视化。 + + 请查看完整的 [版本说明](https://github.com/open-mmlab/mmpose/releases/tag/v1.0.0rc1) 以了解更多 MMPose v1.0.0rc1 带来的更新! ## 安装 @@ -279,10 +293,11 @@ MMPose 是一款由不同学校和公司共同贡献的开源项目。我们感 ## 欢迎加入 OpenMMLab 社区 -扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),联络 OpenMMLab [官方微信小助手](/docs/en/imgs/wechat_assistant_qrcode.png)或加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=GJP18SjI) +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),联络 OpenMMLab [官方微信小助手](https://user-images.githubusercontent.com/25839884/205872898-e2e6009d-c6bb-4d27-8d07-117e697a3da8.jpg)或加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=K0QI8ByU)
- +
我们会在 OpenMMLab 社区为大家 diff --git a/configs/_base_/datasets/300w.py b/configs/_base_/datasets/300w.py index 10c343a2ad..2c3728da1d 100644 --- a/configs/_base_/datasets/300w.py +++ b/configs/_base_/datasets/300w.py @@ -11,373 +11,123 @@ homepage='https://ibug.doc.ic.ac.uk/resources/300-W/', ), keypoint_info={ - 0: - dict( - name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-16'), - 1: - dict( - name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-15'), - 2: - dict( - name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-14'), - 3: - dict( - name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-13'), - 4: - dict( - name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-12'), - 5: - dict( - name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-11'), - 6: - dict( - name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-10'), - 7: - dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-9'), - 8: - dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap=''), - 9: - dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-7'), + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-16'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-15'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-14'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-13'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-12'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-11'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-10'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-9'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap=''), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-7'), 10: - dict( - name='kpt-10', id=10, color=[255, 255, 255], type='', - swap='kpt-6'), + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', 
swap='kpt-6'), 11: - dict( - name='kpt-11', id=11, color=[255, 255, 255], type='', - swap='kpt-5'), + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-5'), 12: - dict( - name='kpt-12', id=12, color=[255, 255, 255], type='', - swap='kpt-4'), + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-4'), 13: - dict( - name='kpt-13', id=13, color=[255, 255, 255], type='', - swap='kpt-3'), + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-3'), 14: - dict( - name='kpt-14', id=14, color=[255, 255, 255], type='', - swap='kpt-2'), + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-2'), 15: - dict( - name='kpt-15', id=15, color=[255, 255, 255], type='', - swap='kpt-1'), + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-1'), 16: - dict( - name='kpt-16', id=16, color=[255, 255, 255], type='', - swap='kpt-0'), + dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-0'), 17: - dict( - name='kpt-17', - id=17, - color=[255, 255, 255], - type='', - swap='kpt-26'), + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-26'), 18: - dict( - name='kpt-18', - id=18, - color=[255, 255, 255], - type='', - swap='kpt-25'), + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-25'), 19: - dict( - name='kpt-19', - id=19, - color=[255, 255, 255], - type='', - swap='kpt-24'), + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-24'), 20: - dict( - name='kpt-20', - id=20, - color=[255, 255, 255], - type='', - swap='kpt-23'), + dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-23'), 21: - dict( - name='kpt-21', - id=21, - color=[255, 255, 255], - type='', - swap='kpt-22'), + dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-22'), 22: - dict( - name='kpt-22', - id=22, - color=[255, 255, 255], - type='', - swap='kpt-21'), + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-21'), 23: - dict( - name='kpt-23', - id=23, - color=[255, 
255, 255], - type='', - swap='kpt-20'), + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-20'), 24: - dict( - name='kpt-24', - id=24, - color=[255, 255, 255], - type='', - swap='kpt-19'), + dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-19'), 25: - dict( - name='kpt-25', - id=25, - color=[255, 255, 255], - type='', - swap='kpt-18'), + dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-18'), 26: - dict( - name='kpt-26', - id=26, - color=[255, 255, 255], - type='', - swap='kpt-17'), - 27: - dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), - 28: - dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap=''), - 29: - dict(name='kpt-29', id=29, color=[255, 255, 255], type='', swap=''), - 30: - dict(name='kpt-30', id=30, color=[255, 255, 255], type='', swap=''), + dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-17'), + 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap=''), + 29: dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap=''), + 30: dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap=''), 31: - dict( - name='kpt-31', - id=31, - color=[255, 255, 255], - type='', - swap='kpt-35'), + dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-35'), 32: - dict( - name='kpt-32', - id=32, - color=[255, 255, 255], - type='', - swap='kpt-34'), - 33: - dict(name='kpt-33', id=33, color=[255, 255, 255], type='', swap=''), + dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-34'), + 33: dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap=''), 34: - dict( - name='kpt-34', - id=34, - color=[255, 255, 255], - type='', - swap='kpt-32'), + dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-32'), 35: - dict( - name='kpt-35', - id=35, - color=[255, 255, 255], - type='', - swap='kpt-31'), + dict(name='kpt-35', id=35, color=[255, 0, 0], type='', 
swap='kpt-31'), 36: - dict( - name='kpt-36', - id=36, - color=[255, 255, 255], - type='', - swap='kpt-45'), + dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-45'), 37: - dict( - name='kpt-37', - id=37, - color=[255, 255, 255], - type='', - swap='kpt-44'), + dict(name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-44'), 38: - dict( - name='kpt-38', - id=38, - color=[255, 255, 255], - type='', - swap='kpt-43'), + dict(name='kpt-38', id=38, color=[255, 0, 0], type='', swap='kpt-43'), 39: - dict( - name='kpt-39', - id=39, - color=[255, 255, 255], - type='', - swap='kpt-42'), + dict(name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-42'), 40: - dict( - name='kpt-40', - id=40, - color=[255, 255, 255], - type='', - swap='kpt-47'), - 41: - dict( - name='kpt-41', - id=41, - color=[255, 255, 255], - type='', - swap='kpt-46'), - 42: - dict( - name='kpt-42', - id=42, - color=[255, 255, 255], - type='', - swap='kpt-39'), - 43: - dict( - name='kpt-43', - id=43, - color=[255, 255, 255], - type='', - swap='kpt-38'), - 44: - dict( - name='kpt-44', - id=44, - color=[255, 255, 255], - type='', - swap='kpt-37'), - 45: - dict( - name='kpt-45', - id=45, - color=[255, 255, 255], - type='', - swap='kpt-36'), - 46: - dict( - name='kpt-46', - id=46, - color=[255, 255, 255], - type='', - swap='kpt-41'), - 47: - dict( - name='kpt-47', - id=47, - color=[255, 255, 255], - type='', - swap='kpt-40'), - 48: - dict( - name='kpt-48', - id=48, - color=[255, 255, 255], - type='', - swap='kpt-54'), - 49: - dict( - name='kpt-49', - id=49, - color=[255, 255, 255], - type='', - swap='kpt-53'), - 50: - dict( - name='kpt-50', - id=50, - color=[255, 255, 255], - type='', - swap='kpt-52'), - 51: - dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), - 52: - dict( - name='kpt-52', - id=52, - color=[255, 255, 255], - type='', - swap='kpt-50'), - 53: - dict( - name='kpt-53', - id=53, - color=[255, 255, 255], - type='', - swap='kpt-49'), - 54: - dict( - 
name='kpt-54', - id=54, - color=[255, 255, 255], - type='', - swap='kpt-48'), - 55: - dict( - name='kpt-55', - id=55, - color=[255, 255, 255], - type='', - swap='kpt-59'), - 56: - dict( - name='kpt-56', - id=56, - color=[255, 255, 255], - type='', - swap='kpt-58'), - 57: - dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), - 58: - dict( - name='kpt-58', - id=58, - color=[255, 255, 255], - type='', - swap='kpt-56'), - 59: - dict( - name='kpt-59', - id=59, - color=[255, 255, 255], - type='', - swap='kpt-55'), - 60: - dict( - name='kpt-60', - id=60, - color=[255, 255, 255], - type='', - swap='kpt-64'), - 61: - dict( - name='kpt-61', - id=61, - color=[255, 255, 255], - type='', - swap='kpt-63'), - 62: - dict(name='kpt-62', id=62, color=[255, 255, 255], type='', swap=''), - 63: - dict( - name='kpt-63', - id=63, - color=[255, 255, 255], - type='', - swap='kpt-61'), - 64: - dict( - name='kpt-64', - id=64, - color=[255, 255, 255], - type='', - swap='kpt-60'), - 65: - dict( - name='kpt-65', - id=65, - color=[255, 255, 255], - type='', - swap='kpt-67'), - 66: - dict(name='kpt-66', id=66, color=[255, 255, 255], type='', swap=''), - 67: - dict( - name='kpt-67', - id=67, - color=[255, 255, 255], - type='', - swap='kpt-65'), + dict(name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-47'), + 41: dict( + name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-46'), + 42: dict( + name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-39'), + 43: dict( + name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-38'), + 44: dict( + name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-37'), + 45: dict( + name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-36'), + 46: dict( + name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-41'), + 47: dict( + name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-40'), + 48: dict( + name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-54'), + 49: dict( + name='kpt-49', id=49, 
color=[255, 0, 0], type='', swap='kpt-53'), + 50: dict( + name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-52'), + 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict( + name='kpt-52', id=52, color=[255, 0, 0], type='', swap='kpt-50'), + 53: dict( + name='kpt-53', id=53, color=[255, 0, 0], type='', swap='kpt-49'), + 54: dict( + name='kpt-54', id=54, color=[255, 0, 0], type='', swap='kpt-48'), + 55: dict( + name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'), + 56: dict( + name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'), + 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'), + 59: dict( + name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'), + 60: dict( + name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-64'), + 61: dict( + name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-63'), + 62: dict(name='kpt-62', id=62, color=[255, 0, 0], type='', swap=''), + 63: dict( + name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-61'), + 64: dict( + name='kpt-64', id=64, color=[255, 0, 0], type='', swap='kpt-60'), + 65: dict( + name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-67'), + 66: dict(name='kpt-66', id=66, color=[255, 0, 0], type='', swap=''), + 67: dict( + name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-65'), }, skeleton_info={}, joint_weights=[1.] 
* 68, diff --git a/configs/_base_/datasets/aflw.py b/configs/_base_/datasets/aflw.py index bf534cbb75..cf5e10964d 100644 --- a/configs/_base_/datasets/aflw.py +++ b/configs/_base_/datasets/aflw.py @@ -13,70 +13,31 @@ 'team-bischof/lrs/downloads/aflw/', ), keypoint_info={ - 0: - dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-5'), - 1: - dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-4'), - 2: - dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), - 3: - dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), - 4: - dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-1'), - 5: - dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-0'), - 6: - dict( - name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-11'), - 7: - dict( - name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-10'), - 8: - dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), - 9: - dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-5'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-4'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-3'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-2'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-1'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-0'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-11'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-10'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-9'), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-8'), 10: - dict( - name='kpt-10', id=10, color=[255, 255, 255], type='', - swap='kpt-7'), + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-7'), 11: - dict( - name='kpt-11', id=11, 
color=[255, 255, 255], type='', - swap='kpt-6'), + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-6'), 12: - dict( - name='kpt-12', - id=12, - color=[255, 255, 255], - type='', - swap='kpt-14'), - 13: - dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap=''), + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-14'), + 13: dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap=''), 14: - dict( - name='kpt-14', - id=14, - color=[255, 255, 255], - type='', - swap='kpt-12'), + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-12'), 15: - dict( - name='kpt-15', - id=15, - color=[255, 255, 255], - type='', - swap='kpt-17'), - 16: - dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'), + 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''), 17: - dict( - name='kpt-17', - id=17, - color=[255, 255, 255], - type='', - swap='kpt-15'), - 18: - dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='') + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'), + 18: dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='') }, skeleton_info={}, joint_weights=[1.] 
* 19, diff --git a/configs/_base_/datasets/coco_aic.py b/configs/_base_/datasets/coco_aic.py new file mode 100644 index 0000000000..a084247468 --- /dev/null +++ b/configs/_base_/datasets/coco_aic.py @@ -0,0 +1,205 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=[ + dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + ], + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + 
color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='head_top', + id=17, + color=[51, 153, 255], + type='upper', + swap=''), + 18: + dict(name='neck', id=18, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + 
dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5, 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.026, 0.026 + ]) diff --git a/configs/_base_/datasets/coco_wholebody_face.py b/configs/_base_/datasets/coco_wholebody_face.py index 7c9ee3350e..a3fe1e5b33 100644 --- a/configs/_base_/datasets/coco_wholebody_face.py +++ b/configs/_base_/datasets/coco_wholebody_face.py @@ -12,425 +12,131 @@ ), keypoint_info={ 0: - dict( - name='face-0', - id=0, - color=[255, 255, 255], - type='', - swap='face-16'), + dict(name='face-0', id=0, color=[255, 0, 0], type='', swap='face-16'), 1: - dict( - name='face-1', - id=1, - color=[255, 255, 255], - type='', - swap='face-15'), + dict(name='face-1', id=1, color=[255, 0, 0], type='', swap='face-15'), 2: - dict( - name='face-2', - id=2, - color=[255, 255, 255], - type='', - swap='face-14'), + dict(name='face-2', id=2, color=[255, 0, 0], type='', swap='face-14'), 3: - dict( - name='face-3', - id=3, - color=[255, 255, 255], - type='', - swap='face-13'), + dict(name='face-3', id=3, color=[255, 0, 0], type='', swap='face-13'), 4: - dict( - name='face-4', - id=4, - color=[255, 255, 255], - type='', - swap='face-12'), + dict(name='face-4', id=4, color=[255, 0, 0], type='', swap='face-12'), 5: - dict( - name='face-5', - id=5, - color=[255, 255, 255], - type='', - swap='face-11'), + 
dict(name='face-5', id=5, color=[255, 0, 0], type='', swap='face-11'), 6: - dict( - name='face-6', - id=6, - color=[255, 255, 255], - type='', - swap='face-10'), + dict(name='face-6', id=6, color=[255, 0, 0], type='', swap='face-10'), 7: - dict( - name='face-7', id=7, color=[255, 255, 255], type='', - swap='face-9'), - 8: - dict(name='face-8', id=8, color=[255, 255, 255], type='', swap=''), + dict(name='face-7', id=7, color=[255, 0, 0], type='', swap='face-9'), + 8: dict(name='face-8', id=8, color=[255, 0, 0], type='', swap=''), 9: - dict( - name='face-9', id=9, color=[255, 255, 255], type='', - swap='face-7'), + dict(name='face-9', id=9, color=[255, 0, 0], type='', swap='face-7'), 10: - dict( - name='face-10', - id=10, - color=[255, 255, 255], - type='', - swap='face-6'), + dict(name='face-10', id=10, color=[255, 0, 0], type='', swap='face-6'), 11: - dict( - name='face-11', - id=11, - color=[255, 255, 255], - type='', - swap='face-5'), + dict(name='face-11', id=11, color=[255, 0, 0], type='', swap='face-5'), 12: - dict( - name='face-12', - id=12, - color=[255, 255, 255], - type='', - swap='face-4'), + dict(name='face-12', id=12, color=[255, 0, 0], type='', swap='face-4'), 13: - dict( - name='face-13', - id=13, - color=[255, 255, 255], - type='', - swap='face-3'), + dict(name='face-13', id=13, color=[255, 0, 0], type='', swap='face-3'), 14: - dict( - name='face-14', - id=14, - color=[255, 255, 255], - type='', - swap='face-2'), + dict(name='face-14', id=14, color=[255, 0, 0], type='', swap='face-2'), 15: - dict( - name='face-15', - id=15, - color=[255, 255, 255], - type='', - swap='face-1'), + dict(name='face-15', id=15, color=[255, 0, 0], type='', swap='face-1'), 16: - dict( - name='face-16', - id=16, - color=[255, 255, 255], - type='', - swap='face-0'), - 17: - dict( - name='face-17', - id=17, - color=[255, 255, 255], - type='', - swap='face-26'), - 18: - dict( - name='face-18', - id=18, - color=[255, 255, 255], - type='', - swap='face-25'), - 19: - dict( - 
name='face-19', - id=19, - color=[255, 255, 255], - type='', - swap='face-24'), - 20: - dict( - name='face-20', - id=20, - color=[255, 255, 255], - type='', - swap='face-23'), - 21: - dict( - name='face-21', - id=21, - color=[255, 255, 255], - type='', - swap='face-22'), - 22: - dict( - name='face-22', - id=22, - color=[255, 255, 255], - type='', - swap='face-21'), - 23: - dict( - name='face-23', - id=23, - color=[255, 255, 255], - type='', - swap='face-20'), - 24: - dict( - name='face-24', - id=24, - color=[255, 255, 255], - type='', - swap='face-19'), - 25: - dict( - name='face-25', - id=25, - color=[255, 255, 255], - type='', - swap='face-18'), - 26: - dict( - name='face-26', - id=26, - color=[255, 255, 255], - type='', - swap='face-17'), - 27: - dict(name='face-27', id=27, color=[255, 255, 255], type='', swap=''), - 28: - dict(name='face-28', id=28, color=[255, 255, 255], type='', swap=''), - 29: - dict(name='face-29', id=29, color=[255, 255, 255], type='', swap=''), - 30: - dict(name='face-30', id=30, color=[255, 255, 255], type='', swap=''), - 31: - dict( - name='face-31', - id=31, - color=[255, 255, 255], - type='', - swap='face-35'), - 32: - dict( - name='face-32', - id=32, - color=[255, 255, 255], - type='', - swap='face-34'), - 33: - dict(name='face-33', id=33, color=[255, 255, 255], type='', swap=''), - 34: - dict( - name='face-34', - id=34, - color=[255, 255, 255], - type='', - swap='face-32'), - 35: - dict( - name='face-35', - id=35, - color=[255, 255, 255], - type='', - swap='face-31'), - 36: - dict( - name='face-36', - id=36, - color=[255, 255, 255], - type='', - swap='face-45'), - 37: - dict( - name='face-37', - id=37, - color=[255, 255, 255], - type='', - swap='face-44'), - 38: - dict( - name='face-38', - id=38, - color=[255, 255, 255], - type='', - swap='face-43'), - 39: - dict( - name='face-39', - id=39, - color=[255, 255, 255], - type='', - swap='face-42'), - 40: - dict( - name='face-40', - id=40, - color=[255, 255, 255], - type='', - 
swap='face-47'), - 41: - dict( - name='face-41', - id=41, - color=[255, 255, 255], - type='', - swap='face-46'), - 42: - dict( - name='face-42', - id=42, - color=[255, 255, 255], - type='', - swap='face-39'), - 43: - dict( - name='face-43', - id=43, - color=[255, 255, 255], - type='', - swap='face-38'), - 44: - dict( - name='face-44', - id=44, - color=[255, 255, 255], - type='', - swap='face-37'), - 45: - dict( - name='face-45', - id=45, - color=[255, 255, 255], - type='', - swap='face-36'), - 46: - dict( - name='face-46', - id=46, - color=[255, 255, 255], - type='', - swap='face-41'), - 47: - dict( - name='face-47', - id=47, - color=[255, 255, 255], - type='', - swap='face-40'), - 48: - dict( - name='face-48', - id=48, - color=[255, 255, 255], - type='', - swap='face-54'), - 49: - dict( - name='face-49', - id=49, - color=[255, 255, 255], - type='', - swap='face-53'), - 50: - dict( - name='face-50', - id=50, - color=[255, 255, 255], - type='', - swap='face-52'), - 51: - dict(name='face-51', id=52, color=[255, 255, 255], type='', swap=''), - 52: - dict( - name='face-52', - id=52, - color=[255, 255, 255], - type='', - swap='face-50'), - 53: - dict( - name='face-53', - id=53, - color=[255, 255, 255], - type='', - swap='face-49'), - 54: - dict( - name='face-54', - id=54, - color=[255, 255, 255], - type='', - swap='face-48'), - 55: - dict( - name='face-55', - id=55, - color=[255, 255, 255], - type='', - swap='face-59'), - 56: - dict( - name='face-56', - id=56, - color=[255, 255, 255], - type='', - swap='face-58'), - 57: - dict(name='face-57', id=57, color=[255, 255, 255], type='', swap=''), - 58: - dict( - name='face-58', - id=58, - color=[255, 255, 255], - type='', - swap='face-56'), - 59: - dict( - name='face-59', - id=59, - color=[255, 255, 255], - type='', - swap='face-55'), - 60: - dict( - name='face-60', - id=60, - color=[255, 255, 255], - type='', - swap='face-64'), - 61: - dict( - name='face-61', - id=61, - color=[255, 255, 255], - type='', - swap='face-63'), - 
62: - dict(name='face-62', id=62, color=[255, 255, 255], type='', swap=''), - 63: - dict( - name='face-63', - id=63, - color=[255, 255, 255], - type='', - swap='face-61'), - 64: - dict( - name='face-64', - id=64, - color=[255, 255, 255], - type='', - swap='face-60'), - 65: - dict( - name='face-65', - id=65, - color=[255, 255, 255], - type='', - swap='face-67'), - 66: - dict(name='face-66', id=66, color=[255, 255, 255], type='', swap=''), - 67: - dict( - name='face-67', - id=67, - color=[255, 255, 255], - type='', - swap='face-65') + dict(name='face-16', id=16, color=[255, 0, 0], type='', swap='face-0'), + 17: dict( + name='face-17', id=17, color=[255, 0, 0], type='', swap='face-26'), + 18: dict( + name='face-18', id=18, color=[255, 0, 0], type='', swap='face-25'), + 19: dict( + name='face-19', id=19, color=[255, 0, 0], type='', swap='face-24'), + 20: dict( + name='face-20', id=20, color=[255, 0, 0], type='', swap='face-23'), + 21: dict( + name='face-21', id=21, color=[255, 0, 0], type='', swap='face-22'), + 22: dict( + name='face-22', id=22, color=[255, 0, 0], type='', swap='face-21'), + 23: dict( + name='face-23', id=23, color=[255, 0, 0], type='', swap='face-20'), + 24: dict( + name='face-24', id=24, color=[255, 0, 0], type='', swap='face-19'), + 25: dict( + name='face-25', id=25, color=[255, 0, 0], type='', swap='face-18'), + 26: dict( + name='face-26', id=26, color=[255, 0, 0], type='', swap='face-17'), + 27: dict(name='face-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='face-28', id=28, color=[255, 0, 0], type='', swap=''), + 29: dict(name='face-29', id=29, color=[255, 0, 0], type='', swap=''), + 30: dict(name='face-30', id=30, color=[255, 0, 0], type='', swap=''), + 31: dict( + name='face-31', id=31, color=[255, 0, 0], type='', swap='face-35'), + 32: dict( + name='face-32', id=32, color=[255, 0, 0], type='', swap='face-34'), + 33: dict(name='face-33', id=33, color=[255, 0, 0], type='', swap=''), + 34: dict( + name='face-34', id=34, 
color=[255, 0, 0], type='', swap='face-32'), + 35: dict( + name='face-35', id=35, color=[255, 0, 0], type='', swap='face-31'), + 36: dict( + name='face-36', id=36, color=[255, 0, 0], type='', swap='face-45'), + 37: dict( + name='face-37', id=37, color=[255, 0, 0], type='', swap='face-44'), + 38: dict( + name='face-38', id=38, color=[255, 0, 0], type='', swap='face-43'), + 39: dict( + name='face-39', id=39, color=[255, 0, 0], type='', swap='face-42'), + 40: dict( + name='face-40', id=40, color=[255, 0, 0], type='', swap='face-47'), + 41: dict( + name='face-41', id=41, color=[255, 0, 0], type='', swap='face-46'), + 42: dict( + name='face-42', id=42, color=[255, 0, 0], type='', swap='face-39'), + 43: dict( + name='face-43', id=43, color=[255, 0, 0], type='', swap='face-38'), + 44: dict( + name='face-44', id=44, color=[255, 0, 0], type='', swap='face-37'), + 45: dict( + name='face-45', id=45, color=[255, 0, 0], type='', swap='face-36'), + 46: dict( + name='face-46', id=46, color=[255, 0, 0], type='', swap='face-41'), + 47: dict( + name='face-47', id=47, color=[255, 0, 0], type='', swap='face-40'), + 48: dict( + name='face-48', id=48, color=[255, 0, 0], type='', swap='face-54'), + 49: dict( + name='face-49', id=49, color=[255, 0, 0], type='', swap='face-53'), + 50: dict( + name='face-50', id=50, color=[255, 0, 0], type='', swap='face-52'), + 51: dict(name='face-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict( + name='face-52', id=52, color=[255, 0, 0], type='', swap='face-50'), + 53: dict( + name='face-53', id=53, color=[255, 0, 0], type='', swap='face-49'), + 54: dict( + name='face-54', id=54, color=[255, 0, 0], type='', swap='face-48'), + 55: dict( + name='face-55', id=55, color=[255, 0, 0], type='', swap='face-59'), + 56: dict( + name='face-56', id=56, color=[255, 0, 0], type='', swap='face-58'), + 57: dict(name='face-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='face-58', id=58, color=[255, 0, 0], type='', swap='face-56'), + 59:
dict( + name='face-59', id=59, color=[255, 0, 0], type='', swap='face-55'), + 60: dict( + name='face-60', id=60, color=[255, 0, 0], type='', swap='face-64'), + 61: dict( + name='face-61', id=61, color=[255, 0, 0], type='', swap='face-63'), + 62: dict(name='face-62', id=62, color=[255, 0, 0], type='', swap=''), + 63: dict( + name='face-63', id=63, color=[255, 0, 0], type='', swap='face-61'), + 64: dict( + name='face-64', id=64, color=[255, 0, 0], type='', swap='face-60'), + 65: dict( + name='face-65', id=65, color=[255, 0, 0], type='', swap='face-67'), + 66: dict(name='face-66', id=66, color=[255, 0, 0], type='', swap=''), + 67: dict( + name='face-67', id=67, color=[255, 0, 0], type='', swap='face-65') }, skeleton_info={}, joint_weights=[1.] * 68, diff --git a/configs/_base_/datasets/cofw.py b/configs/_base_/datasets/cofw.py index 2fb7ad2f8d..d528bf2f2f 100644 --- a/configs/_base_/datasets/cofw.py +++ b/configs/_base_/datasets/cofw.py @@ -10,124 +10,47 @@ homepage='http://www.vision.caltech.edu/xpburgos/ICCV13/', ), keypoint_info={ - 0: - dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-1'), - 1: - dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-0'), - 2: - dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), - 3: - dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), - 4: - dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-6'), - 5: - dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-7'), - 6: - dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-4'), - 7: - dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-5'), - 8: - dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), - 9: - dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-1'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-0'), + 2: 
dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-3'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-2'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-6'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-7'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-4'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-5'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-9'), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-8'), 10: - dict( - name='kpt-10', - id=10, - color=[255, 255, 255], - type='', - swap='kpt-11'), + dict(name='kpt-10', id=10, color=[255, 0, 0], type='', swap='kpt-11'), 11: - dict( - name='kpt-11', - id=11, - color=[255, 255, 255], - type='', - swap='kpt-10'), + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-10'), 12: - dict( - name='kpt-12', - id=12, - color=[255, 255, 255], - type='', - swap='kpt-14'), + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-14'), 13: - dict( - name='kpt-13', - id=13, - color=[255, 255, 255], - type='', - swap='kpt-15'), + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-15'), 14: - dict( - name='kpt-14', - id=14, - color=[255, 255, 255], - type='', - swap='kpt-12'), + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-12'), 15: - dict( - name='kpt-15', - id=15, - color=[255, 255, 255], - type='', - swap='kpt-13'), + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-13'), 16: - dict( - name='kpt-16', - id=16, - color=[255, 255, 255], - type='', - swap='kpt-17'), + dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap='kpt-17'), 17: - dict( - name='kpt-17', - id=17, - color=[255, 255, 255], - type='', - swap='kpt-16'), + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-16'), 18: - dict( - name='kpt-18', - id=18, - color=[255, 255, 255], - type='', - swap='kpt-19'), + 
dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-19'), 19: - dict( - name='kpt-19', - id=19, - color=[255, 255, 255], - type='', - swap='kpt-18'), - 20: - dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap=''), - 21: - dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap=''), + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-18'), + 20: dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap=''), + 21: dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap=''), 22: - dict( - name='kpt-22', - id=22, - color=[255, 255, 255], - type='', - swap='kpt-23'), + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-23'), 23: - dict( - name='kpt-23', - id=23, - color=[255, 255, 255], - type='', - swap='kpt-22'), - 24: - dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap=''), - 25: - dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''), - 26: - dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap=''), - 27: - dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), - 28: - dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap='') + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-22'), + 24: dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap=''), + 25: dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap=''), + 26: dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap=''), + 27: dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap=''), + 28: dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='') }, skeleton_info={}, joint_weights=[1.] 
* 29, diff --git a/configs/_base_/datasets/wflw.py b/configs/_base_/datasets/wflw.py index bed6f56f30..80c29b696c 100644 --- a/configs/_base_/datasets/wflw.py +++ b/configs/_base_/datasets/wflw.py @@ -10,572 +10,182 @@ homepage='https://wywu.github.io/projects/LAB/WFLW.html', ), keypoint_info={ - 0: - dict( - name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-32'), - 1: - dict( - name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-31'), - 2: - dict( - name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-30'), - 3: - dict( - name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-29'), - 4: - dict( - name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-28'), - 5: - dict( - name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-27'), - 6: - dict( - name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-26'), - 7: - dict( - name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-25'), - 8: - dict( - name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-24'), - 9: - dict( - name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-23'), + 0: dict(name='kpt-0', id=0, color=[255, 0, 0], type='', swap='kpt-32'), + 1: dict(name='kpt-1', id=1, color=[255, 0, 0], type='', swap='kpt-31'), + 2: dict(name='kpt-2', id=2, color=[255, 0, 0], type='', swap='kpt-30'), + 3: dict(name='kpt-3', id=3, color=[255, 0, 0], type='', swap='kpt-29'), + 4: dict(name='kpt-4', id=4, color=[255, 0, 0], type='', swap='kpt-28'), + 5: dict(name='kpt-5', id=5, color=[255, 0, 0], type='', swap='kpt-27'), + 6: dict(name='kpt-6', id=6, color=[255, 0, 0], type='', swap='kpt-26'), + 7: dict(name='kpt-7', id=7, color=[255, 0, 0], type='', swap='kpt-25'), + 8: dict(name='kpt-8', id=8, color=[255, 0, 0], type='', swap='kpt-24'), + 9: dict(name='kpt-9', id=9, color=[255, 0, 0], type='', swap='kpt-23'), 10: - dict( - name='kpt-10', - id=10, - color=[255, 255, 255], - type='', - swap='kpt-22'), + dict(name='kpt-10', id=10, color=[255, 
0, 0], type='', swap='kpt-22'), 11: - dict( - name='kpt-11', - id=11, - color=[255, 255, 255], - type='', - swap='kpt-21'), + dict(name='kpt-11', id=11, color=[255, 0, 0], type='', swap='kpt-21'), 12: - dict( - name='kpt-12', - id=12, - color=[255, 255, 255], - type='', - swap='kpt-20'), + dict(name='kpt-12', id=12, color=[255, 0, 0], type='', swap='kpt-20'), 13: - dict( - name='kpt-13', - id=13, - color=[255, 255, 255], - type='', - swap='kpt-19'), + dict(name='kpt-13', id=13, color=[255, 0, 0], type='', swap='kpt-19'), 14: - dict( - name='kpt-14', - id=14, - color=[255, 255, 255], - type='', - swap='kpt-18'), + dict(name='kpt-14', id=14, color=[255, 0, 0], type='', swap='kpt-18'), 15: - dict( - name='kpt-15', - id=15, - color=[255, 255, 255], - type='', - swap='kpt-17'), - 16: - dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + dict(name='kpt-15', id=15, color=[255, 0, 0], type='', swap='kpt-17'), + 16: dict(name='kpt-16', id=16, color=[255, 0, 0], type='', swap=''), 17: - dict( - name='kpt-17', - id=17, - color=[255, 255, 255], - type='', - swap='kpt-15'), + dict(name='kpt-17', id=17, color=[255, 0, 0], type='', swap='kpt-15'), 18: - dict( - name='kpt-18', - id=18, - color=[255, 255, 255], - type='', - swap='kpt-14'), + dict(name='kpt-18', id=18, color=[255, 0, 0], type='', swap='kpt-14'), 19: - dict( - name='kpt-19', - id=19, - color=[255, 255, 255], - type='', - swap='kpt-13'), + dict(name='kpt-19', id=19, color=[255, 0, 0], type='', swap='kpt-13'), 20: - dict( - name='kpt-20', - id=20, - color=[255, 255, 255], - type='', - swap='kpt-12'), + dict(name='kpt-20', id=20, color=[255, 0, 0], type='', swap='kpt-12'), 21: - dict( - name='kpt-21', - id=21, - color=[255, 255, 255], - type='', - swap='kpt-11'), + dict(name='kpt-21', id=21, color=[255, 0, 0], type='', swap='kpt-11'), 22: - dict( - name='kpt-22', - id=22, - color=[255, 255, 255], - type='', - swap='kpt-10'), + dict(name='kpt-22', id=22, color=[255, 0, 0], type='', swap='kpt-10'), 23: 
- dict( - name='kpt-23', id=23, color=[255, 255, 255], type='', - swap='kpt-9'), + dict(name='kpt-23', id=23, color=[255, 0, 0], type='', swap='kpt-9'), 24: - dict( - name='kpt-24', id=24, color=[255, 255, 255], type='', - swap='kpt-8'), + dict(name='kpt-24', id=24, color=[255, 0, 0], type='', swap='kpt-8'), 25: - dict( - name='kpt-25', id=25, color=[255, 255, 255], type='', - swap='kpt-7'), + dict(name='kpt-25', id=25, color=[255, 0, 0], type='', swap='kpt-7'), 26: - dict( - name='kpt-26', id=26, color=[255, 255, 255], type='', - swap='kpt-6'), + dict(name='kpt-26', id=26, color=[255, 0, 0], type='', swap='kpt-6'), 27: - dict( - name='kpt-27', id=27, color=[255, 255, 255], type='', - swap='kpt-5'), + dict(name='kpt-27', id=27, color=[255, 0, 0], type='', swap='kpt-5'), 28: - dict( - name='kpt-28', id=28, color=[255, 255, 255], type='', - swap='kpt-4'), + dict(name='kpt-28', id=28, color=[255, 0, 0], type='', swap='kpt-4'), 29: - dict( - name='kpt-29', id=29, color=[255, 255, 255], type='', - swap='kpt-3'), + dict(name='kpt-29', id=29, color=[255, 0, 0], type='', swap='kpt-3'), 30: - dict( - name='kpt-30', id=30, color=[255, 255, 255], type='', - swap='kpt-2'), + dict(name='kpt-30', id=30, color=[255, 0, 0], type='', swap='kpt-2'), 31: - dict( - name='kpt-31', id=31, color=[255, 255, 255], type='', - swap='kpt-1'), + dict(name='kpt-31', id=31, color=[255, 0, 0], type='', swap='kpt-1'), 32: - dict( - name='kpt-32', id=32, color=[255, 255, 255], type='', - swap='kpt-0'), + dict(name='kpt-32', id=32, color=[255, 0, 0], type='', swap='kpt-0'), 33: - dict( - name='kpt-33', - id=33, - color=[255, 255, 255], - type='', - swap='kpt-46'), + dict(name='kpt-33', id=33, color=[255, 0, 0], type='', swap='kpt-46'), 34: - dict( - name='kpt-34', - id=34, - color=[255, 255, 255], - type='', - swap='kpt-45'), + dict(name='kpt-34', id=34, color=[255, 0, 0], type='', swap='kpt-45'), 35: - dict( - name='kpt-35', - id=35, - color=[255, 255, 255], - type='', - swap='kpt-44'), + 
dict(name='kpt-35', id=35, color=[255, 0, 0], type='', swap='kpt-44'), 36: - dict( - name='kpt-36', - id=36, - color=[255, 255, 255], - type='', - swap='kpt-43'), - 37: - dict( - name='kpt-37', - id=37, - color=[255, 255, 255], - type='', - swap='kpt-42'), - 38: - dict( - name='kpt-38', - id=38, - color=[255, 255, 255], - type='', - swap='kpt-50'), - 39: - dict( - name='kpt-39', - id=39, - color=[255, 255, 255], - type='', - swap='kpt-49'), - 40: - dict( - name='kpt-40', - id=40, - color=[255, 255, 255], - type='', - swap='kpt-48'), - 41: - dict( - name='kpt-41', - id=41, - color=[255, 255, 255], - type='', - swap='kpt-47'), - 42: - dict( - name='kpt-42', - id=42, - color=[255, 255, 255], - type='', - swap='kpt-37'), - 43: - dict( - name='kpt-43', - id=43, - color=[255, 255, 255], - type='', - swap='kpt-36'), - 44: - dict( - name='kpt-44', - id=44, - color=[255, 255, 255], - type='', - swap='kpt-35'), - 45: - dict( - name='kpt-45', - id=45, - color=[255, 255, 255], - type='', - swap='kpt-34'), - 46: - dict( - name='kpt-46', - id=46, - color=[255, 255, 255], - type='', - swap='kpt-33'), - 47: - dict( - name='kpt-47', - id=47, - color=[255, 255, 255], - type='', - swap='kpt-41'), - 48: - dict( - name='kpt-48', - id=48, - color=[255, 255, 255], - type='', - swap='kpt-40'), - 49: - dict( - name='kpt-49', - id=49, - color=[255, 255, 255], - type='', - swap='kpt-39'), - 50: - dict( - name='kpt-50', - id=50, - color=[255, 255, 255], - type='', - swap='kpt-38'), - 51: - dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), - 52: - dict(name='kpt-52', id=52, color=[255, 255, 255], type='', swap=''), - 53: - dict(name='kpt-53', id=53, color=[255, 255, 255], type='', swap=''), - 54: - dict(name='kpt-54', id=54, color=[255, 255, 255], type='', swap=''), - 55: - dict( - name='kpt-55', - id=55, - color=[255, 255, 255], - type='', - swap='kpt-59'), - 56: - dict( - name='kpt-56', - id=56, - color=[255, 255, 255], - type='', - swap='kpt-58'), - 57: - 
dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), - 58: - dict( - name='kpt-58', - id=58, - color=[255, 255, 255], - type='', - swap='kpt-56'), - 59: - dict( - name='kpt-59', - id=59, - color=[255, 255, 255], - type='', - swap='kpt-55'), - 60: - dict( - name='kpt-60', - id=60, - color=[255, 255, 255], - type='', - swap='kpt-72'), - 61: - dict( - name='kpt-61', - id=61, - color=[255, 255, 255], - type='', - swap='kpt-71'), - 62: - dict( - name='kpt-62', - id=62, - color=[255, 255, 255], - type='', - swap='kpt-70'), - 63: - dict( - name='kpt-63', - id=63, - color=[255, 255, 255], - type='', - swap='kpt-69'), - 64: - dict( - name='kpt-64', - id=64, - color=[255, 255, 255], - type='', - swap='kpt-68'), - 65: - dict( - name='kpt-65', - id=65, - color=[255, 255, 255], - type='', - swap='kpt-75'), - 66: - dict( - name='kpt-66', - id=66, - color=[255, 255, 255], - type='', - swap='kpt-74'), - 67: - dict( - name='kpt-67', - id=67, - color=[255, 255, 255], - type='', - swap='kpt-73'), - 68: - dict( - name='kpt-68', - id=68, - color=[255, 255, 255], - type='', - swap='kpt-64'), - 69: - dict( - name='kpt-69', - id=69, - color=[255, 255, 255], - type='', - swap='kpt-63'), - 70: - dict( - name='kpt-70', - id=70, - color=[255, 255, 255], - type='', - swap='kpt-62'), - 71: - dict( - name='kpt-71', - id=71, - color=[255, 255, 255], - type='', - swap='kpt-61'), - 72: - dict( - name='kpt-72', - id=72, - color=[255, 255, 255], - type='', - swap='kpt-60'), - 73: - dict( - name='kpt-73', - id=73, - color=[255, 255, 255], - type='', - swap='kpt-67'), - 74: - dict( - name='kpt-74', - id=74, - color=[255, 255, 255], - type='', - swap='kpt-66'), - 75: - dict( - name='kpt-75', - id=75, - color=[255, 255, 255], - type='', - swap='kpt-65'), - 76: - dict( - name='kpt-76', - id=76, - color=[255, 255, 255], - type='', - swap='kpt-82'), - 77: - dict( - name='kpt-77', - id=77, - color=[255, 255, 255], - type='', - swap='kpt-81'), - 78: - dict( - name='kpt-78', - id=78, - 
color=[255, 255, 255], - type='', - swap='kpt-80'), - 79: - dict(name='kpt-79', id=79, color=[255, 255, 255], type='', swap=''), - 80: - dict( - name='kpt-80', - id=80, - color=[255, 255, 255], - type='', - swap='kpt-78'), - 81: - dict( - name='kpt-81', - id=81, - color=[255, 255, 255], - type='', - swap='kpt-77'), - 82: - dict( - name='kpt-82', - id=82, - color=[255, 255, 255], - type='', - swap='kpt-76'), - 83: - dict( - name='kpt-83', - id=83, - color=[255, 255, 255], - type='', - swap='kpt-87'), - 84: - dict( - name='kpt-84', - id=84, - color=[255, 255, 255], - type='', - swap='kpt-86'), - 85: - dict(name='kpt-85', id=85, color=[255, 255, 255], type='', swap=''), - 86: - dict( - name='kpt-86', - id=86, - color=[255, 255, 255], - type='', - swap='kpt-84'), - 87: - dict( - name='kpt-87', - id=87, - color=[255, 255, 255], - type='', - swap='kpt-83'), - 88: - dict( - name='kpt-88', - id=88, - color=[255, 255, 255], - type='', - swap='kpt-92'), - 89: - dict( - name='kpt-89', - id=89, - color=[255, 255, 255], - type='', - swap='kpt-91'), - 90: - dict(name='kpt-90', id=90, color=[255, 255, 255], type='', swap=''), - 91: - dict( - name='kpt-91', - id=91, - color=[255, 255, 255], - type='', - swap='kpt-89'), - 92: - dict( - name='kpt-92', - id=92, - color=[255, 255, 255], - type='', - swap='kpt-88'), - 93: - dict( - name='kpt-93', - id=93, - color=[255, 255, 255], - type='', - swap='kpt-95'), - 94: - dict(name='kpt-94', id=94, color=[255, 255, 255], type='', swap=''), - 95: - dict( - name='kpt-95', - id=95, - color=[255, 255, 255], - type='', - swap='kpt-93'), - 96: - dict( - name='kpt-96', - id=96, - color=[255, 255, 255], - type='', - swap='kpt-97'), - 97: - dict( - name='kpt-97', - id=97, - color=[255, 255, 255], - type='', - swap='kpt-96') + dict(name='kpt-36', id=36, color=[255, 0, 0], type='', swap='kpt-43'), + 37: dict( + name='kpt-37', id=37, color=[255, 0, 0], type='', swap='kpt-42'), + 38: dict( + name='kpt-38', id=38, color=[255, 0, 0], type='', 
swap='kpt-50'), + 39: dict( + name='kpt-39', id=39, color=[255, 0, 0], type='', swap='kpt-49'), + 40: dict( + name='kpt-40', id=40, color=[255, 0, 0], type='', swap='kpt-48'), + 41: dict( + name='kpt-41', id=41, color=[255, 0, 0], type='', swap='kpt-47'), + 42: dict( + name='kpt-42', id=42, color=[255, 0, 0], type='', swap='kpt-37'), + 43: dict( + name='kpt-43', id=43, color=[255, 0, 0], type='', swap='kpt-36'), + 44: dict( + name='kpt-44', id=44, color=[255, 0, 0], type='', swap='kpt-35'), + 45: dict( + name='kpt-45', id=45, color=[255, 0, 0], type='', swap='kpt-34'), + 46: dict( + name='kpt-46', id=46, color=[255, 0, 0], type='', swap='kpt-33'), + 47: dict( + name='kpt-47', id=47, color=[255, 0, 0], type='', swap='kpt-41'), + 48: dict( + name='kpt-48', id=48, color=[255, 0, 0], type='', swap='kpt-40'), + 49: dict( + name='kpt-49', id=49, color=[255, 0, 0], type='', swap='kpt-39'), + 50: dict( + name='kpt-50', id=50, color=[255, 0, 0], type='', swap='kpt-38'), + 51: dict(name='kpt-51', id=51, color=[255, 0, 0], type='', swap=''), + 52: dict(name='kpt-52', id=52, color=[255, 0, 0], type='', swap=''), + 53: dict(name='kpt-53', id=53, color=[255, 0, 0], type='', swap=''), + 54: dict(name='kpt-54', id=54, color=[255, 0, 0], type='', swap=''), + 55: dict( + name='kpt-55', id=55, color=[255, 0, 0], type='', swap='kpt-59'), + 56: dict( + name='kpt-56', id=56, color=[255, 0, 0], type='', swap='kpt-58'), + 57: dict(name='kpt-57', id=57, color=[255, 0, 0], type='', swap=''), + 58: dict( + name='kpt-58', id=58, color=[255, 0, 0], type='', swap='kpt-56'), + 59: dict( + name='kpt-59', id=59, color=[255, 0, 0], type='', swap='kpt-55'), + 60: dict( + name='kpt-60', id=60, color=[255, 0, 0], type='', swap='kpt-72'), + 61: dict( + name='kpt-61', id=61, color=[255, 0, 0], type='', swap='kpt-71'), + 62: dict( + name='kpt-62', id=62, color=[255, 0, 0], type='', swap='kpt-70'), + 63: dict( + name='kpt-63', id=63, color=[255, 0, 0], type='', swap='kpt-69'), + 64: dict( + name='kpt-64', 
id=64, color=[255, 0, 0], type='', swap='kpt-68'), + 65: dict( + name='kpt-65', id=65, color=[255, 0, 0], type='', swap='kpt-75'), + 66: dict( + name='kpt-66', id=66, color=[255, 0, 0], type='', swap='kpt-74'), + 67: dict( + name='kpt-67', id=67, color=[255, 0, 0], type='', swap='kpt-73'), + 68: dict( + name='kpt-68', id=68, color=[255, 0, 0], type='', swap='kpt-64'), + 69: dict( + name='kpt-69', id=69, color=[255, 0, 0], type='', swap='kpt-63'), + 70: dict( + name='kpt-70', id=70, color=[255, 0, 0], type='', swap='kpt-62'), + 71: dict( + name='kpt-71', id=71, color=[255, 0, 0], type='', swap='kpt-61'), + 72: dict( + name='kpt-72', id=72, color=[255, 0, 0], type='', swap='kpt-60'), + 73: dict( + name='kpt-73', id=73, color=[255, 0, 0], type='', swap='kpt-67'), + 74: dict( + name='kpt-74', id=74, color=[255, 0, 0], type='', swap='kpt-66'), + 75: dict( + name='kpt-75', id=75, color=[255, 0, 0], type='', swap='kpt-65'), + 76: dict( + name='kpt-76', id=76, color=[255, 0, 0], type='', swap='kpt-82'), + 77: dict( + name='kpt-77', id=77, color=[255, 0, 0], type='', swap='kpt-81'), + 78: dict( + name='kpt-78', id=78, color=[255, 0, 0], type='', swap='kpt-80'), + 79: dict(name='kpt-79', id=79, color=[255, 0, 0], type='', swap=''), + 80: dict( + name='kpt-80', id=80, color=[255, 0, 0], type='', swap='kpt-78'), + 81: dict( + name='kpt-81', id=81, color=[255, 0, 0], type='', swap='kpt-77'), + 82: dict( + name='kpt-82', id=82, color=[255, 0, 0], type='', swap='kpt-76'), + 83: dict( + name='kpt-83', id=83, color=[255, 0, 0], type='', swap='kpt-87'), + 84: dict( + name='kpt-84', id=84, color=[255, 0, 0], type='', swap='kpt-86'), + 85: dict(name='kpt-85', id=85, color=[255, 0, 0], type='', swap=''), + 86: dict( + name='kpt-86', id=86, color=[255, 0, 0], type='', swap='kpt-84'), + 87: dict( + name='kpt-87', id=87, color=[255, 0, 0], type='', swap='kpt-83'), + 88: dict( + name='kpt-88', id=88, color=[255, 0, 0], type='', swap='kpt-92'), + 89: dict( + name='kpt-89', id=89, 
color=[255, 0, 0], type='', swap='kpt-91'), + 90: dict(name='kpt-90', id=90, color=[255, 0, 0], type='', swap=''), + 91: dict( + name='kpt-91', id=91, color=[255, 0, 0], type='', swap='kpt-89'), + 92: dict( + name='kpt-92', id=92, color=[255, 0, 0], type='', swap='kpt-88'), + 93: dict( + name='kpt-93', id=93, color=[255, 0, 0], type='', swap='kpt-95'), + 94: dict(name='kpt-94', id=94, color=[255, 0, 0], type='', swap=''), + 95: dict( + name='kpt-95', id=95, color=[255, 0, 0], type='', swap='kpt-93'), + 96: dict( + name='kpt-96', id=96, color=[255, 0, 0], type='', swap='kpt-97'), + 97: dict( + name='kpt-97', id=97, color=[255, 0, 0], type='', swap='kpt-96') }, skeleton_info={}, joint_weights=[1.] * 98, diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py index 548bb7268b..c333a5a5f2 100644 --- a/configs/_base_/default_runtime.py +++ b/configs/_base_/default_runtime.py @@ -5,7 +5,7 @@ timer=dict(type='IterTimerHook'), logger=dict(type='LoggerHook', interval=50), param_scheduler=dict(type='ParamSchedulerHook'), - checkpoint=dict(type='CheckpointHook', interval=1), + checkpoint=dict(type='CheckpointHook', interval=10), sampler_seed=dict(type='DistSamplerSeedHook'), visualization=dict(type='PoseVisualizationHook', enable=False), ) @@ -39,7 +39,7 @@ # file I/O backend file_client_args = dict(backend='disk') -# training/validatin/testing progress +# training/validation/testing progress train_cfg = dict(by_epoch=True) val_cfg = dict() test_cfg = dict() diff --git a/configs/animal_2d_keypoint/rtmpose/README.md b/configs/animal_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000000..fbb103e36c --- /dev/null +++ b/configs/animal_2d_keypoint/rtmpose/README.md @@ -0,0 +1,16 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. 
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. + +## Results and Models + +### AP-10K Dataset + +Results on AP-10K validation set + +| Model | Input Size | AP | Details and Download | +| :-------: | :--------: | :---: | :------------------------------------------: | +| RTMPose-m | 256x256 | 0.722 | [rtmpose_ap10k.md](./ap10k/rtmpose_ap10k.md) | diff --git a/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py b/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000000..ddc981a6d8 --- /dev/null +++ b/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose-m_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,246 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch +
type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/', +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/' +# })) + +# pipelines +train_pipeline 
= [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + 
pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md b/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md new file mode 100644 index 0000000000..6303a131da --- /dev/null +++ b/configs/animal_2d_keypoint/rtmpose/ap10k/rtmpose_ap10k.md @@ -0,0 +1,25 @@ + + + + +
+AP-10K (NeurIPS'2021) + +```bibtex +@misc{yu2021ap10k, + title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild}, + author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao}, + year={2021}, + eprint={2108.12617}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +Results on AP-10K validation set + +| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log | +| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: | +| [rtmpose-m](./rtmpose-m_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.722 | 0.939 | 0.788 | 0.569 | 0.728 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.json) | diff --git a/configs/animal_2d_keypoint/topdown_heatmap/README.md b/configs/animal_2d_keypoint/topdown_heatmap/README.md index 6440c3becd..b4f8e366ff 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/README.md +++ b/configs/animal_2d_keypoint/topdown_heatmap/README.md @@ -25,12 +25,13 @@ Results on AnimalPose validation set (1117 instances) Results on AP-10K validation set -| Model | Input Size | AP | Details and Download | -| :--------: | :--------: | :---: | :----------------------------------------: | -| HRNet-w48 | 256x256 | 0.728 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) | -| HRNet-w32 | 256x256 | 0.722 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) | -| ResNet-101 | 256x256 | 0.681 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) | -| ResNet-50 | 256x256 | 0.680 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) | +| Model | Input Size | AP | Details and Download | +| :--------: | :--------: | :---: | :--------------------------------------------------: | +| HRNet-w48 | 256x256 | 0.728 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) | +| HRNet-w32 | 256x256 | 0.722 | [hrnet_ap10k.md](./ap10k/hrnet_ap10k.md) | +| ResNet-101 | 256x256 | 0.681 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) | +| ResNet-50 | 256x256 | 0.680 | [resnet_ap10k.md](./ap10k/resnet_ap10k.md) | +| 
CSPNeXt-m | 256x256 | 0.703 | [cspnext_udp_ap10k.md](./ap10k/cspnext_udp_ap10k.md) | ### Desert Locust Dataset diff --git a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py index 721ac93be8..89749504e1 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py index 79de4dfd24..89cadbea0d 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w48_8xb64-210e_animalpose-256x256.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py index b7274a2732..03d1580387 100644 --- 
a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res101_8xb64-210e_animalpose-256x256.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py index 988a22d2a9..6004b95476 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res152_8xb32-210e_animalpose-256x256.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py index 004ac6ea2a..8636a7147a 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_res50_8xb64-210e_animalpose-256x256.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), 
dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000000..5bce8d0a68 --- /dev/null +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,228 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 
'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/', +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', 
input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + 
ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md new file mode 100644 index 0000000000..4ba6b39b3e --- /dev/null +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext_udp_ap10k.md @@ -0,0 +1,58 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+AP-10K (NeurIPS'2021) + +```bibtex +@misc{yu2021ap10k, + title={AP-10K: A Benchmark for Animal Pose Estimation in the Wild}, + author={Hang Yu and Yufei Xu and Jing Zhang and Wei Zhao and Ziyu Guan and Dacheng Tao}, + year={2021}, + eprint={2108.12617}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ +Results on AP-10K validation set + +| Arch | Input Size | AP | AP50 | AP75 | APM | APL | ckpt | log | +| :----------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :------------: | :------------: | :-----------------------------------------: | :----------------------------------------: | +| [pose_cspnext_m](/configs/animal_2d_keypoint/topdown_heatmap/ap10k/cspnext-m_udp_8xb64-210e_ap10k-256x256.py) | 256x256 | 0.703 | 0.944 | 0.776 | 0.513 | 0.710 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-ap10k_pt-in1k_210e-256x256-1f2d947a_20230123.json) | diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml new file mode 100644 index 0000000000..11c6d912ac --- /dev/null +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml @@ -0,0 +1,41 @@ +Collections: +- Name: SimpleBaseline2D + Paper: + Title: Simple baselines for human pose estimation and tracking + URL: http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html + README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/simplebaseline2d.md +Models: +- Config: configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py + In Collection: SimpleBaseline2D + Alias: animal + Metadata: + Architecture: &id001 + - SimpleBaseline2D + Training Data: AP-10K + Name: topdown_heatmap_res50_ap10k_256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.680 + AP@0.5: 0.926 + AP@0.75: 0.738 + APL: 0.687 + APM: 0.552 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res50_ap10k_256x256-35760eb8_20211029.pth +- Config: 
configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py + In Collection: SimpleBaseline2D + Metadata: + Architecture: *id001 + Training Data: AP-10K + Name: topdown_heatmap_res101_ap10k_256x256 + Results: + - Dataset: AP-10K + Metrics: + AP: 0.681 + AP@0.5: 0.921 + AP@0.75: 0.751 + APL: 0.690 + APM: 0.545 + Task: Animal 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/animal/resnet/res101_ap10k_256x256-9edfafb9_20211029.pth diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py index bdb1138a55..afb75945a7 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w32_8xb64-210e_ap10k-256x256.py @@ -100,17 +100,14 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), dict(type='GetBBoxCenterScale'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='PackPoseInputs', - meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', - 'input_size', 'flip_indices', 'category')) + dict(type='PackPoseInputs') ] # data loaders @@ -160,8 +157,8 @@ # evaluators val_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-val-split1.json') test_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py 
b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py index f8158784ea..b142a5f614 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_hrnet-w48_8xb64-210e_ap10k-256x256.py @@ -100,17 +100,14 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), dict(type='GetBBoxCenterScale'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='PackPoseInputs', - meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', - 'input_size', 'flip_indices', 'category')) + dict(type='PackPoseInputs') ] # data loaders @@ -160,8 +157,8 @@ # evaluators val_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-val-split1.json') test_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py index 3a57f1613e..f21215ee9a 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res101_8xb64-210e_ap10k-256x256.py @@ -71,17 +71,14 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] 
val_pipeline = [ dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), dict(type='GetBBoxCenterScale'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='PackPoseInputs', - meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', - 'input_size', 'flip_indices', 'category')) + dict(type='PackPoseInputs') ] # data loaders @@ -131,8 +128,8 @@ # evaluators val_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-val-split1.json') test_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py index cc8ca486cc..93b139823e 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/ap10k/td-hm_res50_8xb64-210e_ap10k-256x256.py @@ -71,17 +71,14 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), dict(type='GetBBoxCenterScale'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='PackPoseInputs', - meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', - 'input_size', 'flip_indices', 'category')) + dict(type='PackPoseInputs') ] # data loaders @@ -131,8 +128,8 @@ # evaluators val_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 'annotations/ap10k-val-split1.json') test_evaluator = dict( - type='AP10KCocoMetric', + type='CocoMetric', ann_file=data_root + 
'annotations/ap10k-test-split1.json') diff --git a/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py b/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py index 468ab87d43..d8195928ba 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res101_8xb64-210e_locust-160x160.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py b/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py index ce67c63159..4f99d69642 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res152_8xb32-210e_locust-160x160.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py b/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py index fc61730989..9bf5fb7b76 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/locust/td-hm_res50_8xb64-210e_locust-160x160.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py b/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py index f180b3d8ee..e208c7557f 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res101_8xb64-210e_zebra-160x160.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py b/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py index cfa27dfd84..f292264333 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res152_8xb32-210e_zebra-160x160.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py b/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py index c8b377aba0..08eb7b19cb 100644 --- a/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py +++ b/configs/animal_2d_keypoint/topdown_heatmap/zebra/td-hm_res50_8xb64-210e_zebra-160x160.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/associative_embedding/README.md 
b/configs/body_2d_keypoint/associative_embedding/README.md new file mode 100644 index 0000000000..7f5fa8ea17 --- /dev/null +++ b/configs/body_2d_keypoint/associative_embedding/README.md @@ -0,0 +1,9 @@ +# Associative embedding: End-to-end learning for joint detection and grouping (AE) + +Associative Embedding is one of the most popular 2D bottom-up pose estimation approaches. It first detects all the keypoints and then groups/associates them into person instances. + +In order to group all the predicted keypoints into individuals, a tag is also predicted for each detected keypoint. Tags of the same person are similar, while tags of different people are different. Thus the keypoints can be grouped according to the tags. + +
+ +
diff --git a/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py b/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py new file mode 100644 index 0000000000..306c86ac82 --- /dev/null +++ b/configs/body_2d_keypoint/associative_embedding/coco/ae_hrnet-w32_8xb24-300e_coco-512x512.py @@ -0,0 +1,159 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1.5e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=300, + milestones=[200, 260], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=192) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', interval=50)) + +# codec settings +codec = dict( + type='AssociativeEmbedding', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=2, + decode_keypoint_order=[ + 0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16 + ], + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 
4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='AssociativeEmbeddingHead', + in_channels=32, + num_keypoints=17, + tag_dim=1, + tag_per_keypoint=True, + deconv_out_channels=None, + keypoint_loss=dict(type='KeypointMSELoss', use_target_weight=True), + tag_loss=dict(type='AssociativeEmbeddingLoss', loss_weight=0.001), + # The heatmap will be resized to the input size before decoding + # if ``restore_heatmap_size==True`` + decoder=dict(codec, heatmap_size=codec['input_size'])), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + shift_heatmap=True, + restore_heatmap_size=True, + align_corners=False)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=24, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + 
type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none', + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py b/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py new file mode 100644 index 0000000000..ae45cc8ce5 --- /dev/null +++ b/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py @@ -0,0 +1,162 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=160) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128)) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + 
checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='CIDHead', + in_channels=(32, 64, 128, 256), + num_keypoints=17, + gfd_channels=32, + input_transform='resize_concat', + input_index=(0, 1, 2, 3), + coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0), + decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0), + contrastive_loss=dict( + type='InfoNCELoss', temperature=0.05, loss_weight=1.0), + decoder=codec, + ), + train_cfg=dict(max_train_instances=200), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + shift_heatmap=False, + align_corners=False)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=64, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=20, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_thr=0.8, + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py b/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py new file mode 100644 index 0000000000..f7fa261ed8 --- /dev/null +++ b/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py @@ -0,0 +1,162 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=160) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='DecoupledHeatmap', input_size=(512, 512), heatmap_size=(128, 128)) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + 
block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='CIDHead', + in_channels=(48, 96, 192, 384), + num_keypoints=17, + gfd_channels=48, + input_transform='resize_concat', + input_index=(0, 1, 2, 3), + coupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=1.0), + decoupled_heatmap_loss=dict(type='FocalHeatmapLoss', loss_weight=4.0), + contrastive_loss=dict( + type='InfoNCELoss', temperature=0.05, loss_weight=1.0), + decoder=codec, + ), + train_cfg=dict(max_train_instances=200), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + shift_heatmap=False, + align_corners=False)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=64, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + 
batch_size=20, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_thr=0.8, + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/cid/coco/hrnet_coco.md b/configs/body_2d_keypoint/cid/coco/hrnet_coco.md new file mode 100644 index 0000000000..f82cb04db0 --- /dev/null +++ b/configs/body_2d_keypoint/cid/coco/hrnet_coco.md @@ -0,0 +1,42 @@ + + +
+CID (CVPR'2022) + +```bibtex +@InProceedings{Wang_2022_CVPR, + author = {Wang, Dongkai and Zhang, Shiliang}, + title = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2022}, + pages = {11060-11068} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512.py) | 512x512 | 0.704 | 0.894 | 0.775 | 0.753 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_42b7e6e6-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w32_8xb20-140e_coco-512x512_20230207.json) | +| [CID](/configs/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512.py) | 512x512 | 0.715 | 0.900 | 0.782 | 0.765 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_a36c3ecf-20230207.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/cid/coco/cid_hrnet-w48_8xb20-140e_coco-512x512_20230207.json) | diff --git a/configs/body_2d_keypoint/dekr/README.md b/configs/body_2d_keypoint/dekr/README.md new file mode 100644 index 0000000000..04726421c0 --- /dev/null +++ b/configs/body_2d_keypoint/dekr/README.md @@ -0,0 +1,22 @@ +# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression (DEKR) + + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ +DEKR is a popular 2D bottom-up pose estimation approach that simultaneously detects all the instances and regresses the offsets from the instance centers to joints. + +In order to predict the offsets more accurately, the offsets of different joints are regressed using separate branches with deformable convolutional layers. Thus convolution kernels with different shapes are adopted to extract features for the corresponding joint. diff --git a/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py b/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py new file mode 100644 index 0000000000..94614ac651 --- /dev/null +++ b/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py @@ -0,0 +1,184 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=140, + milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + 
num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='DEKRHead', + in_channels=(32, 64, 128, 256), + num_keypoints=17, + input_transform='resize_concat', + input_index=(0, 1, 2, 3), + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.002, + ), + decoder=codec, + rescore_cfg=dict( + in_channels=74, + norm_indexes=(5, 6), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_coco-33d58c5c.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + 
resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=10, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none', + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py b/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py new file mode 100644 index 0000000000..1ffa60d24b --- /dev/null +++ b/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py @@ -0,0 +1,185 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=140, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=140, + 
milestones=[90, 120], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(640, 640), + heatmap_size=(160, 160), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='DEKRHead', + in_channels=(48, 96, 192, 384), + num_keypoints=17, + input_transform='resize_concat', + input_index=(0, 1, 2, 3), + num_heatmap_filters=48, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.002, + ), + decoder=codec, + rescore_cfg=dict( + in_channels=74, + norm_indexes=(5, 6), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 
'pretrain_models/kpt_rescore_coco-33d58c5c.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'bottomup' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='BottomupGetHeatmapMask'), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=10, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = 
dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json', + nms_mode='none', + score_mode='keypoint', +) +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md b/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md new file mode 100644 index 0000000000..648b9bc735 --- /dev/null +++ b/configs/body_2d_keypoint/dekr/coco/hrnet_coco.md @@ -0,0 +1,58 @@ + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [HRNet-w32](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py) | 512x512 | 0.686 | 0.868 | 0.750 | 0.735 | 0.898 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512_20221228.json) | +| [HRNet-w48](/configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640.py) | 640x640 | 0.714 | 0.883 | 0.777 | 0.762 | 0.915 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_74796c32-20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/coco/dekr_hrnet-w48_8xb10-140e_coco-640x640_20230124.json) | diff --git a/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py b/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py new file mode 100644 index 0000000000..3b34f0d362 --- /dev/null +++ b/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py @@ -0,0 +1,185 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=20) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=300, + milestones=[200, 260], + gamma=0.1, + by_epoch=True) +] + +# 
automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=80) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='DEKRHead', + in_channels=(32, 64, 128, 256), + num_keypoints=14, + input_transform='resize_concat', + input_index=(0, 1, 2, 3), + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.004, + ), + decoder=codec, + rescore_cfg=dict( + in_channels=59, + norm_indexes=(0, 1), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + 
nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'bottomup' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=10, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + nms_mode='none', + score_mode='keypoint', + use_area=False, + 
iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py b/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py new file mode 100644 index 0000000000..5b575461e4 --- /dev/null +++ b/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py @@ -0,0 +1,186 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=300, val_interval=20) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=1e-3, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=300, + milestones=[200, 260], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=40) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='crowdpose/AP', rule='greater')) + +# codec settings +codec = dict( + type='SPR', + input_size=(640, 640), + heatmap_size=(160, 160), + sigma=(4, 2), + minimal_diagonal_length=32**0.5, + generate_keypoint_heatmaps=True, + decode_max_instances=30) + +# model settings +model = dict( + type='BottomupPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + 
num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth'), + ), + head=dict( + type='DEKRHead', + in_channels=(48, 96, 192, 384), + num_keypoints=14, + input_transform='resize_concat', + input_index=(0, 1, 2, 3), + num_heatmap_filters=48, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + loss_weight=0.004, + ), + decoder=codec, + rescore_cfg=dict( + in_channels=59, + norm_indexes=(0, 1), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth')), + ), + test_cfg=dict( + multiscale_test=False, + flip_test=True, + nms_dist_thr=0.05, + shift_heatmap=True, + align_corners=False)) + +# enable DDP training when rescore net is used +find_unused_parameters = True + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'bottomup' +data_root = 'data/crowdpose/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='BottomupRandomAffine', input_size=codec['input_size']), + dict(type='RandomFlip', direction='horizontal'), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs'), +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict( + type='BottomupResize', + input_size=codec['input_size'], + size_factor=32, + resize_mode='expand'), + dict( + type='PackPoseInputs', + meta_keys=('id', 'img_id', 'img_path', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', 'raw_ann_info', + 'skeleton_links')) +] + +# data loaders +train_dataloader = dict( + batch_size=5, + 
num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mmpose_crowdpose_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/mmpose_crowdpose_test.json', + nms_mode='none', + score_mode='keypoint', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md b/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md new file mode 100644 index 0000000000..ea58d95b7f --- /dev/null +++ b/configs/body_2d_keypoint/dekr/crowdpose/hrnet_crowdpose.md @@ -0,0 +1,56 @@ + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [HRNet-w32](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-300e_crowdpose-512x512.py) | 512x512 | 0.663 | 0.857 | 0.714 | 0.740 | 0.671 | 0.576 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-140e_crowdpose-512x512_147bae97-20221228.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w32_8xb10-140e_crowdpose-512x512_20221228.json) | +| [HRNet-w48](/configs/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640.py) | 640x640 | 0.679 | 0.869 | 0.731 | 0.753 | 0.688 | 0.593 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_4ea6031e-20230128.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/dekr/crowdpose/dekr_hrnet-w48_8xb5-300e_crowdpose-640x640_20230128.json) | diff --git a/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py b/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py index 3fd5b06d88..9f4f05061f 100644 --- a/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py +++ b/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_8xb64-210e_coco-256x256.py @@ -83,10 +83,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', - target_type='heatmap+keypoint_label', - encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] 
test_pipeline = [ diff --git a/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py b/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py index 6203eccb79..b892569686 100644 --- a/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py +++ b/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_debias-8xb64-210e_coco-256x256.py @@ -85,10 +85,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', - target_type='heatmap+keypoint_label', - encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py b/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py index b911f8b300..9dcf9cce72 100644 --- a/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py +++ b/configs/body_2d_keypoint/integral_regression/coco/ipr_res50_dsnt-8xb64-210e_coco-256x256.py @@ -83,10 +83,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', - target_type='heatmap+keypoint_label', - encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/configs/body_2d_keypoint/rtmpose/README.md b/configs/body_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000000..3037974917 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/README.md @@ -0,0 +1,39 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. 
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. + +## Results and Models + +### COCO Dataset + +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | AP | AR | Details and Download | +| :----------------: | :--------: | :---: | :---: | :---------------------------------------: | +| RTMPose-t | 256x192 | 0.682 | 0.736 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-s | 256x192 | 0.716 | 0.768 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-m | 256x192 | 0.746 | 0.795 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-l | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-t-aic-coco | 256x192 | 0.685 | 0.738 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-s-aic-coco | 256x192 | 0.722 | 0.772 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-m-aic-coco | 256x192 | 0.758 | 0.806 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-l-aic-coco | 256x192 | 0.765 | 0.813 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-m-aic-coco | 384x288 | 0.770 | 0.816 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | +| RTMPose-l-aic-coco | 384x288 | 0.773 | 0.819 | [rtmpose_coco.md](./coco/rtmpose_coco.md) | + +### MPII Dataset + +| Model | Input Size | PCKh@0.5 | 
PCKh@0.1 | Details and Download | +| :-------: | :--------: | :------: | :------: | :---------------------------------------: | +| RTMPose-m | 256x256 | 0.907 | 0.348 | [rtmpose_mpii.md](./mpii/rtmpose_mpii.md) | + +### CrowdPose Dataset + +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Model | Input Size | AP | AR | Details and Download | +| :-------: | :--------: | :---: | :---: | :------------------------------------------------------: | +| RTMPose-m | 256x192 | 0.706 | 0.788 | [rtmpose_crowdpose.md](./crowdpose/rtmpose_crowdpose.md) | diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..fabcd90344 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + 
use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + 
dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# 
data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py new file mode 100644 index 0000000000..cc9fb7a52e --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_aic-coco-384x288.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + 
+# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + 
test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', 
encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + 
type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..d9c180fe3a --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-l_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + 
widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', 
file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + 
ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..0fd70b7822 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + 
_scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] 
+val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=128 * 2, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + 
metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py new file mode 100644 index 0000000000..700de32aea --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_aic-coco-384x288.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, 
bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = 
dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=128 * 2, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..26bd52498a --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-m_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + 
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + 
dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + 
switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..be1eeea320 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,272 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + 
norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=128 * 2, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + 
num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..ab87d99148 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + 
type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines 
+train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..abf3692647 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,273 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + 
by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# 
f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # Turn off EMA while training the tiny model + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..634a7cc04d --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose-t_8xb256-420e_coco-256x192.py @@ -0,0 +1,233 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + 
checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + 
dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # Turn off EMA while training the tiny model + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - 
stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md b/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md new file mode 100644 index 0000000000..2b3d4447e1 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/coco/rtmpose_coco.md @@ -0,0 +1,71 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [rtmpose-t](./rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 0.682 | 0.883 | 0.759 | 0.736 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-coco_pt-aic-coco_420e-256x192-e613ba3f_20230127.json) | +| [rtmpose-s](./rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.716 | 0.892 | 0.789 | 0.768 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.json) | +| [rtmpose-m](./rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 0.746 | 0.899 | 0.817 | 0.795 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco_pt-aic-coco_420e-256x192-d8dd5ca4_20230127.json) | +| [rtmpose-l](./rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 0.758 | 0.906 | 0.826 | 0.806 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco_pt-aic-coco_420e-256x192-1352a4d2_20230127.json) | +| [rtmpose-t-aic-coco](./rtmpose-t_8xb256-420e_aic-coco-256x192.py) | 256x192 | 
0.685 | 0.880 | 0.761 | 0.738 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.json) | +| [rtmpose-s-aic-coco](./rtmpose-s_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.722 | 0.892 | 0.794 | 0.772 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.json) | +| [rtmpose-m-aic-coco](./rtmpose-m_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.758 | 0.903 | 0.826 | 0.806 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.json) | +| [rtmpose-l-aic-coco](./rtmpose-l_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.765 | 0.906 | 0.835 | 0.813 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.json) | +| [rtmpose-m-aic-coco](./rtmpose-m_8xb256-420e_aic-coco-384x288.py) | 384x288 | 0.770 | 0.908 | 0.833 | 0.816 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.json) | +| [rtmpose-l-aic-coco](./rtmpose-l_8xb256-420e_aic-coco-384x288.py) 
| 384x288 | 0.773 | 0.907 | 0.835 | 0.819 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.json) | diff --git a/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py b/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py new file mode 100644 index 0000000000..4f028fa1f5 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose-m_8xb64-210e_crowdpose-256x192.py @@ -0,0 +1,235 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 5e-4 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt',
+ arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=14, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='crowdpose/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + 
dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'crowdpose/annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md b/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md new file mode 100644 index 0000000000..35048ee9ef --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/crowdpose/rtmpose_crowdpose.md @@ -0,0 +1,60 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [rtmpose-m](./rtmpose-m_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.706 | 0.841 | 0.765 | 0.799 | 0.719 | 0.582 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-crowdpose_pt-aic-coco_210e-256x192-e6192cac_20230224.json) | diff --git a/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000000..dfaf384037 --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/mpii/rtmpose-m_8xb64-210e_mpii-256x256.py @@ -0,0 +1,228 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr =
dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=16, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/', +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), 
+ dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md b/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md new file mode 100644 index 0000000000..b9c8f5a6bd --- /dev/null +++ b/configs/body_2d_keypoint/rtmpose/mpii/rtmpose_mpii.md @@ -0,0 +1,43 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean / w. flip | Mean@0.1 | ckpt | log | +| :-------------------------------------------------- | :--------: | :------------: | :------: | :---------------------------------------------------------: | :--------------------------------------------------------: | +| [rtmpose-m](./rtmpose-m_8xb64-210e_mpii-256x256.py) | 256x256 | 0.907 | 0.348 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-mpii_pt-aic-coco_210e-256x256-ec4dbec8_20230206.json) | diff --git a/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml b/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml new file mode 100644 index 0000000000..6ee4a0fd48 --- /dev/null +++ b/configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml @@ -0,0 +1,25 @@ +Collections: +- Name: SimCC + Paper: + Title: A Simple Coordinate Classification Perspective for Human Pose Estimation + URL: https://arxiv.org/abs/2107.03332 + README: https://github.com/open-mmlab/mmpose/blob/1.x/docs/src/papers/algorithms/simcc.md +Models: +- Config: configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py + In Collection: SimCC + Metadata: + Architecture: &id001 + - SimCC + - MobilenetV2 + Training Data: COCO + Name: simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.62 + AP@0.5: 0.855 + AP@0.75: 0.697 + AR: 0.678 + AR@0.5: 0.902 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192-4b0703bb_20221010.pth diff --git a/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml b/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml new file mode 100644 index 0000000000..3ad0caddca --- /dev/null +++ 
b/configs/body_2d_keypoint/simcc/coco/resnet_coco.yml @@ -0,0 +1,41 @@ +Collections: +- Name: SimCC + Paper: + Title: A Simple Coordinate Classification Perspective for Human Pose Estimation + URL: https://arxiv.org/abs/2107.03332 + README: https://github.com/open-mmlab/mmpose/blob/1.x/docs/src/papers/algorithms/simcc.md +Models: +- Config: configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py + In Collection: SimCC + Metadata: + Architecture: &id001 + - SimCC + - ResNet + Training Data: COCO + Name: simcc_res50_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.721 + AP@0.5: 0.900 + AP@0.75: 0.798 + AR: 0.781 + AR@0.5: 0.937 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.pth +- Config: configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py + In Collection: SimCC + Metadata: + Architecture: *id001 + Training Data: COCO + Name: simcc_res50_8xb32-140e_coco-384x288 + Results: + - Dataset: COCO + Metrics: + AP: 0.735 + AP@0.5: 0.899 + AP@0.75: 0.800 + AR: 0.790 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288-45c3ba34_20220913.pth diff --git a/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py index 0999c99516..7bb6b47bda 100644 --- a/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/simcc/coco/simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py @@ -73,8 +73,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', target_type='keypoint_xy_label', encoder=codec), + 
dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py b/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py index a3446fef1b..02a10931be 100644 --- a/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py +++ b/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb32-140e_coco-384x288.py @@ -69,8 +69,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', target_type='keypoint_xy_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py index 51555d601f..27795e6dfe 100644 --- a/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/simcc/coco/simcc_res50_8xb64-210e_coco-256x192.py @@ -63,8 +63,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', target_type='keypoint_xy_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py index 1b24ac23b2..23ff3f34cf 100644 --- a/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py @@ -68,8 +68,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - 
type='GenerateTarget', target_type='keypoint_xy_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml b/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml new file mode 100644 index 0000000000..bc3b4a43de --- /dev/null +++ b/configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml @@ -0,0 +1,25 @@ +Collections: +- Name: SimCC + Paper: + Title: A Simple Coordinate Classification Perspective for Human Pose Estimation + URL: https://arxiv.org/abs/2107.03332 + README: https://github.com/open-mmlab/mmpose/blob/1.x/docs/src/papers/algorithms/simcc.md +Models: +- Config: configs/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192.py + In Collection: SimCC + Metadata: + Architecture: &id001 + - SimCC + - ViPNAS + Training Data: COCO + Name: simcc_vipnas-mbv3_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.695 + AP@0.5: 0.883 + AP@0.75: 0.772 + AR: 0.755 + AR@0.5: 0.927 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/simcc/coco/simcc_vipnas-mbv3_8xb64-210e_coco-256x192-719f3489_20220922.pth diff --git a/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py index d09d160764..b4ee1cf9a9 100644 --- a/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/simcc/mpii/simcc_res50_wo-deconv-8xb64-210e_mpii-256x256.py @@ -72,8 +72,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict( - type='GenerateTarget', target_type='keypoint_xy_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -116,8 +115,8 @@ test_dataloader = 
val_dataloader # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # evaluators -val_evaluator = dict(type='MpiiPCKAccuracy', norm_item='head') +val_evaluator = dict(type='MpiiPCKAccuracy') test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/README.md b/configs/body_2d_keypoint/topdown_heatmap/README.md index 68948d16cb..9e23b874bc 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/README.md +++ b/configs/body_2d_keypoint/topdown_heatmap/README.md @@ -14,40 +14,46 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Model | Input Size | AP | AR | Details and Download | | :-------------: | :--------: | :---: | :---: | :-------------------------------------------------: | +| ViTPose-h | 256x192 | 0.790 | 0.840 | [vitpose_coco.md](./coco/vitpose_coco.md) | | HRNet-w48+UDP | 256x192 | 0.768 | 0.817 | [hrnet_udp_coco.md](./coco/hrnet_udp_coco.md) | | MSPN 4-stg | 256x192 | 0.765 | 0.826 | [mspn_coco.md](./coco/mspn_coco.md) | | HRNet-w48+Dark | 256x192 | 0.764 | 0.814 | [hrnet_dark_coco.md](./coco/hrnet_dark_coco.md) | | HRNet-w48 | 256x192 | 0.756 | 0.809 | [hrnet_coco.md](./coco/hrnet_coco.md) | | HRFormer-B | 256x192 | 0.754 | 0.807 | [hrformer_coco.md](./coco/hrformer_coco.md) | -| RSN-50-3x | 256x192 | 0.749 | 0.812 | [rsn_coco.md](./coco/rsn_coco.md) | +| RSN-50-3x | 256x192 | 0.750 | 0.814 | [rsn_coco.md](./coco/rsn_coco.md) | +| CSPNeXt-l | 256x192 | 0.750 | 0.800 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | | HRNet-w32 | 256x192 | 0.749 | 0.804 | [hrnet_coco.md](./coco/hrnet_coco.md) | | Swin-L | 256x192 | 0.743 | 0.798 | [swin_coco.md](./coco/swin_coco.md) | +| ViTPose-s | 256x192 | 0.739 | 0.792 | [vitpose_coco.md](./coco/vitpose_coco.md) | | HRFormer-S | 256x192 | 0.738 | 0.793 | [hrformer_coco.md](./coco/hrformer_coco.md) | | Swin-B | 256x192 | 0.737 | 0.794 | 
[swin_coco.md](./coco/swin_coco.md) | | SEResNet-101 | 256x192 | 0.734 | 0.790 | [seresnet_coco.md](./coco/seresnet_coco.md) | | SCNet-101 | 256x192 | 0.733 | 0.789 | [scnet_coco.md](./coco/scnet_coco.md) | -| ResNet-101+Dark | 256x192 | 0.732 | 0.786 | [resnet_dark_coco.md](./coco/resnet_dark_coco.md) | -| ResNetV1d-101 | 256x192 | 0.731 | 0.786 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) | +| ResNet-101+Dark | 256x192 | 0.733 | 0.786 | [resnet_dark_coco.md](./coco/resnet_dark_coco.md) | +| CSPNeXt-m | 256x192 | 0.732 | 0.785 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | +| ResNetV1d-101 | 256x192 | 0.732 | 0.785 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) | | SEResNet-50 | 256x192 | 0.729 | 0.784 | [seresnet_coco.md](./coco/seresnet_coco.md) | | SCNet-50 | 256x192 | 0.728 | 0.784 | [scnet_coco.md](./coco/scnet_coco.md) | -| ResNet-101 | 256x192 | 0.726 | 0.781 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNet-101 | 256x192 | 0.726 | 0.783 | [resnet_coco.md](./coco/resnet_coco.md) | | ResNeXt-101 | 256x192 | 0.726 | 0.781 | [resnext_coco.md](./coco/resnext_coco.md) | -| RSN-50 | 256x192 | 0.726 | 0.781 | [rsn_coco.md](./coco/rsn_coco.md) | | HourglassNet | 256x256 | 0.726 | 0.780 | [hourglass_coco.md](./coco/hourglass_coco.md) | | ResNeSt-101 | 256x192 | 0.725 | 0.781 | [resnest_coco.md](./coco/resnest_coco.md) | +| RSN-50 | 256x192 | 0.724 | 0.790 | [rsn_coco.md](./coco/rsn_coco.md) | | Swin-T | 256x192 | 0.724 | 0.782 | [swin_coco.md](./coco/swin_coco.md) | | MSPN 1-stg | 256x192 | 0.723 | 0.788 | [mspn_coco.md](./coco/mspn_coco.md) | | ResNetV1d-50 | 256x192 | 0.722 | 0.777 | [resnetv1d_coco.md](./coco/resnetv1d_coco.md) | | ResNeSt-50 | 256x192 | 0.720 | 0.775 | [resnest_coco.md](./coco/resnest_coco.md) | -| ResNet-50 | 256x192 | 0.718 | 0.773 | [resnet_coco.md](./coco/resnet_coco.md) | +| ResNet-50 | 256x192 | 0.718 | 0.774 | [resnet_coco.md](./coco/resnet_coco.md) | | ResNeXt-50 | 256x192 | 0.715 | 0.771 | 
[resnext_coco.md](./coco/resnext_coco.md) | | PVT-S | 256x192 | 0.714 | 0.773 | [pvt_coco.md](./coco/pvt_coco.md) | +| CSPNeXt-s | 256x192 | 0.697 | 0.753 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | | LiteHRNet-30 | 256x192 | 0.676 | 0.736 | [litehrnet_coco.md](./coco/litehrnet_coco.md) | -| MobileNet-v2 | 256x192 | 0.647 | 0.708 | [mobilenetv2_coco.md](./coco/mobilenetv2_coco.md) | +| CSPNeXt-tiny | 256x192 | 0.665 | 0.723 | [cspnext_udp_coco.md](./coco/cspnext_udp_coco.md) | +| MobileNet-v2 | 256x192 | 0.648 | 0.709 | [mobilenetv2_coco.md](./coco/mobilenetv2_coco.md) | | LiteHRNet-18 | 256x192 | 0.642 | 0.705 | [litehrnet_coco.md](./coco/litehrnet_coco.md) | -| CPM | 256x192 | 0.623 | 0.685 | [cpm_coco.md](./coco/cpm_coco.md) | -| ShuffleNet-v2 | 256x192 | 0.598 | 0.664 | [shufflenetv2_coco.md](./coco/shufflenetv2_coco.md) | -| ShuffleNet-v1 | 256x192 | 0.586 | 0.651 | [shufflenetv1_coco.md](./coco/shufflenetv1_coco.md) | +| CPM | 256x192 | 0.627 | 0.689 | [cpm_coco.md](./coco/cpm_coco.md) | +| ShuffleNet-v2 | 256x192 | 0.602 | 0.668 | [shufflenetv2_coco.md](./coco/shufflenetv2_coco.md) | +| ShuffleNet-v1 | 256x192 | 0.587 | 0.654 | [shufflenetv1_coco.md](./coco/shufflenetv1_coco.md) | | AlexNet | 256x192 | 0.448 | 0.521 | [alexnet_coco.md](./coco/alexnet_coco.md) | ### MPII Dataset @@ -55,6 +61,7 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Model | Input Size | PCKh@0.5 | PCKh@0.1 | Details and Download | | :------------: | :--------: | :------: | :------: | :-------------------------------------------------: | | HRNet-w48+Dark | 256x256 | 0.905 | 0.360 | [hrnet_dark_mpii.md](./mpii/hrnet_dark_mpii.md) | +| CSPNeXt-m | 256x256 | 0.902 | 0.303 | [cspnext_udp_mpii.md](./mpii/cspnext_udp_mpii.md) | | HRNet-w48 | 256x256 | 0.901 | 0.337 | [hrnet_mpii.md](./mpii/hrnet_mpii.md) | | HRNet-w32 | 256x256 | 0.900 | 0.334 | [hrnet_mpii.md](./mpii/hrnet_mpii.md) | | HourglassNet | 256x256 | 0.889 | 0.317 | 
[hourglass_mpii.md](./mpii/hourglass_mpii.md) | @@ -76,11 +83,12 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector -| Model | Input Size | AP | AR | Details and Download | -| :--------: | :--------: | :---: | :---: | :----------------------------------------------------: | -| HRNet-w32 | 256x192 | 0.675 | 0.816 | [hrnet_crowdpose.md](./crowdpose/hrnet_crowdpose.md) | -| ResNet-101 | 256x192 | 0.647 | 0.800 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) | -| HRNet-w32 | 256x192 | 0.637 | 0.785 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) | +| Model | Input Size | AP | AR | Details and Download | +| :--------: | :--------: | :---: | :---: | :--------------------------------------------------------: | +| HRNet-w32 | 256x192 | 0.675 | 0.816 | [hrnet_crowdpose.md](./crowdpose/hrnet_crowdpose.md) | +| CSPNeXt-m | 256x192 | 0.662 | 0.755 | [cspnext_udp_crowdpose.md](./crowdpose/cspnext_udp_crowdpose.md) | +| ResNet-101 | 256x192 | 0.647 | 0.800 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) | +| HRNet-w32 | 256x192 | 0.637 | 0.785 | [resnet_crowdpose.md](./crowdpose/resnet_crowdpose.md) | ### AIC Dataset diff --git a/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py index decc68e04a..78f62d7d3f 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_hrnet-w32_8xb64-210e_aic-256x192.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ 
diff --git a/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py index 82ce831123..6cd7ffea02 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/aic/td-hm_res101_8xb64-210e_aic-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md index 6fb0d1f342..3d4453a369 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cpm_coco.md @@ -37,5 +37,5 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py) | 256x192 | 0.623 | 0.858 | 0.706 | 0.685 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192-aa4ba095_20200817.pth) | [log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_256x192_20200817.log.json) | -| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py) | 384x288 | 0.650 | 0.863 | 0.725 | 0.707 | 0.905 | [ckpt](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_384x288-80feb4bc_20200821.pth) | 
[log](https://download.openmmlab.com/mmpose/top_down/cpm/cpm_coco_384x288_20200821.log.json) | +| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py) | 256x192 | 0.627 | 0.862 | 0.709 | 0.689 | 0.906 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192-0e978875_20220920.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192_20220920.log) | +| [cpm](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py) | 384x288 | 0.652 | 0.865 | 0.730 | 0.710 | 0.907 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288-165487b8_20221011.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288_20221011.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000000..c641e90441 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 
2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines 
+train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + 
dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git 
a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..70b07f7f40 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 
'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], 
use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000000..a126afd469 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + 
deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', 
file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + 
dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..70f1c63bf8 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs 
// 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + 
dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # 
bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000000..354d38dc74 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = 
dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-s_imagenet_600e-ea671761.pth')), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + 
data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 
0000000000..c665f68831 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-s_imagenet_600e-ea671761.pth')), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + 
flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + 
min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py new file mode 100644 index 0000000000..b2660428e4 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py @@ -0,0 +1,284 @@ +_base_ = 
['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.0), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 
'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-tiny_imagenet_600e-3a2dd350.pth')), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=19, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', 
input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type='RepeatDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_train2017.json', + data_prefix=dict(img='detection/coco/train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], + ), + times=3) + +dataset_aic = dict( + type='AicDataset', + data_root=data_root, + data_mode=data_mode, + ann_file='aic/annotations/aic_train.json', + data_prefix=dict(img='pose/ai_challenge/ai_challenger_keypoint' + '_train_20170902/keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='coco/annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + 
data_prefix=dict(img='detection/coco/val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..caed0b553a --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', 
input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-tiny_imagenet_600e-3a2dd350.pth')), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', 
encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + 
checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md new file mode 100644 index 0000000000..fbcdff73a8 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext_udp_coco.md @@ -0,0 +1,69 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [pose_cspnext_t_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.665 | 0.874 | 0.723 | 0.723 | 0.917 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_udp-coco_pt-in1k_210e-256x192-0908dd2d_20230123.json) | +| [pose_cspnext_s_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.697 | 0.886 | 0.776 | 0.753 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_udp-coco_pt-in1k_210e-256x192-92dbfc1d_20230123.json) | +| [pose_cspnext_m_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.732 | 0.896 | 0.806 | 0.785 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-coco_pt-in1k_210e-256x192-95f5967e_20230123.json) | +| [pose_cspnext_l_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_coco-256x192.py) | 256x192 | 0.750 | 0.904 | 0.822 | 0.800 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_udp-coco_pt-in1k_210e-256x192-661cdd8c_20230123.json) | +| [pose_cspnext_t_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-tiny_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.655 | 0.884 | 0.731 | 0.689 | 0.890 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.json) | +| [pose_cspnext_s_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-s_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.700 | 0.905 | 0.783 | 0.733 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.json) | +| [pose_cspnext_m_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-m_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.748 | 0.925 | 0.818 | 0.777 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.json) | +| [pose_cspnext_l_udp_aic_coco](/configs/body_2d_keypoint/topdown_heatmap/coco/cspnext-l_udp_8xb256-210e_aic-coco-256x192.py) | 256x192 | 0.772 | 0.936 | 0.839 | 0.799 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.json) | + +Note that, UDP also adopts the unbiased encoding/decoding algorithm of 
[DARK](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/techniques.html#darkpose-cvpr-2020). + +Flip test and the detector are not used in the results of aic-coco training. diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml b/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml new file mode 100644 index 0000000000..51e7f56ac6 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml @@ -0,0 +1,40 @@ +Collections: +- Name: Hourglass + Paper: + Title: Stacked hourglass networks for human pose estimation + URL: https://link.springer.com/chapter/10.1007/978-3-319-46484-8_29 + README: https://github.com/open-mmlab/mmpose/blob/1.x/docs/src/papers/backbones/hourglass.md +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py + In Collection: Hourglass + Metadata: + Architecture: &id001 + - Hourglass + Training Data: COCO + Name: td-hm_hourglass52_8xb32-210e_coco-256x256 + Results: + - Dataset: COCO + Metrics: + AP: 0.726 + AP@0.5: 0.896 + AP@0.75: 0.799 + AR: 0.780 + AR@0.5: 0.934 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_256x256-4ec713ba_20200709.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py + In Collection: Hourglass + Metadata: + Architecture: *id001 + Training Data: COCO + Name: td-hm_hourglass52_8xb32-210e_coco-384x384 + Results: + - Dataset: COCO + Metrics: + AP: 0.746 + AP@0.5: 0.900 + AP@0.75: 0.812 + AR: 0.797 + AR@0.5: 0.939 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/top_down/hourglass/hourglass52_coco_384x384-be91ba2b_20200812.pth diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml b/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml index 0131493c15..86a305d223 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml +++ 
b/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml @@ -7,6 +7,7 @@ Collections: Models: - Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py In Collection: HRNet + Alias: human Metadata: Architecture: &id001 - HRNet diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md b/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md new file mode 100644 index 0000000000..fd88e25e64 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco_aic.md @@ -0,0 +1,61 @@ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +
+AI Challenger (ArXiv'2017) + +```bibtex +@article{wu2017ai, + title={Ai challenger: A large-scale dataset for going deeper in image understanding}, + author={Wu, Jiahong and Zheng, He and Zhao, Bo and Li, Yixin and Yan, Baoming and Liang, Rui and Wang, Wenjia and Zhou, Shipei and Lin, Guosen and Fu, Yanwei and others}, + journal={arXiv preprint arXiv:1711.06475}, + year={2017} +} +``` + +
+ +MMPose supports training models with combined datasets. [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) and [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) are two examples. + +- [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) leverages AIC data with partial keypoints as auxiliary data to train a COCO model. +- [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) constructs a combined dataset whose keypoints are the union of COCO and AIC keypoints to train a model that predicts keypoints of both datasets. + +Evaluation results on COCO val2017 of models trained solely on the COCO dataset and on the combined datasets are shown below. These models are evaluated with a detector having human AP of 56.4 on the COCO val2017 dataset. 
+ +| Train Set | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :------------------------------------------- | :------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------: | :------------------------------------: | +| [coco](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) | pose_hrnet_w32 | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | +| [coco-aic-merge](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) | pose_hrnet_w32 | 256x192 | 0.757 | 0.907 | 0.829 | 0.809 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge-b05435b9_20221025.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge_20221025.log) | +| [coco-aic-combine](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) | pose_hrnet_w32 | 256x192 | 0.756 | 0.906 | 0.826 | 0.807 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine-4ce66880_20221026.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine_20221026.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md index 8f4e67597f..34e05740fb 100644 --- 
a/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_udp_coco.md @@ -58,6 +58,6 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | [pose_hrnet_w32_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py) | 384x288 | 0.768 | 0.909 | 0.832 | 0.815 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288-9a3f7c85_20220914.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288_20220914.log) | | [pose_hrnet_w48_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py) | 256x192 | 0.768 | 0.908 | 0.833 | 0.817 | 0.945 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192-3feaef8f_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192_20220913.log) | | [pose_hrnet_w48_udp](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py) | 384x288 | 0.773 | 0.911 | 0.836 | 0.821 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288-70d7ab01_20220913.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288_20220913.log) | -| [pose_hrnet_w32_udp_regress](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.907 | 0.824 | 0.812 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192-9c0b77b4_20220926.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192_20220226.log) | +| [pose_hrnet_w32_udp_regress](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py) | 256x192 | 0.759 | 0.907 | 0.827 | 0.813 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192-9c0b77b4_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192_20220226.log) | Note that, UDP also adopts the unbiased encoding/decoding algorithm of [DARK](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/techniques.html#darkpose-cvpr-2020). diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md index 87e31aa5f9..aed9fd0246 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/mobilenetv2_coco.md @@ -37,5 +37,5 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.647 | 0.874 | 0.723 | 0.708 | 0.917 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_coco_256x192-d1e58e7b_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_coco_256x192_20200727.log.json) | -| 
[pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.673 | 0.879 | 0.741 | 0.728 | 0.916 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_coco_384x288-26be4816_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_coco_384x288_20200727.log.json) | +| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.648 | 0.874 | 0.725 | 0.709 | 0.918 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192-55a04c35_20221016.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192_20221016.log) | +| [pose_mobilenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.677 | 0.882 | 0.746 | 0.734 | 0.920 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288-d3ab1457_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288_20221013.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md index f2a2969bea..f0f7df3497 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_coco.md @@ -54,9 +54,9 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | 
:-------------------------------------------: | -| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.718 | 0.898 | 0.795 | 0.773 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192_20200709.log.json) | -| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py) | 384x288 | 0.731 | 0.900 | 0.799 | 0.783 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_384x288-e6f795e9_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_384x288_20200709.log.json) | -| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py) | 256x192 | 0.726 | 0.899 | 0.806 | 0.781 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_256x192-6e6babf0_20200708.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_256x192_20200708.log.json) | -| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.906 | 0.817 | 0.799 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_384x288-8c71bdc9_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_384x288_20200709.log.json) | -| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py) | 256x192 | 0.735 | 0.904 | 0.812 | 0.789 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_256x192-f6e307c2_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_256x192_20200709.log.json) | -| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py) | 384x288 
| 0.750 | 0.908 | 0.821 | 0.800 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_384x288-3860d4c9_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_384x288_20200709.log.json) | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.718 | 0.898 | 0.796 | 0.774 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192-04af38ce_20220923.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192_20220923.log) | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py) | 384x288 | 0.731 | 0.900 | 0.799 | 0.782 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288-7b8db90e_20220923.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288_20220923.log) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py) | 256x192 | 0.728 | 0.904 | 0.809 | 0.783 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_256x192-6e6babf0_20200708.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_256x192_20200708.log.json) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py) | 384x288 | 0.749 | 0.906 | 0.817 | 0.799 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192-065d3625_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192_20220926.log) | +| 
[pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py) | 256x192 | 0.736 | 0.904 | 0.818 | 0.791 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192-0345f330_20220928.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192_20220928.log) | +| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py) | 384x288 | 0.750 | 0.908 | 0.821 | 0.800 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288-7fbb906f_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288_20220927.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md index 7433e027cd..6f1b0107f3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_dark_coco.md @@ -71,9 +71,9 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.724 | 0.898 | 0.800 | 0.777 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192_dark-43379d20_20200709.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192_dark_20200709.log.json) | -| 
[pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.734 | 0.900 | 0.801 | 0.785 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_384x288_dark-33d3e5e5_20210203.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_384x288_dark_20210203.log.json) | -| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.732 | 0.899 | 0.807 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_256x192_dark-64d433e6_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_256x192_dark_20200812.log.json) | -| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.902 | 0.817 | 0.799 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_384x288_dark-cb45c88d_20210203.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_coco_384x288_dark_20210203.log.json) | -| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py) | 256x192 | 0.744 | 0.904 | 0.821 | 0.797 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_256x192_dark-ab4840d5_20200812.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_256x192_dark_20200812.log.json) | -| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py) | 384x288 | 0.756 | 0.909 | 0.826 | 0.805 | 0.944 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_384x288_dark-d3b8ebd7_20210203.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_coco_384x288_dark_20210203.log.json) | +| 
[pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.724 | 0.897 | 0.797 | 0.777 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192-c129dcb6_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192_20220926.log) | +| [pose_resnet_50_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.735 | 0.902 | 0.801 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288-8b90b538_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288_20220926.log) | +| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py) | 256x192 | 0.733 | 0.900 | 0.810 | 0.786 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192-528ec248_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192_20220926.log) | +| [pose_resnet_101_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py) | 384x288 | 0.749 | 0.905 | 0.818 | 0.799 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288-487d40a4_20220926.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288_20220926.log) | +| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py) | 256x192 | 0.743 | 
0.906 | 0.819 | 0.796 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192-f754df5f_20221031.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192_20221031.log) | +| [pose_resnet_152_dark](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py) | 384x288 | 0.755 | 0.907 | 0.825 | 0.805 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288-329f8454_20221031.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288_20221031.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md index ae8d2af8d8..2731ca8534 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/resnet_fp16_coco.md @@ -70,4 +70,4 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_resnet_50_fp16](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py) | 256x192 | 0.716 | 0.897 | 0.793 | 0.772 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192_fp16_dynamic-6edb79f3_20210430.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192_fp16_dynamic_20210430.log.json) | +| 
[pose_resnet_50_fp16](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192.py) | 256x192 | 0.716 | 0.898 | 0.798 | 0.772 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192-463da051_20220927.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_fp16-8xb64-210e_coco-256x192_20220927.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md index 1c9846ace5..1067201532 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/resnetv1d_coco.md @@ -37,9 +37,9 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py) | 256x192 | 0.722 | 0.897 | 0.798 | 0.777 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_coco_256x192-a243b840_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_coco_256x192_20200727.log.json) | -| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py) | 384x288 | 0.730 | 0.900 | 0.799 | 0.781 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_coco_384x288-01f3fbb9_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d50_coco_384x288_20200727.log.json) | -| 
[pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py) | 256x192 | 0.731 | 0.900 | 0.809 | 0.786 | 0.937 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_coco_256x192-5bd08cab_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_coco_256x192_20200727.log.json) | -| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py) | 384x288 | 0.748 | 0.902 | 0.818 | 0.799 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_coco_384x288-5f9e421d_20200730.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d101_coco_384x288-20200730.log.json) | -| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.902 | 0.813 | 0.791 | 0.938 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_coco_256x192-c4df51dc_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_coco_256x192_20200727.log.json) | -| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py) | 384x288 | 0.751 | 0.907 | 0.820 | 0.802 | 0.943 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_coco_384x288-626c622d_20200730.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnetv1d/resnetv1d152_coco_384x288-20200730.log.json) | +| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py) | 256x192 | 0.722 | 0.897 | 0.796 | 0.777 | 0.936 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192-27545d63_20221020.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192_20221020.log) | +| [pose_resnetv1d_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py) | 384x288 | 0.730 | 0.899 | 0.800 | 0.782 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288-0646b46e_20221020.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288_20221020.log) | +| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py) | 256x192 | 0.732 | 0.901 | 0.808 | 0.785 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192-ee9e7212_20221021.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192_20221021.log) | +| [pose_resnetv1d_101](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py) | 384x288 | 0.748 | 0.906 | 0.817 | 0.798 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288-d0b5875f_20221028.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288_20221028.log) | +| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py) | 256x192 | 0.737 | 0.904 | 0.814 | 0.790 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192-fd49f947_20221021.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192_20221021.log) | +| [pose_resnetv1d_152](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py) | 384x288 | 0.751 | 0.907 | 0.821 | 0.801 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288-b9a99602_20221022.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288_20221022.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md index 95fbe69193..40f570c3c1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/rsn_coco.md @@ -38,7 +38,7 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [rsn_18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py) | 256x192 | 0.703 | 0.887 | 0.780 | 0.770 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/top_down/rsn/rsn18_coco_256x192-72f4b4a7_20201127.pth) | [log](https://download.openmmlab.com/mmpose/top_down/rsn/rsn18_coco_256x192_20201127.log.json) | -| [rsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.722 | 0.895 | 0.799 | 0.787 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/top_down/rsn/rsn50_coco_256x192-72ffe709_20201127.pth) | [log](https://download.openmmlab.com/mmpose/top_down/rsn/rsn50_coco_256x192_20201127.log.json) 
| -| [2xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.746 | 0.898 | 0.818 | 0.809 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/top_down/rsn/2xrsn50_coco_256x192-50648f0e_20201127.pth) | [log](https://download.openmmlab.com/mmpose/top_down/rsn/2xrsn50_coco_256x192_20201127.log.json) | -| [3xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.749 | 0.899 | 0.824 | 0.812 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/top_down/rsn/3xrsn50_coco_256x192-58f57a68_20201127.pth) | [log](https://download.openmmlab.com/mmpose/top_down/rsn/3xrsn50_coco_256x192_20201127.log.json) | +| [rsn_18](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py) | 256x192 | 0.704 | 0.887 | 0.781 | 0.773 | 0.927 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192-9049ed09_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192_20221013.log) | +| [rsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.724 | 0.894 | 0.799 | 0.790 | 0.935 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192-c35901d5_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192_20221013.log) | +| [2xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.748 | 0.900 | 0.821 | 0.810 | 0.939 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192-9ede341e_20221013.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192_20221013.log) | +| [3xrsn_50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py) | 256x192 | 0.750 | 0.900 | 0.824 | 0.814 | 0.941 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192-c3e3c4fe_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192_20221013.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md index 8d576547c2..0c8be860ab 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv1_coco.md @@ -37,5 +37,5 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py) | 256x192 | 0.586 | 0.846 | 0.650 | 0.651 | 0.897 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_coco_256x192-353bc02c_20200727.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_coco_256x192_20200727.log.json) | -| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py) | 384x288 | 0.622 | 0.859 | 0.683 | 0.684 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_coco_384x288-b2930b24_20200804.pth) | 
[log](https://download.openmmlab.com/mmpose/top_down/shufflenetv1/shufflenetv1_coco_384x288_20200804.log.json) | +| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py) | 256x192 | 0.587 | 0.849 | 0.654 | 0.654 | 0.896 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192-7a7ea4f4_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192_20221013.log) | +| [pose_shufflenetv1](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py) | 384x288 | 0.626 | 0.862 | 0.696 | 0.687 | 0.903 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288-8342f8ba_20221013.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288_20221013.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md index 59fdcd0d8c..f613f4fef1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/shufflenetv2_coco.md @@ -37,5 +37,5 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.598 | 0.852 | 0.667 | 0.664 | 0.899 | 
[ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_coco_256x192-0aba71c7_20200921.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_coco_256x192_20200921.log.json) | -| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.636 | 0.865 | 0.703 | 0.697 | 0.910 | [ckpt](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_coco_384x288-fb38ac3a_20200921.pth) | [log](https://download.openmmlab.com/mmpose/top_down/shufflenetv2/shufflenetv2_coco_384x288_20200921.log.json) | +| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py) | 256x192 | 0.602 | 0.857 | 0.672 | 0.668 | 0.902 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192-51fb931e_20221014.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192_20221014.log) | +| [pose_shufflenetv2](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py) | 384x288 | 0.638 | 0.866 | 0.707 | 0.699 | 0.910 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288-d30ab55c_20221014.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288_20221014.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py index 8787c7093a..982b327a24 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xmspn50_8xb32-210e_coco-256x192.py @@ -101,9 +101,7 @@ 
dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py index 16113b1681..b175e60161 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_2xrsn50_8xb32-210e_coco-256x192.py @@ -99,9 +99,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py index 97c59f1837..9fcff191c5 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xmspn50_8xb32-210e_coco-256x192.py @@ -101,9 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py 
index eb3d93da39..9835e66ea1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_3xrsn50_8xb32-210e_coco-256x192.py @@ -99,9 +99,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py index eb145f56ad..4533f40383 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_4xmspn50_8xb32-210e_coco-256x192.py @@ -101,9 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..b3edbfa4b2 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,151 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + 
type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='base', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.3, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_base.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + extra=dict(upsample=4, final_conv_kernel=3), + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + 
dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..f1fbd2d857 --- /dev/null +++ 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py @@ -0,0 +1,149 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='base', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.3, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_base.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + 
deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 
'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..797192cb25 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,151 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.85, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + 
type='mmcls.VisionTransformer', + arch='huge', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.55, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_huge.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + extra=dict(upsample=4, final_conv_kernel=3), + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..43df966568 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py @@ -0,0 +1,149 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.85, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP',
rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='huge', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.55, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_huge.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1280, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..9413665e6a --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,151 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=24, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR',
begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='large', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.5, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_large.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + extra=dict(upsample=4, final_conv_kernel=3), + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..3f67f9999f --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py @@ -0,0 +1,149 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=24, + 
layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch='large', + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.5, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_large.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip',
direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..fdd8428891 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py @@ -0,0 +1,156 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# 
runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch={ + 'embed_dims': 384, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 384 * 4 + }, + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.1, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_small.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + deconv_out_channels=[], + deconv_kernel_sizes=[], +
loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec, + extra=dict(upsample=4, final_conv_kernel=3), + ), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') 
+test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..f50ce7a9c7 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py @@ -0,0 +1,154 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_mult=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmcls.VisionTransformer', + arch={ + 'embed_dims': 384, +
'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 384 * 4 + }, + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.1, + with_cls_token=False, + output_cls_token=False, + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='pretrained/mae_pretrain_vit_small.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py index 97675a491b..c7bfbb283c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_alexnet_8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py index 1f8d24b6a8..be4dd779e8 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb32-210e_coco-384x288.py @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py index 76fbc98482..7c5cd3ef21 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_cpm_8xb64-210e_coco-256x192.py @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py index f0ffd6851a..4ca4fe5cea 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-256x256.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py index 003f835f19..d165ec582b 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hourglass52_8xb32-210e_coco-384x384.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', 
encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py index 1d7851f7e3..af1397bf73 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-256x192.py @@ -10,9 +10,9 @@ lr=5e-4, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={'relative_position_bias_table': dict( - decay_mult=0.)}))) + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) # learning policy param_scheduler = [ @@ -120,7 +120,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py index 227cb4af6a..97d80e94aa 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-base_8xb32-210e_coco-384x288.py @@ -10,9 +10,9 @@ lr=5e-4, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={'relative_position_bias_table': dict( - decay_mult=0.)}))) + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) # learning policy param_scheduler = [ @@ -120,7 +120,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', 
target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py index ef425a8702..1d5123f44d 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-256x192.py @@ -10,9 +10,9 @@ lr=5e-4, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={'relative_position_bias_table': dict( - decay_mult=0.)}))) + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) # learning policy param_scheduler = [ @@ -120,7 +120,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py index bdb8158e63..4fb03097bb 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrformer-small_8xb32-210e_coco-384x288.py @@ -10,9 +10,9 @@ lr=5e-4, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={'relative_position_bias_table': dict( - decay_mult=0.)}))) + ), + paramwise_cfg=dict( + custom_keys={'relative_position_bias_table': dict(decay_mult=0.)})) # learning policy param_scheduler = [ @@ -120,7 +120,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', 
input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py index 1e74ecf4b6..3024e15842 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py index be294541ef..ffa6dcfef7 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py new file mode 100644 index 0000000000..428f27445a --- /dev/null +++ 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py @@ -0,0 +1,221 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=3)) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# keypoint mappings +keypoint_mapping_coco = [ + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5), + (6, 6), + (7, 7), + (8, 8), + (9, 9), + (10, 10), + (11, 11), + (12, 12), + (13, 13), + (14, 14), + (15, 15), + (16, 16), +] + +keypoint_mapping_aic = [ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + (12, 17), + (13, 18), +] + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 
128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=19, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + output_keypoint_indices=[ + target for _, target in keypoint_mapping_coco + ])) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_coco) + ], +) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, + mapping=keypoint_mapping_aic) 
+ ], +) + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py new file mode 100644 index 0000000000..6a914f2af3 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py @@ -0,0 +1,187 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +optim_wrapper = dict(optimizer=dict( + type='Adam', + lr=5e-4, +)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = 
dict(base_batch_size=512) + +# hooks +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +# codec settings +codec = dict( + type='MSRAHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=True, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args={{_base_.file_client_args}}), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +# train datasets +dataset_coco = dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[], +) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + data_mode=data_mode, + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, + mapping=[ + (0, 6), + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CombinedDataset', + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + pipeline=train_pipeline, + test_mode=False, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py index f439cd780a..7f9f849496 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_coarsedropout-8xb64-210e_coco-256x192.py @@ -115,7 +115,7 @@ min_width=10, p=0.5), ]), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py index 343ab6acb8..65bf94a485 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-256x192.py @@ -104,7 +104,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py index 9787245ab2..833daccc50 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_dark-8xb64-210e_coco-384x288.py @@ -104,7 +104,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', 
encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py index d3e1de499b..5f37e4546b 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_gridmask-8xb64-210e_coco-256x192.py @@ -112,7 +112,7 @@ random_offset=True, p=0.5), ]), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py index f3a151101c..9db0c1b550 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_photometric-8xb64-210e_coco-256x192.py @@ -103,7 +103,7 @@ dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), dict(type='PhotometricDistortion'), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py index 3611f86caa..91c2d4d27a 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-256x192.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), 
dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py index ca2568cd98..053cb26bd5 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-8xb64-210e_coco-384x288.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py index 1940e18eeb..787df4766f 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_udp-regress-8xb64-210e_coco-256x192.py @@ -105,7 +105,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py index daa38bc1e0..5d9e3485e3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py index 4e0ae4886d..b2673662f7 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py index ec930e4077..08a462df8d 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-256x192.py @@ -104,7 +104,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + 
dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py index 70da5342f3..86247be2ca 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_dark-8xb32-210e_coco-384x288.py @@ -104,7 +104,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py index 0f31fc7b48..f836385529 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-256x192.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py index 61df53b2a9..c43fe2a5e4 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py +++ 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_udp-8xb32-210e_coco-384x288.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py index 5a0624cf51..c3508b190b 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb32-210e_coco-384x288.py @@ -90,7 +90,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py index 2692032435..2bfef92457 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-18_8xb64-210e_coco-256x192.py @@ -90,7 +90,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py index f73ec9f59f..1871d8eb8e 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb32-210e_coco-384x288.py @@ -90,7 +90,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py index 2739da6967..bb63b09731 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_litehrnet-30_8xb64-210e_coco-256x192.py @@ -90,7 +90,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py index f0e1ff5f16..73d350e988 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-256x192.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', 
encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py index a1933ee9d9..eb54e8bceb 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mobilenetv2_8xb64-210e_coco-384x288.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py index 5e65ceebdd..6ce2d032bb 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_mspn50_8xb32-210e_coco-256x192.py @@ -101,9 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py index 35c195505c..bf956792d5 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvt-s_8xb64-210e_coco-256x192.py @@ -76,7 +76,7 @@ dict(type='RandomHalfBody'), 
dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py index c707be751e..dc9e07b3a6 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_pvtv2-b2_8xb64-210e_coco-256x192.py @@ -77,7 +77,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py index 9341fde3ac..eaafa87acc 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb32-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -96,7 +96,7 @@ pipeline=train_pipeline, )) val_dataloader = dict( - batch_size=32, + batch_size=64, num_workers=2, persistent_workers=True, drop_last=False, diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py 
index 0c555ebe06..8f384fdbf3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py index 1888dd98f4..7f3667b551 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-256x192.py @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py index 6041a3eb90..3cd98097b6 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res101_dark-8xb64-210e_coco-384x288.py @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py index 05a5024f43..f99204d5ea 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py index c73bc56be4..e273b245f5 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_8xb32-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py index 2942f00f75..2906adb4a6 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-256x192.py @@ -24,7 +24,7 @@ ] # automatically scaling LR based on the actual training batch size -auto_scale_lr = dict(base_batch_size=512) +auto_scale_lr = 
dict(base_batch_size=256) # hooks default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py index 438e757728..4800a0cfe9 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res152_dark-8xb32-210e_coco-384x288.py @@ -24,7 +24,7 @@ ] # automatically scaling LR based on the actual training batch size -auto_scale_lr = dict(base_batch_size=512) +auto_scale_lr = dict(base_batch_size=256) # hooks default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) @@ -35,7 +35,8 @@ input_size=(288, 384), heatmap_size=(72, 96), sigma=3, - unbiased=True) + unbiased=True, + blur_kernel_size=17) # model settings model = dict( @@ -75,7 +76,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py index ebc309eed8..68928a45f9 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ 
dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py index 48ef6539ff..141075eec7 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py index 1f568b943a..50051ecb7f 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-256x192.py @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py index 9ae1d7eddb..9651416d2d 100644 --- 
a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_dark-8xb64-210e_coco-384x288.py @@ -75,7 +75,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py index f14ce56771..e3340f3f0c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb32-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py index 8f884136c3..d7765f4af9 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest101_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py index 6c9f96c05b..128bf87c1e 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb16-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py index b0d9f9fa32..69b6d13d13 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest200_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py index 08edd373ed..b191d726aa 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb16-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', 
input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py index 4ac41b24fd..af718bf0a0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest269_8xb32-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py index 2b98a3be10..9915210328 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py index 9654d5f74c..68a4a07be5 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py +++ 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnest50_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py index 7af1e2bc38..9e05ead2b3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb32-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py index 8d8ebe9d6b..52b6d4b1f4 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d101_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py index c63b459007..1af6549998 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb32-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py index 3bf8164967..8eef9aed8c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d152_8xb48-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py index 474f9e5745..8d196a5880 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), 
+ dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py index 8aabd85628..3210848000 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnetv1d50_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py index c27c9a05c1..53a1d00ce3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb32-210e_coco-384x288.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py index e5a393ed6b..d93d9918c3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext101_8xb64-210e_coco-256x192.py @@ -72,7 +72,7 @@ 
dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py index c35f51eeb3..2d39cf8ef0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb32-210e_coco-256x192.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py index c4c0288db9..5b7d38c2a3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext152_8xb48-210e_coco-384x288.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py index 84ba0c53e8..0585734fa7 100644 --- 
a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py index 30abdf103f..89e9040794 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_resnext50_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py index bc2baf170f..f267b0a9bf 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn18_8xb32-210e_coco-256x192.py @@ -99,9 +99,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] diff --git 
a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py index bb43bd58ed..9f39e13fe2 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_rsn50_8xb32-210e_coco-256x192.py @@ -99,9 +99,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec[0]['input_size']), - dict( - type='GenerateTarget', target_type='multilevel_heatmap', - encoder=codec), + dict(type='GenerateTarget', multilevel=True, encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py index 8bfd5da1e4..3ab2af9a08 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb32-210e_coco-256x192.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py index e5394a54a3..1cb55eab04 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet101_8xb48-210e_coco-384x288.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py index e28753e1a9..8581a5e245 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb32-210e_coco-384x288.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py index d94e84b7a1..b9cb7db543 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_scnet50_8xb64-210e_coco-256x192.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py index f3b95aff23..e1e34c500c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py +++ 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb32-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py index b0948fa53e..441547df97 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet101_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py index 4746630620..71ea8d1f3c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb32-210e_coco-256x192.py @@ -70,7 +70,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py 
b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py index d12edce231..9ff5fd9682 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet152_8xb48-210e_coco-384x288.py @@ -70,7 +70,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py index ecdd37b32c..69c7968955 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py index a68f2ae3ed..b2317b48da 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_seresnet50_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + 
dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py index 5c8eab454f..093534bedb 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py index 011d4891e8..20b6e3032d 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv1_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py index 51d382b57d..29755bfa12 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-256x192.py @@ -71,7 
+71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py index 78fe11cf69..a97156384e 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_shufflenetv2_8xb64-210e_coco-384x288.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py index 0bcc89dcd0..e7cac7b8c0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-256x192.py @@ -88,7 +88,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py index c2ce977a10..77e93399f5 100644 --- 
a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-b-p4-w7_8xb32-210e_coco-384x288.py @@ -88,7 +88,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py index f024552e23..b1862fa3e0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-256x192.py @@ -10,12 +10,13 @@ lr=5e-4, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.) - }))) + ), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ })) # learning policy param_scheduler = [ @@ -96,7 +97,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py index 59d725d923..8d09cb1d24 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-l-p4-w7_8xb32-210e_coco-384x288.py @@ -10,12 +10,13 @@ lr=5e-4, betas=(0.9, 0.999), weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.) - }))) + ), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ })) # learning policy param_scheduler = [ @@ -96,7 +97,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py index 5202968f2c..1225b06465 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_swin-t-p4-w7_8xb32-210e_coco-256x192.py @@ -88,7 +88,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py index ce6b976f30..83ca1f7793 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vgg16-bn_8xb64-210e_coco-256x192.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py index 2d95cf147f..834ab75965 100644 
--- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py @@ -72,7 +72,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py index 8aa0b4af25..e1117c444c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py @@ -70,7 +70,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md index 8a395c980e..b6a178865b 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vipnas_coco.md @@ -36,5 +36,5 @@ Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 da | Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | | :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| 
[S-ViPNAS-MobileNetV3](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py) | 256x192 | 0.700 | 0.887 | 0.777 | 0.757 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_256x192-7018731a_20211122.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_mbv3_coco_256x192_20211122.log.json) | -| [S-ViPNAS-Res50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.711 | 0.893 | 0.790 | 0.769 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192-cc43b466_20210624.pth) | [log](https://download.openmmlab.com/mmpose/top_down/vipnas/vipnas_res50_coco_256x192_20210624.log.json) | +| [S-ViPNAS-MobileNetV3](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192.py) | 256x192 | 0.700 | 0.887 | 0.783 | 0.758 | 0.929 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192-e0987441_20221010.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-mbv3_8xb64-210e_coco-256x192_20221010.log) | +| [S-ViPNAS-Res50](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192.py) | 256x192 | 0.711 | 0.894 | 0.787 | 0.769 | 0.934 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192-35d4bff9_20220917.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_vipnas-res50_8xb64-210e_coco-256x192_20220917.log) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md new file mode 100644 index 0000000000..bf1985e35f --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.md @@ -0,0 +1,56 @@ + + +
+ +ViTPose (NeurIPS'2022) + +```bibtex +@inproceedings{ + xu2022vitpose, + title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation}, + author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022}, +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO val2017 with detector having human AP of 56.4 on COCO val2017 dataset + +> With classic decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py) | 256x192 | 0.739 | 0.903 | 0.816 | 0.792 | 0.942 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.json) | +| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py) | 256x192 | 0.757 | 0.905 | 0.829 | 0.810 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.json) | +| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py) | 256x192 | 0.782 | 0.914 | 0.850 | 0.834 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.json) | +| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.788 | 0.917 | 0.855 | 0.839 | 0.954 | 
[ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.json) | +| [ViTPose-H\*](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py) | 256x192 | 0.790 | 0.916 | 0.857 | 0.840 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_3rdparty_coco-256x192-5b738c8e_20230314.pth) | - | + +*Models with * are converted from the [official repo](https://github.com/ViTAE-Transformer/ViTPose). The config files of these models are only for validation.* + +> With simple decoder + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [ViTPose-S](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.736 | 0.900 | 0.811 | 0.790 | 0.940 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.json) | +| [ViTPose-B](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.756 | 0.906 | 0.826 | 0.809 | 0.946 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-fd73707d_20230314.pth) | 
[log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-fd73707d_20230314.json) | +| [ViTPose-L](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.781 | 0.914 | 0.853 | 0.833 | 0.952 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.json) | +| [ViTPose-H](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py) | 256x192 | 0.789 | 0.916 | 0.856 | 0.839 | 0.953 | [ckpt](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.pth) | [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.json) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml new file mode 100644 index 0000000000..82539568a3 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml @@ -0,0 +1,155 @@ +Collections: +- Name: ViTPose + Paper: + Title: 'ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation' + URL: https://arxiv.org/abs/2204.12484 + README: https://github.com/open-mmlab/mmpose/blob/1.x/docs/src/papers/algorithms/vitpose.md + Metadata: + Training Resources: 8x A100 GPUs +Models: +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: &id001 + - ViTPose + - Classic Head + Model Size: Small + Training Data: COCO + Name: 
td-hm_ViTPose-small_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.739 + AP@0.5: 0.903 + AP@0.75: 0.816 + AR: 0.792 + AR@0.5: 0.942 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Base + Training Data: COCO + Name: td-hm_ViTPose-base_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.757 + AP@0.5: 0.905 + AP@0.75: 0.829 + AR: 0.81 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base_8xb64-210e_coco-256x192-216eae50_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Large + Training Data: COCO + Name: td-hm_ViTPose-large_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.782 + AP@0.5: 0.914 + AP@0.75: 0.850 + AR: 0.834 + AR@0.5: 0.952 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large_8xb64-210e_coco-256x192-53609f55_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Metadata: + Architecture: *id001 + Model Size: Huge + Training Data: COCO + Name: td-hm_ViTPose-huge_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.788 + AP@0.5: 0.917 + AP@0.75: 0.855 + AR: 0.839 + AR@0.5: 0.954 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge_8xb64-210e_coco-256x192-e32adcd4_20230314.pth +- Config: 
configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: vitpose-s + Metadata: + Architecture: &id002 + - ViTPose + - Simple Head + Model Size: Small + Training Data: COCO + Name: td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.736 + AP@0.5: 0.900 + AP@0.75: 0.811 + AR: 0.790 + AR@0.5: 0.940 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small-simple_8xb64-210e_coco-256x192-4c101a76_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: + - vitpose + - vitpose-b + Metadata: + Architecture: *id002 + Model Size: Base + Training Data: COCO + Name: td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.756 + AP@0.5: 0.906 + AP@0.75: 0.826 + AR: 0.809 + AR@0.5: 0.946 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-base-simple_8xb64-210e_coco-256x192-fd73707d_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: vitpose-l + Metadata: + Architecture: *id002 + Model Size: Large + Training Data: COCO + Name: td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.781 + AP@0.5: 0.914 + AP@0.75: 0.853 + AR: 0.833 + AR@0.5: 0.952 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-large-simple_8xb64-210e_coco-256x192-3a7ee9e1_20230314.pth +- Config: configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192.py + In Collection: ViTPose + Alias: vitpose-h + Metadata: + Architecture: *id002 + 
Model Size: Huge + Training Data: COCO + Name: td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192 + Results: + - Dataset: COCO + Metrics: + AP: 0.789 + AP@0.5: 0.916 + AP@0.75: 0.856 + AR: 0.839 + AR@0.5: 0.953 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-huge-simple_8xb64-210e_coco-256x192-ffd48c05_20230314.pth diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py new file mode 100644 index 0000000000..d5f0760b2b --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py @@ -0,0 +1,217 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + 
backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=14, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CrowdPoseDataset' +data_mode = 'topdown' +data_root = 'data/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/', +# f'{data_root}': 's3://openmmlab/datasets/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + 
dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='crowdpose/annotations/mmpose_crowdpose_test.json', + bbox_file='data/crowdpose/annotations/det_for_crowd_test_0.1_0.5.json', + data_prefix=dict(img='pose/CrowdPose/images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='crowdpose/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + 
switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'crowdpose/annotations/mmpose_crowdpose_test.json', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md new file mode 100644 index 0000000000..80cf3466ca --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext_udp_crowdpose.md @@ -0,0 +1,56 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector + +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [pose_cspnext_m](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/cspnext-m_udp_8xb64-210e_crowpose-256x192.py) | 256x192 | 0.662 | 0.821 | 0.723 | 0.759 | 0.675 | 0.539 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-crowdpose_pt-in1k_210e-256x192-f591079f_20230123.json) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md index 8fd9e6820b..c0d24d4717 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/hrnet_crowdpose.md @@ -33,6 +33,6 @@ Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector -| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | -| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.675 | 0.825 | 0.729 | 0.816 | 0.769 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192-960be101_20201227.pth) | 
[log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192_20201227.log.json) | +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [pose_hrnet_w32](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.675 | 0.825 | 0.729 | 0.770 | 0.687 | 0.553 | [ckpt](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192-960be101_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_crowdpose_256x192_20201227.log.json) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md index bc509800ed..56a771806d 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/resnet_crowdpose.md @@ -50,9 +50,9 @@ Results on CrowdPose test with [YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3) human detector -| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | -| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | -| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.637 | 0.808 | 0.692 | 0.785 | 0.738 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192-c6a526b6_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192_20201227.log.json) | -| 
[pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.647 | 0.810 | 0.703 | 0.796 | 0.746 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192-8f5870f4_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192_20201227.log.json) | -| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py) | 320x256 | 0.661 | 0.821 | 0.714 | 0.800 | 0.759 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256-c88c512a_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256_20201227.log.json) | -| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.656 | 0.818 | 0.712 | 0.803 | 0.754 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192-dbd49aba_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192_20201227.log.json) | +| Arch | Input Size | AP | AP50 | AP75 | AP (E) | AP (M) | AP (H) | ckpt | log | +| :--------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :----: | :----: | :----: | :--------------------------------------------: | :-------------------------------------------: | +| [pose_resnet_50](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.637 | 0.808 | 0.692 | 0.738 | 0.650 | 0.506 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192-c6a526b6_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res50_crowdpose_256x192_20201227.log.json) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py) | 256x192 | 
0.647 | 0.810 | 0.703 | 0.745 | 0.658 | 0.521 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192-8f5870f4_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_256x192_20201227.log.json) | +| [pose_resnet_101](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py) | 320x256 | 0.661 | 0.821 | 0.714 | 0.759 | 0.672 | 0.534 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256-c88c512a_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res101_crowdpose_320x256_20201227.log.json) | +| [pose_resnet_152](/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py) | 256x192 | 0.656 | 0.818 | 0.712 | 0.754 | 0.666 | 0.533 | [ckpt](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192-dbd49aba_20201227.pth) | [log](https://download.openmmlab.com/mmpose/top_down/resnet/res152_crowdpose_256x192_20201227.log.json) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py index 2e47d9c056..a12c221db9 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_hrnet-w32_8xb64-210e_crowdpose-256x192.py @@ -100,7 +100,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py 
b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py index b90d3d815c..3cba829326 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py index eaf69edcfd..721526b192 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res101_8xb64-210e_crowdpose-320x256.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py index 2795cf55a9..d74c0d618b 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res152_8xb64-210e_crowdpose-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py index 82a86ad151..1275f15bb3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/crowdpose/td-hm_res50_8xb64-210e_crowdpose-256x192.py @@ -71,7 +71,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py index 71783c7a40..fdc1a88a71 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub1-368x368.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -77,7 +78,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -121,7 +122,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, 
norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py index fcd2dde622..e44b704a5a 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub2-368x368.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -77,7 +78,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -121,7 +122,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py index e5fa8009e8..f0e10d057c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_cpm_8xb32-40e_jhmdb-sub3-368x368.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec 
settings codec = dict( @@ -77,7 +78,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -121,7 +122,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py index 72aa41b46f..c85a66dbe6 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub1-256x256.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -72,7 +73,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -116,7 +117,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py 
b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py index 7a40a1e49f..26ba287e26 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub2-256x256.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -72,7 +73,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -116,7 +117,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py index 59a9464847..16318e76a0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50-2deconv_8xb64-40e_jhmdb-sub3-256x256.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -72,7 +73,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -116,7 +117,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py index a294b53fb0..730d5193ca 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub1-256x256.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -70,7 +71,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -114,7 +115,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py index d48e32030d..4b8ab016e0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py +++ 
b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub2-256x256.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -70,7 +71,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -114,7 +115,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py index 0a2a771b88..42865e95d2 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/jhmdb/td-hm_res50_8xb64-20e_jhmdb-sub3-256x256.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/Mean PCK', rule='greater')) +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', interval=1)) # codec settings codec = dict( @@ -70,7 +71,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] @@ -114,7 +115,6 @@ # evaluators val_evaluator = [ - dict(type='JhmdbPCKAccuracy', thr=0.2), - dict(type='JhmdbPCKAccuracy', thr=0.2, 
norm_item='torso'), + dict(type='JhmdbPCKAccuracy', thr=0.2, norm_item=['bbox', 'torso']), ] test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py new file mode 100644 index 0000000000..14dce4e2f2 --- /dev/null +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py @@ -0,0 +1,210 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', +
checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=16, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'MpiiDataset' +data_mode = 'topdown' +data_root = 'data/mpii/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/', +# f'{data_root}': 's3://openmmlab/datasets/pose/MPI/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/mpii_val.json', + headbox_file=f'{data_root}/annotations/mpii_gt_val.mat', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict(type='MpiiPCKAccuracy') +test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md b/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md new file mode 100644 index 0000000000..895de8119a --- /dev/null +++ 
b/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext_udp_mpii.md @@ -0,0 +1,57 @@ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +Results on MPII val set + +| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | +| :---------------------------------------------------------- | :--------: | :---: | :------: | :---------------------------------------------------------: | :---------------------------------------------------------: | +| [pose_cspnext_m_udp](/configs/body_2d_keypoint/topdown_heatmap/mpii/cspnext-m_udp_8xb64-210e_mpii-256x256.py) | 256x256 | 0.902 | 0.303 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-mpii_pt-in1k_210e-256x256-68d0402f_20230208.json) | diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py index 70aaefff30..452382e833 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_cpm_8xb64-210e_mpii-368x368.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -78,7 +78,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py index 5e96ebbb17..8f8612d3c2 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py +++
b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb32-210e_mpii-384x384.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -71,7 +71,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py index f6ab6eddbe..010249e4d1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hourglass52_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -71,7 +71,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py index cdf5dee149..96e0f7f265 100644 --- 
a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -99,7 +99,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -111,7 +111,7 @@ # data loaders train_dataloader = dict( - batch_size=64, + batch_size=16, num_workers=2, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), @@ -124,7 +124,7 @@ pipeline=train_pipeline, )) val_dataloader = dict( - batch_size=32, + batch_size=16, num_workers=2, persistent_workers=True, drop_last=False, diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py index 9bbaf571bc..d21091e74c 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w32_dark-8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -103,7 +103,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py index db38fa9024..357eeb04b4 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -99,7 +99,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py index 5a2f2c90a9..c7b642c905 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_hrnet-w48_dark-8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -103,7 +103,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', 
input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py index 495c8f13df..49d4fd2289 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-18_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -90,7 +90,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py index 320d7c87fd..edcf54dce1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_litehrnet-30_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -90,7 +90,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', 
target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py index 350adb6e6d..bd02309c43 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_mobilenetv2_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -71,7 +71,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py index 6ed7cd3f9e..fb6dc60a79 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res101_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py index 4243bf215b..bbec516763 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res152_8xb32-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py index 783acb567d..5515c21bc0 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_res50_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py index 70c4e70d92..6b0d8e64f2 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d101_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py index b9cbaa6ae3..78e45b4ae1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d152_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', 
input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py index 58e7445d19..73ea36fcb6 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnetv1d50_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py index edf06bbf3f..664e83fac6 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_resnext152_8xb32-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -71,7 +71,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), 
dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py index d0e7e5ec5a..db42e23b0d 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet101_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -73,7 +73,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py index 44c243b60f..15c79405e5 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_scnet50_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -73,7 +73,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), 
dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py index 87c55e52e6..deceae15a8 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet101_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py index e37c06a11a..6485797ad1 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet152_8xb32-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -69,7 +69,7 @@ dict(type='RandomFlip', direction='horizontal'), 
dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py index 89e850dc08..93273fdc07 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_seresnet50_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py index f6d08a8f0f..95622425a3 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv1_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', 
direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py index d51e32fa30..397a6be89e 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_heatmap/mpii/td-hm_shufflenetv2_8xb64-210e_mpii-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # codec settings codec = dict( @@ -70,7 +70,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py index 7cfa25e935..e4b6e059ce 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-256x192.py @@ -28,7 +28,8 @@ # hooks default_hooks = dict( - checkpoint=dict(save_best='posetrack18/Total AP', rule='greater')) + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) # load from the 
pretrained model load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth' # noqa: E501 @@ -101,7 +102,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py index 0ee273d0a9..733e7412ab 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w32_8xb64-20e_posetrack18-384x288.py @@ -28,7 +28,8 @@ # hooks default_hooks = dict( - checkpoint=dict(save_best='posetrack18/Total AP', rule='greater')) + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) # load from the pretrained model load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-384x288-ca5956af_20220909.pth' # noqa: E501 @@ -101,7 +102,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py index 378e220d3c..b7e9735c5a 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py 
+++ b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-256x192.py @@ -28,7 +28,8 @@ # hooks default_hooks = dict( - checkpoint=dict(save_best='posetrack18/Total AP', rule='greater')) + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) # load from the pretrained model load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-256x192-0e67c616_20220913.pth' # noqa: E501 @@ -101,7 +102,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py index a583198852..5700683667 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py +++ b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_hrnet-w48_8xb64-20e_posetrack18-384x288.py @@ -28,7 +28,8 @@ # hooks default_hooks = dict( - checkpoint=dict(save_best='posetrack18/Total AP', rule='greater')) + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) # load from the pretrained model load_from = 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w48_8xb32-210e_coco-384x288-c161b7de_20220915.pth' # noqa: E501 @@ -101,7 +102,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git 
a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py index 33db4e0316..046f8a5c57 100644 --- a/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py +++ b/configs/body_2d_keypoint/topdown_heatmap/posetrack18/td-hm_res50_8xb64-20e_posetrack18-256x192.py @@ -28,7 +28,8 @@ # hooks default_hooks = dict( - checkpoint=dict(save_best='posetrack18/Total AP', rule='greater')) + checkpoint=dict( + save_best='posetrack18/Total AP', rule='greater', interval=1)) # load from the pretrained model load_from = 'https://download.openmmlab.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth' # noqa: E501 @@ -76,7 +77,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py index 6d776875c2..5ed3fa554b 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_mobilenetv2_rle-pretrained-8xb64-210e_coco-256x192.py @@ -74,7 +74,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py index 03ddc5cdb4..33fdf1587c 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py index c3e065ec5f..2669ad8db9 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res101_rle-8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py index 7d0cbe906b..c581b914c6 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), 
dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py index 7d7a816583..197f58e1c1 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py index d0b183dc33..026e96e033 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res152_rle-8xb64-210e_coco-384x288.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py index a365acff1d..c055310ec1 100644 --- 
a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py index 851afb38cd..8cd39cd0ee 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-8xb64-210e_coco-256x192.py @@ -67,7 +67,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py index 4a24f2aaaf..995274fa15 100644 --- a/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py +++ b/configs/body_2d_keypoint/topdown_regression/coco/td-reg_res50_rle-pretrained-8xb64-210e_coco-256x192.py @@ -73,7 +73,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), 
dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py index 00dcc6f4d2..9076ffe104 100644 --- a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res101_8xb64-210e_mpii-256x256.py @@ -68,7 +68,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -111,8 +111,8 @@ test_dataloader = val_dataloader # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # evaluators -val_evaluator = dict(type='MpiiPCKAccuracy', norm_item='head') +val_evaluator = dict(type='MpiiPCKAccuracy') test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py index 779ae37a84..c1a19b0d6e 100644 --- a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res152_8xb64-210e_mpii-256x256.py @@ -68,7 +68,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -111,8 +111,8 @@ test_dataloader = val_dataloader 
# hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # evaluators -val_evaluator = dict(type='MpiiPCKAccuracy', norm_item='head') +val_evaluator = dict(type='MpiiPCKAccuracy') test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py index 87b03ac79e..8af5976cc1 100644 --- a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_8xb64-210e_mpii-256x256.py @@ -68,7 +68,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -111,8 +111,8 @@ test_dataloader = val_dataloader # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # evaluators -val_evaluator = dict(type='MpiiPCKAccuracy', norm_item='head') +val_evaluator = dict(type='MpiiPCKAccuracy') test_evaluator = val_evaluator diff --git a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py index 1a62e710b1..31bacf4e36 100644 --- a/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py +++ b/configs/body_2d_keypoint/topdown_regression/mpii/td-reg_res50_rle-8xb64-210e_mpii-256x256.py @@ -68,7 +68,7 @@ dict(type='RandomFlip', direction='horizontal'), dict(type='RandomBBoxTransform', shift_prob=0), 
dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ @@ -111,8 +111,8 @@ test_dataloader = val_dataloader # hooks -default_hooks = dict(checkpoint=dict(save_best='pck/PCKh', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater')) # evaluators -val_evaluator = dict(type='MpiiPCKAccuracy', norm_item='head') +val_evaluator = dict(type='MpiiPCKAccuracy') test_evaluator = val_evaluator diff --git a/configs/face_2d_keypoint/rtmpose/README.md b/configs/face_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000000..d309696bed --- /dev/null +++ b/configs/face_2d_keypoint/rtmpose/README.md @@ -0,0 +1,24 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. 
+ +## Results and Models + +### COCO-WholeBody-Face Dataset + +Results on COCO-WholeBody-Face val set + +| Model | Input Size | NME | Details and Download | +| :-------: | :--------: | :----: | :------------------------------------------------------------------------------------: | +| RTMPose-m | 256x256 | 0.0466 | [rtmpose_coco_wholebody_face.md](./coco_wholebody_face/rtmpose_coco_wholebody_face.md) | + +### WFLW Dataset + +Results on WFLW dataset + +| Model | Input Size | NME | Details and Download | +| :-------: | :--------: | :--: | :---------------------------------------: | +| RTMPose-m | 256x256 | 4.01 | [rtmpose_wflw.md](./wflw/rtmpose_wflw.md) | diff --git a/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000000..a19569b6ba --- /dev/null +++ b/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 60 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr for the second half of training (epoch 30 to 60) + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + 
type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=68, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, 
+ ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md b/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md new file mode 100644 index 0000000000..913fabe99c --- /dev/null +++ b/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose_coco_wholebody_face.md @@ -0,0 +1,39 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody-Face (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Face val set + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------ | :--------: | :----: | :------------------------------------------------------------: | :-----------------------------------------------------------: | +| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/coco_wholebody_face/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py) | 256x256 | 0.0466 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody-face_pt-aic-coco_60e-256x256-62026ef2_20230228.json) | diff --git a/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py b/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py new file mode 100644 index 0000000000..1f13d434dd --- /dev/null +++ b/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py @@ -0,0 +1,232 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 60 +stage2_num_epochs = 10 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from epoch max_epochs // 2 to max_epochs + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + 
type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=98, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'WFLWDataset' +data_mode = 'topdown' +data_root = 'data/wflw/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/WFLW/', +# f'{data_root}': 's3://openmmlab/datasets/pose/WFLW/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/face_landmarks_wflw_train.json', + data_prefix=dict(img='images/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + 
ann_file='annotations/face_landmarks_wflw_test.json', + data_prefix=dict(img='images/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md b/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md new file mode 100644 index 0000000000..2b7903cec5 --- /dev/null +++ b/configs/face_2d_keypoint/rtmpose/wflw/rtmpose_wflw.md @@ -0,0 +1,42 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+WFLW (CVPR'2018) + +```bibtex +@inproceedings{wu2018look, + title={Look at boundary: A boundary-aware face alignment algorithm}, + author={Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, Quan and Cai, Yici and Zhou, Qiang}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={2129--2138}, + year={2018} +} +``` + +
+ +Results on WFLW dataset + +The model is trained on WFLW train. + +| Arch | Input Size | NME | ckpt | log | +| :------------------------------------------------------------- | :--------: | :--: | :------------------------------------------------------------: | :------------------------------------------------------------: | +| [pose_rtmpose_m](/configs/face_2d_keypoint/rtmpose/wflw/rtmpose-m_8xb64-60e_wflw-256x256.py) | 256x256 | 4.01 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-wflw_pt-aic-coco_60e-256x256-dc1dcdcf_20230228.json) | diff --git a/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py index 032ce774e0..c628d4c241 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/300w/td-hm_hrnetv2-w18_8xb64-60e_300w-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -111,7 +110,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py index 8f8811c83e..35e3e8d56c 100644 --- 
a/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@bbox_size', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -108,7 +107,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py index dbe2d2b3de..da855fe3d0 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_dark-8xb64-60e_aflw-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@bbox_size', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -112,7 +111,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py index 
b4db3d9e25..2080211f80 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hourglass52_8xb32-60e_coco-wholebody-face-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -75,7 +74,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py index 62aad12b9e..f99df1b51b 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_8xb32-60e_coco-wholebody-face-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -106,7 +105,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py index e8d6a82ec3..c1cef0618a 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_hrnetv2-w18_dark-8xb32-60e_coco-wholebody-face-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -110,7 +109,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py index 9714d69bfe..22fc068368 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_mobilenetv2_8xb32-60e_coco-wholebody-face-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -74,7 +73,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), 
dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py index b4d3b1acf2..e1d9ae9e2d 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_res50_8xb32-60e_coco-wholebody-face-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -73,7 +72,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py index b9d9bcb708..1df37b3c93 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/coco_wholebody_face/td-hm_scnet50_8xb32-60e_coco-wholebody-face-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[36, 45]', rule='greater')) +default_hooks = 
dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -76,7 +75,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py index 5d53306160..35c3c7b082 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/cofw/td-hm_hrnetv2-w18_8xb64-60e_cofw-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='nme/@[8, 9]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -110,7 +110,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml b/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml new file mode 100644 index 0000000000..3a0a4a454a --- /dev/null +++ b/configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml @@ -0,0 +1,27 @@ +Collections: +- Name: HRNetv2 + Paper: + Title: Deep High-Resolution Representation Learning for Visual Recognition + URL: https://arxiv.org/abs/1908.07919 + README: https://github.com/open-mmlab/mmpose/blob/1.x/docs/src/papers/backbones/hrnetv2.md +Models: +- Config: 
configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py + In Collection: HRNetv2 + Alias: face + Metadata: + Architecture: + - HRNetv2 + Training Data: WFLW + Name: topdown_heatmap_hrnetv2_w18_wflw_256x256 + Results: + - Dataset: WFLW + Metrics: + NME blur: 4.58 + NME expression: 4.33 + NME illumination: 3.99 + NME makeup: 3.94 + NME occlusion: 4.83 + NME pose: 6.97 + NME test: 4.06 + Task: Face 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_wflw_256x256-2bf032a6_20210125.pth diff --git a/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py index b14047f00e..8da2fd2420 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_8xb64-60e_wflw-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[60, 72]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -108,7 +107,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py index fc864ea538..f23ad5e685 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_awing-8xb64-60e_wflw-256x256.py @@ -27,8 +27,7 @@ 
auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[60, 72]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -108,7 +107,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py b/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py index da7f05473c..cfa71200aa 100644 --- a/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py +++ b/configs/face_2d_keypoint/topdown_heatmap/wflw/td-hm_hrnetv2-w18_dark-8xb64-60e_wflw-256x256.py @@ -27,8 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict( - checkpoint=dict(save_best='nme/@[60, 72]', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='NME', rule='less', interval=1)) # codec settings codec = dict( @@ -112,7 +111,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/rtmpose/README.md b/configs/hand_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000000..9687b7e72c --- /dev/null +++ b/configs/hand_2d_keypoint/rtmpose/README.md @@ -0,0 +1,16 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. 
+In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. + +## Results and Models + +### COCO-WholeBody-Hand Dataset + +Results on COCO-WholeBody-Hand val set + +| Model | Input Size | PCK@0.2 | AUC | EPE | Details and Download | +| :-------: | :--------: | :-----: | :---: | :--: | :------------------------------------------------------------------------------------: | +| RTMPose-m | 256x256 | 0.815 | 0.837 | 4.51 | [rtmpose_coco_wholebody_hand.md](./coco_wholebody_hand/rtmpose_coco_wholebody_hand.md) | diff --git a/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000000..62765ca2c7 --- /dev/null +++ b/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,233 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, 
bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from epoch max_epochs // 2 to max_epochs + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=21, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = 
dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + 
num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md b/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md new file mode 100644 index 0000000000..cffb706493 --- /dev/null +++ b/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose_coco_wholebody_hand.md @@ -0,0 +1,39 @@ + + +
+RTMDet (ArXiv 2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody-Hand (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody-Hand val set + +| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log | +| :--------------------------------------------------------- | :--------: | :-----: | :---: | :--: | :--------------------------------------------------------: | :--------------------------------------------------------: | +| [rtmpose_m](/configs/hand_2d_keypoint/rtmpose/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 0.815 | 0.837 | 4.51 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody-hand_pt-aic-coco_210e-256x256-99477206_20230228.json) | diff --git a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py index 38a769d6d6..da14205ca3 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hourglass52_8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ rotate_factor=180.0, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py index 948e7d9491..2b2f4496ff 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) @@ -103,7 +103,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py index 2c71e5d947..8773796701 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_hrnetv2-w18_dark-8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( 
type='MSRAHeatmap', @@ -107,7 +107,7 @@ scale_factor=(0.7, 1.3)), dict(type='RandomFlip', direction='horizontal'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py index b4537cd871..bc881094a8 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_litehrnet-w18_8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) @@ -87,7 +87,7 @@ scale_factor=(0.7, 1.3)), dict(type='RandomFlip', direction='horizontal'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py index ea0cf2804d..a1e60ce674 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py +++ 
b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_mobilenetv2_8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) @@ -47,7 +47,7 @@ init_cfg=dict(type='Pretrained', checkpoint='mmcls://mobilenet_v2')), head=dict( type='HeatmapHead', - in_channels=2048, + in_channels=1280, out_channels=21, loss=dict(type='KeypointMSELoss', use_target_weight=True), decoder=codec), @@ -71,7 +71,7 @@ scale_factor=(0.7, 1.3)), dict(type='RandomFlip', direction='horizontal'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py index 5fca4e5da2..1455319679 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_res50_8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) @@ -70,7 +70,7 @@ scale_factor=(0.7, 1.3)), dict(type='RandomFlip', direction='horizontal'), dict(type='TopdownAffine', 
input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py index 10b6b732ee..720e398433 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/coco_wholebody_hand/td-hm_scnet50_8xb32-210e_coco-wholebody-hand-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=256) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( type='MSRAHeatmap', input_size=(256, 256), heatmap_size=(64, 64), sigma=2) @@ -73,7 +73,7 @@ scale_factor=(0.7, 1.3)), dict(type='RandomFlip', direction='horizontal'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py b/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py index 7e7d1d6440..3c75453917 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/freihand2d/td-hm_res50_8xb64-100e_freihand2d-224x224.py @@ -27,7 +27,8 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict( + 
checkpoint=dict(save_best='AUC', rule='greater', interval=1)) # codec settings codec = dict( @@ -73,7 +74,7 @@ rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml new file mode 100644 index 0000000000..828427899c --- /dev/null +++ b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml @@ -0,0 +1,24 @@ +Collections: +- Name: SimpleBaseline2D + Paper: + Title: Simple baselines for human pose estimation and tracking + URL: http://openaccess.thecvf.com/content_ECCV_2018/html/Bin_Xiao_Simple_Baselines_for_ECCV_2018_paper.html + README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/simplebaseline2d.md +Models: +- Config: configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py + In Collection: SimpleBaseline2D + Alias: hand + Metadata: + Architecture: + - SimpleBaseline2D + - ResNet + Training Data: OneHand10K + Name: topdown_heatmap_res50_onehand10k_256x256 + Results: + - Dataset: OneHand10K + Metrics: + AUC: 0.555 + EPE: 25.16 + PCK@0.2: 0.989 + Task: Hand 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/hand/resnet/res50_onehand10k_256x256-739c8639_20210330.pth diff --git a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py index fc6b4941ee..e0bb7fcb90 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py +++ 
b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_8xb64-210e_onehand10k-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -106,7 +106,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py index 50fb485d0e..b2760fea54 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_dark-8xb64-210e_onehand10k-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -110,7 +110,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py index 8f64dc3962..eb80fcf89b 100644 
--- a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_hrnetv2-w18_udp-8xb64-210e_onehand10k-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -106,7 +106,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py index 5d2f3a7698..e2ee81cd87 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_mobilenetv2_8xb64-210e_onehand10k-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -75,7 +75,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py 
b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py index 94055f778e..c728a1401d 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/onehand10k/td-hm_res50_8xb32-210e_onehand10k-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py index ea89a94dc4..4072651322 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_8xb64-210e_rhd2d-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -106,7 +106,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git 
a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py index 3ac12ca282..7df4e67227 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_dark-8xb64-210e_rhd2d-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -110,7 +110,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py index 38f1cb8777..1ed3179816 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_hrnetv2-w18_udp-8xb64-210e_rhd2d-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -106,7 +106,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), 
dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py index f19c4cf8df..c7319f8a4e 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_mobilenetv2_8xb64-210e_rhd2d-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -75,7 +75,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py index 6f5e5f4281..401a5b9330 100644 --- a/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py +++ b/configs/hand_2d_keypoint/topdown_heatmap/rhd2d/td-hm_res50_8xb64-210e_rhd2d-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict( @@ -74,7 +74,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), 
dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py b/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py index b0b1bfff2c..492492ca7e 100644 --- a/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py +++ b/configs/hand_2d_keypoint/topdown_regression/onehand10k/td-reg_res50_8xb64-210e_onehand10k-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict(type='RegressionLabel', input_size=(256, 256)) @@ -72,7 +72,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py b/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py index ac04cbb1d9..bfe876f91a 100644 --- a/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py +++ b/configs/hand_2d_keypoint/topdown_regression/rhd2d/td-reg_res50_8xb64-210e_rhd2d-256x256.py @@ -27,7 +27,7 @@ auto_scale_lr = dict(base_batch_size=512) # hooks -default_hooks = dict(checkpoint=dict(save_best='auc/@20thrs', rule='greater')) +default_hooks = dict(checkpoint=dict(save_best='AUC', rule='greater')) # codec settings codec = dict(type='RegressionLabel', input_size=(256, 256)) @@ -72,7 +72,7 @@ type='RandomBBoxTransform', rotate_factor=180, scale_factor=(0.7, 1.3)), dict(type='TopdownAffine', 
input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/rtmpose/README.md b/configs/wholebody_2d_keypoint/rtmpose/README.md new file mode 100644 index 0000000000..47e488567c --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/README.md @@ -0,0 +1,18 @@ +# RTMPose + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**, outperforming existing open-source libraries. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. 
+ +## Results and Models + +### COCO-WholeBody Dataset + +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Model | Input Size | Whole AP | Whole AR | Details and Download | +| :-------: | :--------: | :------: | :------: | :---------------------------------------------------------------------: | +| RTMPose-m | 256x192 | 0.604 | 0.667 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) | +| RTMPose-l | 256x192 | 0.632 | 0.694 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) | +| RTMPose-l | 384x288 | 0.670 | 0.723 | [rtmpose_coco-wholebody.md](./coco-wholebody/rtmpose_coco-wholebody.md) | diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py new file mode 100644 index 0000000000..1ad246a2b8 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py @@ -0,0 +1,231 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 135 to 270 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec 
settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + 
dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..949cbd9c18 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,231 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr for the second half of training + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual 
training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + 
type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, 
round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..eab0a46299 --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,231 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr for the second half of training + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), 
+] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + 
dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + 
sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md new file mode 100644 index 0000000000..75cf6245ff --- /dev/null +++ b/configs/wholebody_2d_keypoint/rtmpose/coco-wholebody/rtmpose_coco-wholebody.md @@ -0,0 +1,62 @@ + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [rtmpose-m](./rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 0.697 | 0.743 | 0.660 | 0.749 | 0.822 | 0.858 | 0.483 | 0.564 | 0.604 | 0.667 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.json) | +| [rtmpose-l](./rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 0.721 | 0.764 | 0.693 | 0.780 | 0.844 | 0.876 | 0.523 | 0.600 | 0.632 | 0.694 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.json) | +| [rtmpose-l](./rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 0.736 | 0.776 | 0.738 | 0.810 | 0.895 | 0.918 | 0.591 | 0.659 | 0.670 | 0.723 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.json) | diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/README.md b/configs/wholebody_2d_keypoint/topdown_heatmap/README.md index c22b29f469..23ee1ed315 100644 --- 
a/configs/wholebody_2d_keypoint/topdown_heatmap/README.md +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/README.md @@ -17,6 +17,7 @@ Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO | HRNet-w48+Dark+ | 384x288 | 0.661 | 0.743 | [hrnet_dark_coco-wholebody.md](./coco-wholebody/hrnet_dark_coco-wholebody.md) | | HRNet-w32+Dark | 256x192 | 0.582 | 0.671 | [hrnet_dark_coco-wholebody.md](./coco-wholebody/hrnet_dark_coco-wholebody.md) | | HRNet-w48 | 256x192 | 0.579 | 0.681 | [hrnet_coco-wholebody.md](./coco-wholebody/hrnet_coco-wholebody.md) | +| CSPNeXt-m | 256x192 | 0.567 | 0.641 | [cspnext_udp_coco-wholebody.md](./coco-wholebody/cspnext_udp_coco-wholebody.md) | | ResNet-152 | 256x192 | 0.548 | 0.661 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) | | HRNet-w32 | 256x192 | 0.536 | 0.636 | [hrnet_coco-wholebody.md](./coco-wholebody/hrnet_coco-wholebody.md) | | ResNet-101 | 256x192 | 0.531 | 0.645 | [resnet_coco-wholebody.md](./coco-wholebody/resnet_coco-wholebody.md) | diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..f6aff18c22 --- /dev/null +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-l_udp_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,213 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + 
by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr for the second half of training + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', 
scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + 
dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..1203543a1d --- /dev/null +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py @@ -0,0 +1,213 @@ +_base_ = ['../../../_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr for the second half of training + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + 
convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=133, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + 
dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) 
+test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md new file mode 100644 index 0000000000..0626738880 --- /dev/null +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext_udp_coco-wholebody.md @@ -0,0 +1,56 @@ + + +
+RTMDet (arXiv'2022) + +```bibtex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +
+ + + +
+UDP (CVPR'2020) + +```bibtex +@InProceedings{Huang_2020_CVPR, + author = {Huang, Junjie and Zhu, Zheng and Guo, Feng and Huang, Guan}, + title = {The Devil Is in the Details: Delving Into Unbiased Data Processing for Human Pose Estimation}, + booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` + +
+ + + +
+COCO-WholeBody (ECCV'2020) + +```bibtex +@inproceedings{jin2020whole, + title={Whole-Body Human Pose Estimation in the Wild}, + author={Jin, Sheng and Xu, Lumin and Xu, Jin and Wang, Can and Liu, Wentao and Qian, Chen and Ouyang, Wanli and Luo, Ping}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2020} +} +``` + +
+ +Results on COCO-WholeBody v1.0 val with detector having human AP of 56.4 on COCO val2017 dataset + +| Arch | Input Size | Body AP | Body AR | Foot AP | Foot AR | Face AP | Face AR | Hand AP | Hand AR | Whole AP | Whole AR | ckpt | log | +| :-------------------------------------- | :--------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :------: | :------: | :--------------------------------------: | :-------------------------------------: | +| [pose_cspnext_m_udp](/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/cspnext-m_udp_8xb64-210e_coco-wholebody-256x192.py) | 256x192 | 0.687 | 0.735 | 0.680 | 0.763 | 0.697 | 0.755 | 0.460 | 0.543 | 0.567 | 0.641 | [ckpt](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.pth) | [log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_udp-coco-wholebody_pt-in1k_210e-256x192-320fa258_20230123.json) | diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py index ee6b4d56d2..49d43a5d14 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-256x192.py @@ -101,7 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py 
b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py index ca380756b4..eeb7856b0c 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_8xb64-210e_coco-wholebody-384x288.py @@ -101,7 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py index bf3bbbd75a..47055de250 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w32_dark-8xb64-210e_coco-wholebody-256x192.py @@ -105,7 +105,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py index 0b97283fec..1d653de7c7 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py +++ 
b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-256x192.py @@ -101,7 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py index 1cabcfbaa9..f014185640 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_8xb32-210e_coco-wholebody-384x288.py @@ -101,7 +101,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py index ae8a77a163..309bcd8f70 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py @@ -105,7 +105,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + 
dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py index 2d2283db5b..8cf8d37cf3 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-256x192.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py index 772fe61a37..0a1150b438 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res101_8xb32-210e_coco-wholebody-384x288.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py index b04c771e16..18de6301a7 100644 --- 
a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-256x192.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py index 5f12dfc77f..d6c5a768b4 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res152_8xb32-210e_coco-wholebody-384x288.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py index 50fe0fc9f6..5d5d9bf71f 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-256x192.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - 
dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py index 8055b428ac..e8c9575970 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_res50_8xb64-210e_coco-wholebody-384x288.py @@ -72,7 +72,7 @@ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py index 81713cc8af..daf5209100 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_8xb64-210e_coco-wholebody-256x192.py @@ -73,7 +73,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py 
b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py index 0b341ceca1..dbd0109b44 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-mbv3_dark-8xb64-210e_coco-wholebody-256x192.py @@ -77,7 +77,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py index 7e582e19a7..352c27aba0 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py +++ b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_8xb64-210e_coco-wholebody-256x192.py @@ -74,7 +74,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py index bb11e05625..aadfd0fc4b 100644 --- a/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py +++ 
b/configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_vipnas-res50_dark-8xb64-210e_coco-wholebody-256x192.py @@ -78,7 +78,7 @@ rotate_factor=60, scale_factor=(0.75, 1.25)), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), + dict(type='GenerateTarget', encoder=codec), dict(type='PackPoseInputs') ] val_pipeline = [ diff --git a/demo/MMPose_Tutorial.ipynb b/demo/MMPose_Tutorial.ipynb index 60de611eb9..19706e0da2 100644 --- a/demo/MMPose_Tutorial.ipynb +++ b/demo/MMPose_Tutorial.ipynb @@ -622,6 +622,7 @@ "import mmcv\n", "from mmcv import imread\n", "import mmengine\n", + "from mmengine.registry import init_default_scope\n", "import numpy as np\n", "\n", "from mmpose.apis import inference_topdown\n", @@ -629,11 +630,9 @@ "from mmpose.evaluation.functional import nms\n", "from mmpose.registry import VISUALIZERS\n", "from mmpose.structures import merge_data_samples\n", - "from mmpose.utils import register_all_modules as register_mmpose_modules\n", "\n", "try:\n", " from mmdet.apis import inference_detector, init_detector\n", - " from mmdet.utils import register_all_modules as register_mmdet_modules\n", " has_mmdet = True\n", "except (ImportError, ModuleNotFoundError):\n", " has_mmdet = False\n", @@ -656,7 +655,6 @@ "\n", "\n", "# build detector\n", - "register_mmdet_modules()\n", "detector = init_detector(\n", " det_config,\n", " det_checkpoint,\n", @@ -665,7 +663,6 @@ "\n", "\n", "# build pose estimator\n", - "register_mmpose_modules()\n", "pose_estimator = init_pose_estimator(\n", " pose_config,\n", " pose_checkpoint,\n", @@ -696,7 +693,7 @@ " \"\"\"Visualize predicted keypoints (and heatmaps) of one image.\"\"\"\n", "\n", " # predict bbox\n", - " register_mmdet_modules()\n", + " init_default_scope(detector.cfg.get('default_scope', 'mmdet'))\n", " detect_result = inference_detector(detector, img_path)\n", " pred_instance = detect_result.pred_instances.cpu().numpy()\n", " 
bboxes = np.concatenate(\n", @@ -706,7 +703,6 @@ " bboxes = bboxes[nms(bboxes, 0.3)][:, :4]\n", "\n", " # predict keypoints\n", - " register_mmpose_modules()\n", " pose_results = inference_topdown(pose_estimator, img_path, bboxes)\n", " data_samples = merge_data_samples(pose_results)\n", "\n", @@ -3476,11 +3472,6 @@ "source": [ "from mmengine.config import Config, DictAction\n", "from mmengine.runner import Runner\n", - "from mmpose.utils import register_all_modules\n", - "\n", - "# register all modules in mmpose into the registries\n", - "# do not init the default scope here because it will be init in the runner\n", - "register_all_modules(init_default_scope=False)\n", "\n", "# set preprocess configs to model\n", "cfg.model.setdefault('data_preprocessor', cfg.get('preprocess_cfg', {}))\n", @@ -3517,7 +3508,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "Python 3.7.13 ('pt19cu113')", + "display_name": "dev2.0", "language": "python", "name": "python3" }, @@ -3531,11 +3522,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.7.13 (default, Mar 29 2022, 02:18:16) \n[GCC 7.5.0]" }, "vscode": { "interpreter": { - "hash": "da739a86cd93f7808d44852bc442711db64702daf7deb8b8d6704b313da8028c" + "hash": "383ba00087b5a9caebf3648b758a31e474cc01be975489b58f119fa4bc17e1f8" } }, "widgets": { diff --git a/demo/bottomup_demo.py b/demo/bottomup_demo.py new file mode 100644 index 0000000000..8d27a17178 --- /dev/null +++ b/demo/bottomup_demo.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mimetypes +import os +import tempfile +from argparse import ArgumentParser + +import json_tricks as json +import mmcv +import mmengine + +from mmpose.apis import inference_bottomup, init_model +from mmpose.registry import VISUALIZERS +from mmpose.structures import split_instances + + +def process_one_image(args, img_path, pose_estimator, visualizer, + show_interval): + """Visualize predicted keypoints (and heatmaps) of one image.""" + + # inference a single image + batch_results = inference_bottomup(pose_estimator, img_path) + results = batch_results[0] + + # show the results + img = mmcv.imread(img_path, channel_order='rgb') + + out_file = None + if args.output_root: + out_file = f'{args.output_root}/{os.path.basename(img_path)}' + + visualizer.add_datasample( + 'result', + img, + data_sample=results, + draw_gt=False, + draw_bbox=False, + draw_heatmap=args.draw_heatmap, + show_kpt_idx=args.show_kpt_idx, + show=args.show, + wait_time=show_interval, + out_file=out_file, + kpt_score_thr=args.kpt_thr) + + return results.pred_instances + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--input', type=str, default='', help='Image/Video file') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='whether to show img') + parser.add_argument( + '--output-root', + type=str, + default='', + help='root of the output img file. 
' + 'Default not saving the visualization images.') + parser.add_argument( + '--save-predictions', + action='store_true', + default=False, + help='whether to save predicted results') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--draw-heatmap', + action='store_true', + help='Visualize the predicted heatmap') + parser.add_argument( + '--show-kpt-idx', + action='store_true', + default=False, + help='Whether to show the index of keypoints') + parser.add_argument( + '--kpt-thr', type=float, default=0.3, help='Keypoint score threshold') + parser.add_argument( + '--radius', + type=int, + default=3, + help='Keypoint radius for visualization') + parser.add_argument( + '--thickness', + type=int, + default=1, + help='Link thickness for visualization') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert args.show or (args.output_root != '') + assert args.input != '' + if args.output_root: + mmengine.mkdir_or_exist(args.output_root) + if args.save_predictions: + assert args.output_root != '' + args.pred_save_path = f'{args.output_root}/results_' \ + f'{os.path.splitext(os.path.basename(args.input))[0]}.json' + + # build the model from a config file and a checkpoint file + if args.draw_heatmap: + cfg_options = dict(model=dict(test_cfg=dict(output_heatmaps=True))) + else: + cfg_options = None + + model = init_model( + args.config, + args.checkpoint, + device=args.device, + cfg_options=cfg_options) + + # init visualizer + model.cfg.visualizer.radius = args.radius + model.cfg.visualizer.line_width = args.thickness + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.set_dataset_meta(model.dataset_meta) + + input_type = mimetypes.guess_type(args.input)[0].split('/')[0] + if input_type == 'image': + pred_instances = process_one_image( + args, args.input, model, visualizer, show_interval=0) + pred_instances_list = split_instances(pred_instances) + + elif 
input_type == 'video': + tmp_folder = tempfile.TemporaryDirectory() + video = mmcv.VideoReader(args.input) + progressbar = mmengine.ProgressBar(len(video)) + video.cvt2frames(tmp_folder.name, show_progress=False) + output_root = args.output_root + args.output_root = tmp_folder.name + pred_instances_list = [] + + for frame_id, img_fname in enumerate(os.listdir(tmp_folder.name)): + pred_instances = process_one_image( + args, + f'{tmp_folder.name}/{img_fname}', + model, + visualizer, + show_interval=1) + progressbar.update() + pred_instances_list.append( + dict( + frame_id=frame_id, + instances=split_instances(pred_instances))) + + if output_root: + mmcv.frames2video( + tmp_folder.name, + f'{output_root}/{os.path.basename(args.input)}', + fps=video.fps, + fourcc='mp4v', + show_progress=False) + tmp_folder.cleanup() + + else: + args.save_predictions = False + raise ValueError( + f'file {os.path.basename(args.input)} has invalid format.') + + if args.save_predictions: + with open(args.pred_save_path, 'w') as f: + json.dump( + dict( + meta_info=model.dataset_meta, + instance_info=pred_instances_list), + f, + indent='\t') + print(f'predictions have been saved at {args.pred_save_path}') + + +if __name__ == '__main__': + main() diff --git a/demo/docs/2d_animal_demo.md b/demo/docs/2d_animal_demo.md index 38ee3078ea..38997acd5d 100644 --- a/demo/docs/2d_animal_demo.md +++ b/demo/docs/2d_animal_demo.md @@ -9,7 +9,7 @@ python demo/topdown_demo_with_mmdet.py \ ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \ ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \ --input ${INPUT_PATH} --det-cat-id ${DET_CAT_ID} \ - [--show] [--output-root ${OUTPUT_DIR}] \ + [--show] [--output-root ${OUTPUT_DIR}] [--save-predictions] \ [--draw-heatmap ${DRAW_HEATMAP}] [--radius ${KPT_RADIUS}] \ [--kpt-thr ${KPT_SCORE_THR}] [--bbox-thr ${BBOX_SCORE_THR}] \ [--device ${GPU_ID or CPU}] @@ -53,6 +53,18 @@ python demo/topdown_demo_with_mmdet.py \ --output-root vis_results --draw-heatmap --det-cat-id=15 ``` 
+To save predicted results on disk: + +```shell +python demo/topdown_demo_with_mmdet.py \ + demo/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py \ + https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \ + configs/animal_2d_keypoint/topdown_heatmap/animalpose/td-hm_hrnet-w32_8xb64-210e_animalpose-256x256.py \ + https://download.openmmlab.com/mmpose/animal/hrnet/hrnet_w32_animalpose_256x256-1aa7f075_20210426.pth \ + --input tests/data/animalpose/ca110.jpeg \ + --output-root vis_results --save-predictions --draw-heatmap --det-cat-id=15 +``` + To run demos on CPU: ```shell diff --git a/demo/docs/2d_face_demo.md b/demo/docs/2d_face_demo.md index d72d33307b..09b8ceb330 100644 --- a/demo/docs/2d_face_demo.md +++ b/demo/docs/2d_face_demo.md @@ -1,29 +1,28 @@ ## 2D Face Keypoint Demo -We provide a demo script to test a single image or video with face detectors and top-down pose estimators, Please install `face_recognition` before running the demo, by: +We provide a demo script to test a single image or video with face detectors and top-down pose estimators. Assume that you have already installed [mmdet](https://github.com/open-mmlab/mmdetection) with version >= 3.0. -``` -pip install face_recognition -``` - -For more details, please refer to [face_recognition](https://github.com/ageitgey/face_recognition). +**Face Box Model Preparation:** The pre-trained face box estimation model can be found in [mmdet model zoo](/demo/docs/mmdet_modelzoo.md). 
### 2D Face Image Demo ```shell -python demo/topdown_face_demo.py \ +python demo/topdown_demo_with_mmdet.py \ + ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \ ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \ --input ${INPUT_PATH} [--output-root ${OUTPUT_DIR}] \ - [--show] [--device ${GPU_ID or CPU}] \ + [--show] [--device ${GPU_ID or CPU}] [--save-predictions] \ [--draw-heatmap ${DRAW_HEATMAP}] [--radius ${KPT_RADIUS}] \ - [--kpt-thr ${KPT_SCORE_THR}] + [--kpt-thr ${KPT_SCORE_THR}] [--bbox-thr ${BBOX_SCORE_THR}] ``` The pre-trained face keypoint estimation models can be found from [model zoo](https://mmpose.readthedocs.io/en/1.x/model_zoo/face_2d_keypoint.html). Take [aflw model](https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth) as an example: ```shell -python demo/topdown_face_demo.py \ +python demo/topdown_demo_with_mmdet.py \ + demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py \ + https://download.openmmlab.com/mmpose/mmdet_pretrained/yolo-x_8xb8-300e_coco-face_13274d7c.pth \ configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py \ https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth \ --input tests/data/cofw/001766.jpg \ @@ -32,24 +31,30 @@ python demo/topdown_face_demo.py \ Visualization result: -
+
If you use a heatmap-based model and set argument `--draw-heatmap`, the predicted heatmap will be visualized together with the keypoints. To save visualized results on disk: ```shell -python demo/topdown_face_demo.py \ +python demo/topdown_demo_with_mmdet.py \ + demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py \ + https://download.openmmlab.com/mmpose/mmdet_pretrained/yolo-x_8xb8-300e_coco-face_13274d7c.pth \ configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py \ https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth \ --input tests/data/cofw/001766.jpg \ --draw-heatmap --output-root vis_results ``` +To save the predicted results on disk, please specify `--save-predictions`. + To run demos on CPU: ```shell -python demo/topdown_face_demo.py \ +python demo/topdown_demo_with_mmdet.py \ + demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py \ + https://download.openmmlab.com/mmpose/mmdet_pretrained/yolo-x_8xb8-300e_coco-face_13274d7c.pth \ configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py \ https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth \ --input tests/data/cofw/001766.jpg \ @@ -61,14 +66,16 @@ python demo/topdown_face_demo.py \ Videos share the same interface with images. The difference is that the `${INPUT_PATH}` for videos can be the local path or **URL** link to video file. ```shell -python demo/topdown_face_demo.py \ +python demo/topdown_demo_with_mmdet.py \ + demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py \ + https://download.openmmlab.com/mmpose/mmdet_pretrained/yolo-x_8xb8-300e_coco-face_13274d7c.pth \ configs/face_2d_keypoint/topdown_heatmap/aflw/td-hm_hrnetv2-w18_8xb64-60e_aflw-256x256.py \ https://download.openmmlab.com/mmpose/face/hrnetv2/hrnetv2_w18_aflw_256x256-f2bbc62b_20210125.pth \ --input demo/resources/ \ --show --draw-heatmap --output-root vis_results ``` -
+
The original video can be downloaded from [Google Drive](https://drive.google.com/file/d/1kQt80t6w802b_vgVcmiV_QfcSJ3RWzmb/view?usp=sharing). diff --git a/demo/docs/2d_hand_demo.md b/demo/docs/2d_hand_demo.md index fe1ba18692..d120108c54 100644 --- a/demo/docs/2d_hand_demo.md +++ b/demo/docs/2d_hand_demo.md @@ -11,7 +11,7 @@ python demo/topdown_demo_with_mmdet.py \ ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \ ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \ --input ${INPUT_PATH} [--output-root ${OUTPUT_DIR}] \ - [--show] [--device ${GPU_ID or CPU}] \ + [--show] [--device ${GPU_ID or CPU}] [--save-predictions] \ [--draw-heatmap ${DRAW_HEATMAP}] [--radius ${KPT_RADIUS}] \ [--kpt-thr ${KPT_SCORE_THR}] [--bbox-thr ${BBOX_SCORE_THR}] @@ -48,6 +48,8 @@ python demo/topdown_demo_with_mmdet.py \ --output-root vis_results --show --draw-heatmap ``` +To save the predicted results on disk, please specify `--save-predictions`. + To run demos on CPU: ```shell diff --git a/demo/docs/2d_human_pose_demo.md b/demo/docs/2d_human_pose_demo.md index 60f3c80bce..963ba25a92 100644 --- a/demo/docs/2d_human_pose_demo.md +++ b/demo/docs/2d_human_pose_demo.md @@ -57,9 +57,9 @@ python demo/topdown_demo_with_mmdet.py \ ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \ ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \ --input ${INPUT_PATH} \ - --output-root ${OUTPUT_DIR} \ - [--show --draw-heatmap --device ${GPU_ID or CPU}] \ - [--bbox-thr ${BBOX_SCORE_THR} --kpt-thr ${KPT_SCORE_THR}] + [--output-root ${OUTPUT_DIR}] [--save-predictions] \ + [--show] [--draw-heatmap] [--device ${GPU_ID or CPU}] \ + [--bbox-thr ${BBOX_SCORE_THR}] [--kpt-thr ${KPT_SCORE_THR}] ``` Example: @@ -78,6 +78,8 @@ Visualization result:
+To save the predicted results on disk, please specify `--save-predictions`. + ### 2D Human Pose Top-Down Video Demo The above demo script can also take video as input, and run mmdet for human detection, and mmpose for pose estimation. The difference is, the `${INPUT_PATH}` for videos can be the local path or **URL** link to video file. @@ -96,6 +98,33 @@ python demo/topdown_demo_with_mmdet.py \ --output-root=vis_results/demo --show --draw-heatmap ``` +### 2D Human Pose Bottom-up Image/Video Demo + +We also provide a demo script using bottom-up models to estimate the human pose in an image or a video, which does not rely on human detectors. + +```shell +python demo/bottomup_demo.py \ + ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \ + --input ${INPUT_PATH} \ + [--output-root ${OUTPUT_DIR}] [--save-predictions] \ + [--show] [--device ${GPU_ID or CPU}] \ + [--kpt-thr ${KPT_SCORE_THR}] +``` + +Example: + +```shell +python demo/bottomup_demo.py \ + configs/body_2d_keypoint/dekr/coco/dekr_hrnet-w32_8xb10-140e_coco-512x512.py \ + https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_coco_512x512-2a3056de_20220928.pth \ + --input tests/data/coco/000000197388.jpg --output-root=vis_results \ + --show --save-predictions +``` + +Visualization result: + +
+ ### Speed Up Inference Some tips to speed up MMPose inference: diff --git a/demo/docs/2d_wholebody_pose_demo.md b/demo/docs/2d_wholebody_pose_demo.md index 8551388172..1615778429 100644 --- a/demo/docs/2d_wholebody_pose_demo.md +++ b/demo/docs/2d_wholebody_pose_demo.md @@ -47,9 +47,9 @@ python demo/topdown_demo_with_mmdet.py \ ${MMDET_CONFIG_FILE} ${MMDET_CHECKPOINT_FILE} \ ${MMPOSE_CONFIG_FILE} ${MMPOSE_CHECKPOINT_FILE} \ --input ${INPUT_PATH} \ - --output-root ${OUTPUT_DIR} \ - [--show --draw-heatmap --device ${GPU_ID or CPU}] \ - [--bbox-thr ${BBOX_SCORE_THR} --kpt-thr ${KPT_SCORE_THR}] + [--output-root ${OUTPUT_DIR}] [--save-predictions] \ + [--show] [--draw-heatmap] [--device ${GPU_ID or CPU}] \ + [--bbox-thr ${BBOX_SCORE_THR}] [--kpt-thr ${KPT_SCORE_THR}] ``` Examples: @@ -64,6 +64,8 @@ python demo/topdown_demo_with_mmdet.py \ --output-root vis_results/ --show ``` +To save the predicted results on disk, please specify `--save-predictions`. + ### 2D Human Whole-Body Pose Top-Down Video Demo The above demo script can also take video as input, and run mmdet for human detection, and mmpose for pose estimation. 
diff --git a/demo/docs/mmdet_modelzoo.md b/demo/docs/mmdet_modelzoo.md index d438a5e982..a50be168a5 100644 --- a/demo/docs/mmdet_modelzoo.md +++ b/demo/docs/mmdet_modelzoo.md @@ -15,6 +15,16 @@ For hand bounding box detection, we simply train our hand box models on onehand1 | :---------------------------------------------------------------- | :----: | :---------------------------------------------------------------: | :--------------------------------------------------------------: | | [Cascade_R-CNN X-101-64x4d-FPN-1class](/demo/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py) | 0.817 | [ckpt](https://download.openmmlab.com/mmpose/mmdet_pretrained/cascade_rcnn_x101_64x4d_fpn_20e_onehand10k-dac19597_20201030.pth) | [log](https://download.openmmlab.com/mmpose/mmdet_pretrained/cascade_rcnn_x101_64x4d_fpn_20e_onehand10k_20201030.log.json) | +### Face Bounding Box Detection Models + +For face bounding box detection, we train a YOLOX detector on COCO-face data using MMDetection. + +#### Face detection results on COCO-face test set + +| Arch | Box AP | ckpt | +| :-------------------------------------------------------------- | :----: | :----------------------------------------------------------------------------------------------------: | +| [YOLOX-s](/demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py) | 0.408 | [ckpt](https://download.openmmlab.com/mmpose/mmdet_pretrained/yolo-x_8xb8-300e_coco-face_13274d7c.pth) | + ### Animal Bounding Box Detection Models #### COCO animals diff --git a/demo/image_demo.py b/demo/image_demo.py index 17f1e34233..0aa4a9e057 100644 --- a/demo/image_demo.py +++ b/demo/image_demo.py @@ -6,7 +6,6 @@ from mmpose.apis import inference_topdown, init_model from mmpose.registry import VISUALIZERS from mmpose.structures import merge_data_samples -from mmpose.utils import register_all_modules def parse_args(): @@ -21,13 +20,22 @@ def parse_args(): '--draw-heatmap', action='store_true', help='Visualize the predicted heatmap') + 
parser.add_argument( + '--show-kpt-idx', + action='store_true', + default=False, + help='Whether to show the index of keypoints') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='whether to show img') args = parser.parse_args() return args -def main(args): - # register all modules in mmpose into the registries - register_all_modules() +def main(): + args = parse_args() # build the model from a config file and a checkpoint file if args.draw_heatmap: @@ -46,8 +54,8 @@ def main(args): visualizer.set_dataset_meta(model.dataset_meta) # inference a single image - results = inference_topdown(model, args.img) - results = merge_data_samples(results) + batch_results = inference_topdown(model, args.img) + results = merge_data_samples(batch_results) # show the results img = imread(args.img, channel_order='rgb') @@ -58,10 +66,10 @@ def main(args): draw_gt=False, draw_bbox=True, draw_heatmap=args.draw_heatmap, - show=True, + show_kpt_idx=args.show_kpt_idx, + show=args.show, out_file=args.out_file) if __name__ == '__main__': - args = parse_args() - main(args) + main() diff --git a/demo/inferencer_demo.py b/demo/inferencer_demo.py new file mode 100644 index 0000000000..df4877e6d9 --- /dev/null +++ b/demo/inferencer_demo.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser + +from mmpose.apis.inferencers import MMPoseInferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'inputs', type=str, help='Input image/video path or folder path.') + parser.add_argument( + '--pose2d', + type=str, + default=None, + help='Pretrained 2D pose estimation algorithm. It\'s the path to the ' + 'config file or the model name defined in metafile.') + parser.add_argument( + '--pose2d-weights', + type=str, + default=None, + help='Path to the custom checkpoint file of the selected pose model. 
' + 'If it is not specified and "pose2d" is a model name of metafile, ' + 'the weights will be loaded from metafile.') + parser.add_argument( + '--det-model', + type=str, + default=None, + help='Config path or alias of detection model.') + parser.add_argument( + '--det-weights', + type=str, + default=None, + help='Path to the checkpoints of detection model.') + parser.add_argument( + '--det-cat-ids', + type=int, + nargs='+', + default=None, + help='Category id for detection model.') + parser.add_argument( + '--device', + type=str, + default=None, + help='Device used for inference. ' + 'If not specified, the available device will be automatically used.') + parser.add_argument( + '--show', + action='store_true', + help='Display the image/video in a popup window.') + parser.add_argument( + '--bbox-thr', + type=float, + default=0.3, + help='Bounding box score threshold') + parser.add_argument( + '--nms-thr', + type=float, + default=0.3, + help='IoU threshold for bounding box NMS') + parser.add_argument( + '--kpt-thr', type=float, default=0.3, help='Keypoint score threshold') + parser.add_argument( + '--radius', + type=int, + default=3, + help='Keypoint radius for visualization.') + parser.add_argument( + '--thickness', + type=int, + default=1, + help='Link thickness for visualization.') + parser.add_argument( + '--vis-out-dir', + type=str, + default='', + help='Directory for saving visualized results.') + parser.add_argument( + '--pred-out-dir', + type=str, + default='', + help='Directory for saving inference results.') + + call_args = vars(parser.parse_args()) + + init_kws = [ + 'pose2d', 'pose2d_weights', 'device', 'det_model', 'det_weights', + 'det_cat_ids' + ] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + return init_args, call_args + + +def main(): + init_args, call_args = parse_args() + inferencer = MMPoseInferencer(**init_args) + for _ in inferencer(**call_args): + pass + + +if __name__ == '__main__': + main() diff 
--git a/demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py b/demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py new file mode 100644 index 0000000000..9180b831e6 --- /dev/null +++ b/demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py @@ -0,0 +1,306 @@ +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=300, val_interval=10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +param_scheduler = [ + dict( + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=0.0005, + begin=5, + T_max=285, + end=285, + by_epoch=True, + convert_to_iter_based=True), + dict(type='ConstantLR', by_epoch=True, factor=1, begin=285, end=300) +] +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) +auto_scale_lr = dict(enable=False, base_batch_size=64) +default_scope = 'mmdet' +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=10, max_keep_ckpts=3), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend')], + name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) +log_level = 'INFO' +load_from = 'https://download.openmmlab.com/mmdetection/' \ + 'v2.0/yolox/yolox_s_8x8_300e_coco/' \ + 'yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth' +resume = False +img_scale = (640, 640) +model = dict( + type='YOLOX', + 
data_preprocessor=dict( + type='DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=10) + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(2, 3, 4), + use_depthwise=False, + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + bbox_head=dict( + type='YOLOXHead', + num_classes=1, + in_channels=128, + feat_channels=128, + stacked_convs=2, + strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)), + train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), + test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomAffine', scaling_ratio_range=(0.1, 2), + border=(-320, -320)), + dict( + type='MixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + 
pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] +train_dataset = dict( + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='LoadImageFromFile', + file_client_args=dict(backend='disk')), + dict(type='LoadAnnotations', with_bbox=True) + ], + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + pipeline=[ + dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-320, -320)), + dict( + type='MixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') + ]) +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/coco_face_train.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='LoadImageFromFile', + file_client_args=dict(backend='disk')), + 
dict(type='LoadAnnotations', with_bbox=True) + ], + filter_cfg=dict(filter_empty_gt=False, min_size=32), + metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60))), + pipeline=[ + dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-320, -320)), + dict( + type='MixUp', + img_scale=(640, 640), + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict(type='PackDetInputs') + ])) +val_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/coco_face_val.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict( + type='LoadImageFromFile', + file_client_args=dict(backend='disk')), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ], + metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60)))) +test_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/coco_face_val.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=[ + dict( + type='LoadImageFromFile', + file_client_args=dict(backend='disk')), + dict(type='Resize', scale=(640, 640), 
keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) + ], + metainfo=dict(CLASSES=('person', ), PALETTE=(220, 20, 60)))) +val_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/coco_face_val.json', + metric='bbox') +test_evaluator = dict( + type='CocoMetric', + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') +max_epochs = 300 +num_last_epochs = 15 +interval = 10 +base_lr = 0.01 +custom_hooks = [ + dict(type='YOLOXModeSwitchHook', num_last_epochs=15, priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + strict_load=False, + update_buffers=True, + priority=49) +] +metainfo = dict(CLASSES=('person', ), PALETTE=(220, 20, 60)) +launcher = 'pytorch' diff --git a/demo/topdown_demo_with_mmdet.py b/demo/topdown_demo_with_mmdet.py index 3e4d766f23..418f3695b9 100644 --- a/demo/topdown_demo_with_mmdet.py +++ b/demo/topdown_demo_with_mmdet.py @@ -4,6 +4,7 @@ import tempfile from argparse import ArgumentParser +import json_tricks as json import mmcv import mmengine import numpy as np @@ -12,39 +13,35 @@ from mmpose.apis import init_model as init_pose_estimator from mmpose.evaluation.functional import nms from mmpose.registry import VISUALIZERS -from mmpose.structures import merge_data_samples -from mmpose.utils import register_all_modules as register_mmpose_modules +from mmpose.structures import merge_data_samples, split_instances +from mmpose.utils import adapt_mmdet_pipeline try: from mmdet.apis import inference_detector, init_detector - from mmdet.utils import register_all_modules as register_mmdet_modules has_mmdet = True except (ImportError, ModuleNotFoundError): has_mmdet = False -def visualize_img(args, img_path, detector, pose_estimator, 
visualizer, - show_interval): +def process_one_image(args, img_path, detector, pose_estimator, visualizer, + show_interval): """Visualize predicted keypoints (and heatmaps) of one image.""" # predict bbox - register_mmdet_modules() - detect_result = inference_detector(detector, img_path) - pred_instance = detect_result.pred_instances.cpu().numpy() + det_result = inference_detector(detector, img_path) + pred_instance = det_result.pred_instances.cpu().numpy() bboxes = np.concatenate( (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1) bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, pred_instance.scores > args.bbox_thr)] - bboxes = bboxes[nms(bboxes, args.nms_thr)][:, :4] + bboxes = bboxes[nms(bboxes, args.nms_thr), :4] # predict keypoints - register_mmpose_modules() pose_results = inference_topdown(pose_estimator, img_path, bboxes) data_samples = merge_data_samples(pose_results) # show the results - img = mmcv.imread(img_path) - img = mmcv.imconvert(img, 'bgr', 'rgb') + img = mmcv.imread(img_path, channel_order='rgb') out_file = None if args.output_root: @@ -56,12 +53,16 @@ def visualize_img(args, img_path, detector, pose_estimator, visualizer, data_sample=data_samples, draw_gt=False, draw_heatmap=args.draw_heatmap, - draw_bbox=False, + draw_bbox=args.draw_bbox, + show_kpt_idx=args.show_kpt_idx, show=args.show, wait_time=show_interval, out_file=out_file, kpt_score_thr=args.kpt_thr) + # if there is no instance detected, return None + return data_samples.get('pred_instances', None) + def main(): """Visualize the demo images. @@ -86,6 +87,11 @@ def main(): default='', help='root of the output img file. 
' 'Default not saving the visualization images.') + parser.add_argument( + '--save-predictions', + action='store_true', + default=False, + help='whether to save predicted results') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( @@ -109,7 +115,12 @@ def main(): '--draw-heatmap', action='store_true', default=False, - help='Whether to draw output heatmap') + help='Draw heatmap predicted by the model') + parser.add_argument( + '--show-kpt-idx', + action='store_true', + default=False, + help='Whether to show the index of keypoints') parser.add_argument( '--radius', type=int, @@ -120,6 +131,8 @@ def main(): type=int, default=1, help='Link thickness for visualization') + parser.add_argument( + '--draw-bbox', action='store_true', help='Draw bboxes of instances') assert has_mmdet, 'Please install mmdet to run the demo.' @@ -131,14 +144,17 @@ def main(): assert args.det_checkpoint is not None if args.output_root: mmengine.mkdir_or_exist(args.output_root) + if args.save_predictions: + assert args.output_root != '' + args.pred_save_path = f'{args.output_root}/results_' \ + f'{os.path.splitext(os.path.basename(args.input))[0]}.json' # build detector - register_mmdet_modules() detector = init_detector( args.det_config, args.det_checkpoint, device=args.device) + detector.cfg = adapt_mmdet_pipeline(detector.cfg) # build pose estimator - register_mmpose_modules() pose_estimator = init_pose_estimator( args.pose_config, args.pose_checkpoint, @@ -156,13 +172,15 @@ def main(): input_type = mimetypes.guess_type(args.input)[0].split('/')[0] if input_type == 'image': - visualize_img( + pred_instances = process_one_image( args, args.input, detector, pose_estimator, visualizer, show_interval=0) + pred_instances_list = split_instances(pred_instances) + elif input_type == 'video': tmp_folder = tempfile.TemporaryDirectory() video = mmcv.VideoReader(args.input) @@ -170,15 +188,23 @@ def main(): video.cvt2frames(tmp_folder.name, 
show_progress=False) output_root = args.output_root args.output_root = tmp_folder.name - for img_fname in os.listdir(tmp_folder.name): - visualize_img( + pred_instances_list = [] + + for frame_id, img_fname in enumerate(os.listdir(tmp_folder.name)): + pred_instances = process_one_image( args, f'{tmp_folder.name}/{img_fname}', detector, pose_estimator, visualizer, show_interval=1) + progressbar.update() + pred_instances_list.append( + dict( + frame_id=frame_id, + instances=split_instances(pred_instances))) + if output_root: mmcv.frames2video( tmp_folder.name, @@ -187,10 +213,22 @@ def main(): fourcc='mp4v', show_progress=False) tmp_folder.cleanup() + else: + args.save_predictions = False raise ValueError( f'file {os.path.basename(args.input)} has invalid format.') + if args.save_predictions: + with open(args.pred_save_path, 'w') as f: + json.dump( + dict( + meta_info=pose_estimator.dataset_meta, + instance_info=pred_instances_list), + f, + indent='\t') + print(f'predictions have been saved at {args.pred_save_path}') + if __name__ == '__main__': main() diff --git a/demo/topdown_face_demo.py b/demo/topdown_face_demo.py deleted file mode 100644 index b53a017a0f..0000000000 --- a/demo/topdown_face_demo.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mimetypes -import os -import tempfile -from argparse import ArgumentParser - -import mmcv -import mmengine -import numpy as np - -from mmpose.apis import inference_topdown -from mmpose.apis import init_model as init_pose_estimator -from mmpose.evaluation.functional import nms -from mmpose.registry import VISUALIZERS -from mmpose.structures import merge_data_samples -from mmpose.utils import register_all_modules as register_mmpose_modules - -try: - import face_recognition - has_face_det = True -except (ImportError, ModuleNotFoundError): - has_face_det = False - - -def process_face_det_results(face_det_results): - """Process det results, and return a list of bboxes. 
- - :param face_det_results: (top, right, bottom and left) - :return: a list of detected bounding boxes (x,y,x,y)-format - """ - - person_results = [] - for bbox in face_det_results: - # left, top, right, bottom - person_results.append([bbox[3], bbox[0], bbox[1], bbox[2]]) - person_results = np.array(person_results) - - return person_results - - -def visualize_img(args, img_path, pose_estimator, visualizer, show_interval): - """Visualize predicted keypoints (and heatmaps) of one image.""" - - # predict bbox - image = face_recognition.load_image_file(img_path) - face_det_results = face_recognition.face_locations(image) - bboxes = process_face_det_results(face_det_results) - - bboxes = np.concatenate((bboxes, np.ones((bboxes.shape[0], 1))), axis=1) - bboxes = bboxes[nms(bboxes, args.nms_thr)][:, :4] - - # predict keypoints - pose_results = inference_topdown(pose_estimator, img_path, bboxes) - data_samples = merge_data_samples(pose_results) - - # show the results - img = mmcv.imread(img_path) - img = mmcv.imconvert(img, 'bgr', 'rgb') - - out_file = None - if args.output_root: - out_file = f'{args.output_root}/{os.path.basename(img_path)}' - - visualizer.add_datasample( - 'result', - img, - data_sample=data_samples, - draw_gt=False, - draw_heatmap=args.draw_heatmap, - draw_bbox=False, - show=args.show, - wait_time=show_interval, - out_file=out_file, - kpt_score_thr=args.kpt_thr) - - -def main(): - """Visualize the demo images. - - Use `face_recognition` to detect the face. - """ - parser = ArgumentParser() - parser.add_argument('pose_config', help='Config file for pose') - parser.add_argument('pose_checkpoint', help='Checkpoint file for pose') - parser.add_argument( - '--input', type=str, default='', help='Image/Video file') - parser.add_argument( - '--show', - action='store_true', - default=False, - help='whether to show img') - parser.add_argument( - '--output-root', - type=str, - default='', - help='root of the output img file. 
' - 'Default not saving the visualization images.') - parser.add_argument( - '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--nms-thr', - type=float, - default=0.3, - help='IoU threshold for bounding box NMS') - parser.add_argument( - '--kpt-thr', type=float, default=0.3, help='Keypoint score threshold') - parser.add_argument( - '--draw-heatmap', - action='store_true', - default=False, - help='Whether to draw output heatmap') - parser.add_argument( - '--radius', - type=int, - default=2, - help='Keypoint radius for visualization') - parser.add_argument( - '--thickness', - type=int, - default=1, - help='Link thickness for visualization') - - assert has_face_det, 'Please install face_recognition to run the demo. ' \ - '"pip install face_recognition", For more details, ' \ - 'see https://github.com/ageitgey/face_recognition' - - args = parser.parse_args() - - assert args.show or (args.output_root != '') - assert args.input != '' - if args.output_root: - mmengine.mkdir_or_exist(args.output_root) - - # build pose estimator - register_mmpose_modules() - pose_estimator = init_pose_estimator( - args.pose_config, - args.pose_checkpoint, - device=args.device, - cfg_options=dict( - model=dict(test_cfg=dict(output_heatmaps=args.draw_heatmap)))) - - # init visualizer - pose_estimator.cfg.visualizer.radius = args.radius - pose_estimator.cfg.visualizer.line_width = args.thickness - visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer) - # the dataset_meta is loaded from the checkpoint and - # then pass to the model in init_pose_estimator - visualizer.set_dataset_meta(pose_estimator.dataset_meta) - visualizer.kpt_color = 'red' - - input_type = mimetypes.guess_type(args.input)[0].split('/')[0] - if input_type == 'image': - visualize_img( - args, args.input, pose_estimator, visualizer, show_interval=0) - elif input_type == 'video': - tmp_folder = tempfile.TemporaryDirectory() - video = mmcv.VideoReader(args.input) - progressbar = 
mmengine.ProgressBar(len(video)) - video.cvt2frames(tmp_folder.name, show_progress=False) - output_root = args.output_root - args.output_root = tmp_folder.name - for img_fname in os.listdir(tmp_folder.name): - visualize_img( - args, - f'{tmp_folder.name}/{img_fname}', - pose_estimator, - visualizer, - show_interval=1) - progressbar.update() - if output_root: - mmcv.frames2video( - tmp_folder.name, - f'{output_root}/{os.path.basename(args.input)}', - fps=video.fps, - fourcc='mp4v', - show_progress=False) - tmp_folder.cleanup() - else: - raise ValueError( - f'file {os.path.basename(args.input)} has invalid format.') - - -if __name__ == '__main__': - main() diff --git a/demo/webcam_cfg/pose_estimation.py b/demo/webcam_cfg/pose_estimation.py index 6f34ce64c6..2246a00779 100644 --- a/demo/webcam_cfg/pose_estimation.py +++ b/demo/webcam_cfg/pose_estimation.py @@ -85,9 +85,9 @@ enable=False, input_buffer='vis', output_buffer='vis_sunglasses'), - # # 'BigeyeEffectNode': - # # This node draw the big-eye effetc in the frame image. - # # Pose results is needed. + # 'BigeyeEffectNode': + # This node draws the big-eye effect in the frame image. + # Pose results are needed. dict( type='BigeyeEffectNode', name='big-eye', diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile index ac7162effd..091599b51a 100644 --- a/docker/serve/Dockerfile +++ b/docker/serve/Dockerfile @@ -3,8 +3,8 @@ ARG CUDA="10.2" ARG CUDNN="7" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel -ARG MMCV="2.0.0rc1" -ARG MMPOSE="1.0.0b0" +ARG MMCV="2.0.0rc4" +ARG MMPOSE="1.0.0rc1" ENV PYTHONUNBUFFERED TRUE diff --git a/docs/en/advanced_guides.md b/docs/en/advanced_guides.md deleted file mode 100644 index 25fcefe33e..0000000000 --- a/docs/en/advanced_guides.md +++ /dev/null @@ -1,3 +0,0 @@ -# Advanced Guides - -Work in progress... 
diff --git a/docs/en/advanced_guides/advanced_training.md b/docs/en/advanced_guides/advanced_training.md new file mode 100644 index 0000000000..ed079d16a5 --- /dev/null +++ b/docs/en/advanced_guides/advanced_training.md @@ -0,0 +1,96 @@ +# Advanced Training + +## Resume Training + +Resume training means to continue training from the state saved from one of the previous trainings, where the state includes the model weights, the state of the optimizer and the optimizer parameter adjustment strategy. + +### Automatically resume training + +Users can add `--resume` to the end of the training command to resume training. The program will automatically load the latest weight file from `work_dirs` to resume training. If there is a latest `checkpoint` in `work_dirs` (e.g. the training was interrupted during the previous training), the training will be resumed from the `checkpoint`. Otherwise (e.g. the previous training did not save `checkpoint` in time or a new training task was started), the training will be restarted. + +Here is an example of resuming training: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --resume +``` + +### Specify the checkpoint to resume training + +You can also specify the `checkpoint` path for `--resume`. MMPose will automatically read the `checkpoint` and resume training from it. The command is as follows: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py \ + --resume work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth +``` + +If you hope to manually specify the `checkpoint` path in the config file, in addition to setting `resume=True`, you also need to set the `load_from`. 
+ +It should be noted that if only `load_from` is set without setting `resume=True`, only the weights in the `checkpoint` will be loaded and the training will be restarted from scratch, instead of continuing from the previous state. + +The following example is equivalent to the example above that specifies the `--resume` parameter: + +```python +resume = True +load_from = 'work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth' +# model settings +model = dict( + ## omitted ## + ) +``` + +## Automatic Mixed Precision (AMP) Training + +Mixed precision training can reduce training time and storage requirements without changing the model or reducing the model training accuracy, thus supporting larger batch sizes, larger models, and larger input sizes. + +To enable Automatic Mixed Precision (AMP) training, add `--amp` to the end of the training command, which is as follows: + +```shell +python tools/train.py ${CONFIG_FILE} --amp +``` + +Specific examples are as follows: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --amp +``` + +## Set the random seed + +If you want to specify the random seed during training, you can use the following command: + +```shell +python ./tools/train.py \ + ${CONFIG} \ # config file + --cfg-options randomness.seed=2023 \ # set the random seed = 2023 + [randomness.diff_rank_seed=True] \ # Set different seeds according to rank. + [randomness.deterministic=True] # Set the cuDNN backend deterministic option to True +# `[]` stands for optional parameters, when actually entering the command line, you do not need to enter `[]` +``` + +`randomness` has three parameters that can be set, with the following meanings. + +- `randomness.seed=2023`, set the random seed to `2023`. + +- `randomness.diff_rank_seed=True`, set different seeds according to global `rank`. Defaults to `False`. 
+ +- `randomness.deterministic=True`, set the deterministic option for `cuDNN` backend, i.e., set `torch.backends.cudnn.deterministic` to `True` and `torch.backends.cudnn.benchmark` to `False`. Defaults to `False`. See [PyTorch Randomness](https://pytorch.org/docs/stable/notes/randomness.html) for more details. + +## Use Tensorboard to Visualize Training + +Install Tensorboard environment + +```shell +pip install tensorboard +``` + +Enable Tensorboard in the config file + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) +``` + +After training, you can use the following command to visualize the training process. + +```shell +tensorboard --logdir work_dirs/${CONFIG}/${TIMESTAMP}/vis_data +``` diff --git a/docs/en/advanced_guides/mixed_datasets.md b/docs/en/advanced_guides/mixed_datasets.md new file mode 100644 index 0000000000..1c595314b7 --- /dev/null +++ b/docs/en/advanced_guides/mixed_datasets.md @@ -0,0 +1,159 @@ +# Training with Mixed Datasets + +MMPose offers a convenient and versatile solution for training with mixed datasets through its `CombinedDataset` tool. Acting as a wrapper, it allows for the inclusion of multiple datasets and seamlessly reads and converts data from varying sources into a unified format for model training. The data processing pipeline utilizing `CombinedDataset` is illustrated in the following figure. + +![combined_dataset_pipeline](https://user-images.githubusercontent.com/26127467/223333154-fb88e511-810a-423c-b755-c791d296bc43.jpg) + +The following section will provide a detailed description of how to configure `CombinedDataset` with an example that combines the COCO and AI Challenger (AIC) datasets. + +## COCO & AIC example + +The COCO and AIC datasets are both human 2D pose datasets, but they differ in the number and order of keypoints. Here are two instances from the respective datasets. + +
+ +Some keypoints, such as "left hand", are defined in both datasets, but they have different indices. Specifically, the index for the "left hand" keypoint is 9 in the COCO dataset and 5 in the AIC dataset. Furthermore, each dataset contains unique keypoints that are not present in the counterpart dataset. For instance, the facial keypoints (with indices 0~4) are only defined in the COCO dataset, whereas the "head top" (with index 12) and "neck" (with index 13) keypoints are exclusive to the AIC dataset. The relationship between the keypoints in both datasets is illustrated in the following Venn diagram. + +
+ +Next, we will discuss two methods of mixing datasets. + +- [Merge](#merge-aic-into-coco) +- [Combine](#combine-aic-and-coco) + +### Merge AIC into COCO + +If users aim to enhance their model's performance on the COCO dataset or other similar datasets, they can use the AIC dataset as an auxiliary source. To do so, they should select only the keypoints in the AIC dataset that are shared with the COCO dataset and ignore the rest. Moreover, the indices of these chosen keypoints in the AIC dataset should be transformed to match the corresponding indices in the COCO dataset. + +
+ +In this scenario, no data conversion is required for the elements from the COCO dataset. To configure the COCO dataset, use the following code: + +```python +dataset_coco = dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[], # Leave the `pipeline` empty, as no conversion is needed +) +``` + +For AIC dataset, the order of the keypoints needs to be transformed. MMPose provides a `KeypointConverter` transform to achieve this. Here's an example of how to configure the AIC sub dataset: + +```python +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, # same as COCO dataset + mapping=[ # includes index pairs for corresponding keypoints + (0, 6), # index 0 (in AIC) -> index 6 (in COCO) + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) +``` + +By using the `KeypointConverter`, the indices of keypoints with indices 0 to 11 will be transformed to corresponding indices among 5 to 16. Meanwhile, the keypoints with indices 12 and 13 will be removed. For the target keypoints with indices 0 to 4, which are not defined in the `mapping` argument, they will be set as invisible and won't be used in training. 
+ +Once the sub datasets are configured, the `CombinedDataset` wrapper can be defined as follows: + +```python +dataset = dict( + type='CombinedDataset', + # Since the combined dataset has the same data format as COCO, + # it should use the same meta information for the dataset + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + # The pipeline includes typical transforms, such as loading the + # image and data augmentation + pipeline=train_pipeline, +) +``` + +A complete, ready-to-use [config file](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) that merges the AIC dataset into the COCO dataset is also available. Users can refer to it for more details and use it as a template to build their own custom dataset. + +### Combine AIC and COCO + +The previously mentioned method discards some annotations in the AIC dataset. If users want to use all the information from both datasets, they can combine the two datasets. This means taking the union set of keypoints in both datasets. + +
+ +In this scenario, both COCO and AIC datasets need to adjust the keypoint indices using `KeypointConverter`: + +```python +dataset_coco = dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, # the size of union keypoint set + mapping=[ + (0, 0), + (1, 1), + # omitted + (16, 16), + ]) + ]) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, # the size of union keypoint set + mapping=[ + (0, 6), + # omitted + (12, 17), + (13, 18), + ]) + ], +) +``` + +To account for the fact that the combined dataset has 19 keypoints, which is different from either COCO or AIC dataset, a new dataset meta information file is needed to describe the new dataset. An example of such a file is [coco_aic.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/_base_/datasets/coco_aic.py), which is based on [coco.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/_base_/datasets/coco.py) but includes several updates: + +- The paper information of AIC dataset has been added. +- The 'head_top' and 'neck' keypoints, which are unique in AIC, have been added to the `keypoint_info`. +- A skeleton link between 'head_top' and 'neck' has been added. +- The `joint_weights` and `sigmas` have been extended for the newly added keypoints. 
+ +Finally, the combined dataset can be configured as: + +```python +dataset = dict( + type='CombinedDataset', + # using new dataset meta information file + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + # The pipeline includes typical transforms, such as loading the + # image and data augmentation + pipeline=train_pipeline, +) +``` + +Additionally, the output channel number of the model should be adjusted as the number of keypoints changes. If the users aim to evaluate the model on the COCO dataset, a subset of model outputs must be chosen. This subset can be customized using the `output_keypoint_indices` argument in `test_cfg`. Users can refer to the [config file](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py), which combines the COCO and AIC dataset, for more details and use it as a template to create their custom dataset. 
diff --git a/docs/en/dataset_zoo/2d_animal_keypoint.md b/docs/en/dataset_zoo/2d_animal_keypoint.md index a80b539fc6..956bed3ad0 100644 --- a/docs/en/dataset_zoo/2d_animal_keypoint.md +++ b/docs/en/dataset_zoo/2d_animal_keypoint.md @@ -8,7 +8,7 @@ MMPose supported datasets: - [Animal-Pose](#animal-pose) \[ [Homepage](https://sites.google.com/view/animal-pose/) \] - [AP-10K](#ap-10k) \[ [Homepage](https://github.com/AlexTheBad/AP-10K/) \] - [Horse-10](#horse-10) \[ [Homepage](http://www.mackenziemathislab.org/horse10) \] -- [MacaquePose](#macaquepose) \[ [Homepage](http://www2.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html) \] +- [MacaquePose](#macaquepose) \[ [Homepage](http://pri.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html) \] - [Vinegar Fly](#vinegar-fly) \[ [Homepage](https://github.com/jgraving/DeepPoseKit-Data) \] - [Desert Locust](#desert-locust) \[ [Homepage](https://github.com/jgraving/DeepPoseKit-Data) \] - [Grévy’s Zebra](#grvys-zebra) \[ [Homepage](https://github.com/jgraving/DeepPoseKit-Data) \] @@ -216,7 +216,7 @@ mmpose
-For [MacaquePose](http://www2.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html) dataset, images can be downloaded from [download](http://www2.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html). +For [MacaquePose](http://pri.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html) dataset, images can be downloaded from [download](http://pri.ehub.kyoto-u.ac.jp/datasets/macaquepose/download.php). Please download the annotation files from [macaque_annotations](https://download.openmmlab.com/mmpose/datasets/macaque_annotations.tar). Extract them under {MMPose}/data, and make them look like this: diff --git a/docs/en/dataset_zoo/2d_hand_keypoint.md b/docs/en/dataset_zoo/2d_hand_keypoint.md index 1d369e775b..72cdf485b3 100644 --- a/docs/en/dataset_zoo/2d_hand_keypoint.md +++ b/docs/en/dataset_zoo/2d_hand_keypoint.md @@ -184,7 +184,7 @@ year = {2020}
For [InterHand2.6M](https://mks0601.github.io/InterHand2.6M/), please download from [InterHand2.6M](https://mks0601.github.io/InterHand2.6M/). -Please download the annotation files from [annotations](https://drive.google.com/drive/folders/1pWXhdfaka-J0fSAze0MsajN0VpZ8e8tO). +Please download the annotation files from [annotations](https://download.openmmlab.com/mmpose/datasets/interhand2.6m_annotations.zip). Extract them under {MMPose}/data, and make them look like this: ```text diff --git a/docs/en/dataset_zoo/3d_hand_keypoint.md b/docs/en/dataset_zoo/3d_hand_keypoint.md index 17537e4476..8f5937be10 100644 --- a/docs/en/dataset_zoo/3d_hand_keypoint.md +++ b/docs/en/dataset_zoo/3d_hand_keypoint.md @@ -26,7 +26,7 @@ year = {2020} For [InterHand2.6M](https://mks0601.github.io/InterHand2.6M/), please download from [InterHand2.6M](https://mks0601.github.io/InterHand2.6M/). -Please download the annotation files from [annotations](https://drive.google.com/drive/folders/1pWXhdfaka-J0fSAze0MsajN0VpZ8e8tO). +Please download the annotation files from [annotations](https://download.openmmlab.com/mmpose/datasets/interhand2.6m_annotations.zip). Extract them under {MMPose}/data, and make them look like this: ```text diff --git a/docs/en/user_guides/useful_tools.md b/docs/en/dataset_zoo/dataset_tools.md similarity index 73% rename from docs/en/user_guides/useful_tools.md rename to docs/en/dataset_zoo/dataset_tools.md index 3e8316c069..3ff70fc401 100644 --- a/docs/en/user_guides/useful_tools.md +++ b/docs/en/dataset_zoo/dataset_tools.md @@ -1,132 +1,6 @@ -# Useful Tools +# Dataset Tools -Apart from training/testing scripts, We provide lots of useful tools under the `tools/` directory. 
- - - -- [Analysis Tools](#analysis-tools) - - [Log Analysis](#log-analysis) - - [Model Complexity (Experimental)](#model-complexity-experimental) - - [Print the entire config](#print-the-entire-config) -- [Dataset Tools](#dataset-tools) - - [Animal Pose](#animal-pose) - - [COFW](#cofw) - - [DeepposeKit](#deepposekit) - - [Macaque](#macaque) - - [Human36M](#human36m) - - [MPII](#mpii) - - - -## Analysis Tools - -### Log Analysis - -`tools/analysis_tools/analyze_logs.py` plots `loss_kpt` / `acc_pose` curves given a training log file. Run `pip install seaborn` first to install the dependency. - -![loss_kpt_curve_image](https://user-images.githubusercontent.com/87690686/188538215-5d985aaa-59f8-44cf-b6f9-10890d599e9c.png) - -```shell -python tools/analysis_tools/analyze_logs.py plot_curve ${JSON_LOGS} [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] -``` - -Examples: - -- Plot the `loss_kpt` of some run. - - ```shell - python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_kpt --legend loss_kpt - ``` - -- Plot the `acc_pose` of some run, and save the figure to a pdf. - - ```shell - python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys acc_pose --out results.pdf - ``` - -- Compare the `loss_kpt` of two runs in the same figure. - - ```shell - python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys loss_kpt --legend run1 run2 --title loss_kpt --out loss_kpt.png - ``` - -You can also compute the average training speed. - -```shell -python tools/analysis_tools/analyze_logs.py cal_train_time ${JSON_LOGS} [--include-outliers] -``` - -- Compute the average training speed for a config file, for example: - - ```shell - python tools/analysis_tools/analyze_logs.py cal_train_time log.json - ``` - - The output is expected to be like the following. 
- - ```text - -----Analyze train time of hrnet_w32_256x192.json----- - slowest epoch 56, average time is 0.6924 - fastest epoch 1, average time is 0.6502 - time std over epochs is 0.0085 - average iter time: 0.6688 s/iter - ``` - -### Model Complexity (Experimental) - -`/tools/analysis_tools/get_flops.py` is a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model. - -Usage: - -```shell -python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] [--cfg-options ${CFG_OPTIONS}] -``` - -Description of all arguments: - -- `CONFIG_FILE` : The path of a model config file. -- `--shape`: The input shape to the model. -- `--input-constructor`: If specified as `batch`, it will generate a batch tensor to calculate FLOPs. -- `--batch-size`:If `--input-constructor` is specified as `batch`, it will generate a random tensor with shape (batch_size, 3, \*\*input_shape) to calculate FLOPs. -- `--cfg-options`: If specified, the key-value pair optional cfg will be merged into config file. - -Examples: - -```shell -python tools/analysis_tools/get_flops.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py -``` - -We will get the following results: - -``` -============================== -Input shape: (1, 3, 256, 192) -Flops: 7.7 GFLOPs -Params: 28.54 M -============================== -``` - -```{note} -This tool is still experimental and we do not guarantee that the number is absolutely correct. -``` - -You may use the result for simple comparisons, but double check it before you adopt it in technical reports or papers. - -(1) FLOPs are related to the input shape while parameters are not. The default input shape is (1, 3, 256, 192). - -(2) Some operators are not counted into FLOPs like GN and custom operators. 
Refer to [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/flops_counter.py) for details. - -### Print the entire config - -`tools/analysis_tools/print_config.py` prints the whole config verbatim, expanding all its imports. - -```shell -python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] -``` - -## Dataset Tools - -### Animal Pose +## Animal Pose
Animal-Pose (ICCV'2019) @@ -205,7 +79,7 @@ We choose the images from PascalVOC for train & val. In total, we have 3608 imag 2798 images with 4000 annotations are used for training, and 810 images with 1117 annotations are used for validation. Those images from other sources (1000 images with 1000 annotations) are used for testing. -### COFW +## COFW
COFW (ICCV'2013) @@ -265,7 +139,7 @@ mmpose |── 000002.jpg ``` -### DeepposeKit +## DeepposeKit
Desert Locust (Elife'2019) @@ -333,7 +207,7 @@ For [Vinegar Fly](https://github.com/jgraving/DeepPoseKit-Data), [Desert Locust] Since the official dataset does not provide the test set, we randomly select 90% images for training, and the rest (10%) for evaluation. -### Macaque +## Macaque
MacaquePose (bioRxiv'2020) @@ -383,7 +257,7 @@ For [MacaquePose](http://www2.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html Since the official dataset does not provide the test set, we randomly select 12500 images for training, and the rest for evaluation. -### Human36M +## Human3.6M
Human3.6M (TPAMI'2014) @@ -459,7 +333,7 @@ After that, the annotations need to be transformed into COCO format which is com python tools/dataset_converters/h36m_to_coco.py ``` -### MPII +## MPII
MPII (CVPR'2014) @@ -476,7 +350,7 @@ python tools/dataset_converters/h36m_to_coco.py
-During training and inference for [MPII dataset](<[MPII](http://human-pose.mpi-inf.mpg.de/)>), the prediction result will be saved as '.mat' format by default. We also provide a tool to convert this `.mat` to more readable `.json` format. +During training and inference for [MPII](http://human-pose.mpi-inf.mpg.de/), the prediction result will be saved as '.mat' format by default. We also provide a tool to convert this `.mat` to more readable `.json` format. ```shell python tools/dataset_converters/mat2json ${PRED_MAT_FILE} ${GT_JSON_FILE} ${OUTPUT_PRED_JSON_FILE} diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md new file mode 100644 index 0000000000..cd39477fc3 --- /dev/null +++ b/docs/en/guide_to_framework.md @@ -0,0 +1,638 @@ +# A 20 Minute Guide to MMPose Framework + +MMPose 1.0 is built upon a brand-new framework. For developers with basic knowledge of deep learning, this tutorial provides an overview of the MMPose 1.0 framework design. Whether you are **a user of the previous version of MMPose**, or **a beginner of MMPose wishing to start with v1.0**, this tutorial will show you how to build a project based on MMPose 1.0. + +```{note} +This tutorial covers what developers will be concerned with when using MMPose 1.0: + +- Overall code architecture + +- How to manage modules with configs + +- How to use my own custom datasets + +- How to add new modules (backbone, head, loss function, etc.) +``` + +The content of this tutorial is organized as follows: + +- [A 20 Minute Guide to MMPose Framework](#a-20-minute-guide-to-mmpose-framework) + - [Overview](#overview) + - [Step1: Configs](#step1-configs) + - [Step2: Data](#step2-data) + - [Dataset Meta Information](#dataset-meta-information) + - [Dataset](#dataset) + - [Pipeline](#pipeline) + - [i. Augmentation](#i-augmentation) + - [ii. Transformation](#ii-transformation) + - [iii. Encoding](#iii-encoding) + - [iv. 
Packing](#iv-packing) + - [Step3: Model](#step3-model) + - [Data Preprocessor](#data-preprocessor) + - [Backbone](#backbone) + - [Neck](#neck) + - [Head](#head) + +## Overview + +![overall-en](https://user-images.githubusercontent.com/13503330/187372008-2a94bad5-5252-4155-9ae3-3da1c426f569.png) + +Generally speaking, there are **five parts** developers will use during project development: + +- **General:** Environment, Hook, Checkpoint, Logger, etc. + +- **Data:** Dataset, Dataloader, Data Augmentation, etc. + +- **Training:** Optimizer, Learning Rate Scheduler, etc. + +- **Model:** Backbone, Neck, Head, Loss function, etc. + +- **Evaluation:** Metric, Evaluator, etc. + +Among them, modules related to **General**, **Training** and **Evaluation** are often provided by the training framework [MMEngine](https://github.com/open-mmlab/mmengine), and developers only need to call APIs and adjust the parameters. Developers mainly focus on implementing the **Data** and **Model** parts. + +## Step1: Configs + +In MMPose, we use a Python file as config for the definition and parameter management of the whole project. Therefore, we strongly recommend that developers who use MMPose for the first time refer to [Configs](./user_guides/configs.md). + +Note that all new modules need to be registered using `Registry` and imported in `__init__.py` in the corresponding directory before we can create their instances from configs. + +## Step2: Data + +The organization of data in MMPose contains: + +- Dataset Meta Information + +- Dataset + +- Pipeline + +### Dataset Meta Information + +The meta information of a pose dataset usually includes the definition of keypoints and skeleton, symmetrical characteristic, and keypoint properties (e.g. belonging to upper or lower body, weights and sigmas). This information is important in data preprocessing, model training and evaluation. In MMPose, the dataset meta information is stored in config files under `$MMPOSE/configs/_base_/datasets/`. 
+ +To use a custom dataset in MMPose, you need to add a new config file of the dataset meta information. Take the MPII dataset (`$MMPOSE/configs/_base_/datasets/mpii.py`) as an example. Here is its dataset information: + +```Python +dataset_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + ## omitted + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + ## omitted + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) +``` + +In the model config, the user needs to specify the metainfo path of the custom dataset (e.g. 
`$MMPOSE/configs/_base_/datasets/custom.py`) as follows: + +```python +# dataset and dataloader settings +dataset_type = 'MyCustomDataset' # or 'CocoDataset' + +train_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/train/data', + ann_file='path/to/your/train/json', + data_prefix=dict(img='path/to/your/train/img'), + # specify the new dataset meta information config file + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) + +val_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/val/data', + ann_file='path/to/your/val/json', + data_prefix=dict(img='path/to/your/val/img'), + # specify the new dataset meta information config file + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) + +test_dataloader = val_dataloader +``` + +### Dataset + +To use a custom dataset in MMPose, we recommend converting the annotations into a supported format (e.g. COCO or MPII) and directly using our implementation of the corresponding dataset. If this is not applicable, you may need to implement your own dataset class. + +Most 2D keypoint datasets in MMPose **organize the annotations in a COCO-like style**. Thus we provide a base class [BaseCocoStyleDataset](mmpose/datasets/datasets/base/base_coco_style_dataset.py) for these datasets. We recommend that users subclass `BaseCocoStyleDataset` and override the methods as needed (usually `__init__()` and `_load_annotations()`) to extend to a new custom 2D keypoint dataset. + +```{note} +Please refer to [COCO](./dataset_zoo/2d_body_keypoint.md) for more details about the COCO data format. +``` + +```{note} +The bbox format in MMPose is in `xyxy` instead of `xywh`, which is consistent with the format used in other OpenMMLab projects like [MMDetection](https://github.com/open-mmlab/mmdetection). 
We provide useful utils for bbox format conversion, such as `bbox_xyxy2xywh`, `bbox_xywh2xyxy`, `bbox_xyxy2cs`, etc., which are defined in `$MMPOSE/mmpose/structures/bbox/transforms.py`. +``` + +Let's take the implementation of the MPII dataset (`$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py`) as an example. + +```Python +@DATASETS.register_module() +class MpiiDataset(BaseCocoStyleDataset): + METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii.py') + + def __init__(self, + ## omitted + headbox_file: Optional[str] = None, + ## omitted + ): + + if headbox_file: + if data_mode != 'topdown': + raise ValueError( + f'{self.__class__.__name__} is set to {data_mode}: ' + 'mode, while "headbox_file" is only ' + 'supported in topdown mode.') + + if not test_mode: + raise ValueError( + f'{self.__class__.__name__} has `test_mode==False` ' + 'while "headbox_file" is only ' + 'supported when `test_mode==True`.') + + headbox_file_type = headbox_file[-3:] + allow_headbox_file_type = ['mat'] + if headbox_file_type not in allow_headbox_file_type: + raise KeyError( + f'The head boxes file type {headbox_file_type} is not ' + f'supported. Should be `mat` but got {headbox_file_type}.') + self.headbox_file = headbox_file + + super().__init__( + ## omitted + ) + + def _load_annotations(self) -> List[dict]: + """Load data from annotations in MPII format.""" + check_file_exist(self.ann_file) + with open(self.ann_file) as anno_file: + anns = json.load(anno_file) + + if self.headbox_file: + check_file_exist(self.headbox_file) + headbox_dict = loadmat(self.headbox_file) + headboxes_src = np.transpose(headbox_dict['headboxes_src'], + [2, 0, 1]) + SC_BIAS = 0.6 + + data_list = [] + ann_id = 0 + + # mpii bbox scales are normalized with factor 200. + pixel_std = 200. 
+ + for idx, ann in enumerate(anns): + center = np.array(ann['center'], dtype=np.float32) + scale = np.array([ann['scale'], ann['scale']], + dtype=np.float32) * pixel_std + + # Adjust center/scale slightly to avoid cropping limbs + if center[0] != -1: + center[1] = center[1] + 15. / pixel_std * scale[1] + + # MPII uses matlab format, index is 1-based, + # we should first convert to 0-based index + center = center - 1 + + # unify shape with coco datasets + center = center.reshape(1, -1) + scale = scale.reshape(1, -1) + bbox = bbox_cs2xyxy(center, scale) + + # load keypoints in shape [1, K, 2] and keypoints_visible in [1, K] + keypoints = np.array(ann['joints']).reshape(1, -1, 2) + keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) + + data_info = { + 'id': ann_id, + 'img_id': int(ann['image'].split('.')[0]), + 'img_path': osp.join(self.data_prefix['img'], ann['image']), + 'bbox_center': center, + 'bbox_scale': scale, + 'bbox': bbox, + 'bbox_score': np.ones(1, dtype=np.float32), + 'keypoints': keypoints, + 'keypoints_visible': keypoints_visible, + } + + if self.headbox_file: + # calculate the diagonal length of head box as norm_factor + headbox = headboxes_src[idx] + head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) + head_size *= SC_BIAS + data_info['head_size'] = head_size.reshape(1, -1) + + data_list.append(data_info) + ann_id = ann_id + 1 + + return data_list +``` + +When supporting MPII dataset, since we need to use `head_size` to calculate `PCKh`, we add `headbox_file` to `__init__()` and override`_load_annotations()`. + +To support a dataset that is beyond the scope of `BaseCocoStyleDataset`, you may need to subclass from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to the [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. + +### Pipeline + +Data augmentations and transformations during pre-processing are organized as a pipeline. 
Here is an example of typical pipelines: + +```Python +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +``` + +In a keypoint detection task, data will be transformed among three scale spaces: + +- **Original Image Space**: the space where the images are stored. The sizes of different images are not necessarily the same + +- **Input Image Space**: the image space used for model input. All **images** and **annotations** will be transformed into this space, such as `256x256`, `256x192`, etc. + +- **Output Space**: the scale space where model outputs are located, such as `64x64(Heatmap)`,`1x1(Regression)`, etc. The supervision signal is also in this space during training + +Here is a diagram to show the workflow of data transformation among the three scale spaces: + +![migration-en](https://user-images.githubusercontent.com/13503330/187190213-cad87b5f-0a95-4f1f-b722-15896914ded4.png) + +In MMPose, the modules used for data transformation are under `$MMPOSE/mmpose/datasets/transforms`, and their workflow is shown as follows: + +![transforms-en](https://user-images.githubusercontent.com/13503330/187190352-a7662346-b8da-4256-9192-c7a84b15cbb5.png) + +#### i. Augmentation + +Commonly used transforms are defined in `$MMPOSE/mmpose/datasets/transforms/common_transforms.py`, such as `RandomFlip`, `RandomHalfBody`, etc. 
+ +For top-down methods, `Shift`, `Rotate` and `Resize` are implemented by `RandomBBoxTransform`. For bottom-up methods, `BottomupRandomAffine` is used. + +```{note} +Most data transforms depend on `bbox_center` and `bbox_scale`, which can be obtained by `GetBBoxCenterScale`. +``` + +#### ii. Transformation + +Affine transformation is used to convert images and annotations from the original image space to the input space. This is done by `TopdownAffine` for top-down methods and `BottomupRandomAffine` for bottom-up methods. + +#### iii. Encoding + +In the training phase, after the data is transformed from the original image space into the input space, it is necessary to use `GenerateTarget` to obtain the training target (e.g. Gaussian Heatmaps). We name this process **Encoding**. Conversely, the process of getting the corresponding coordinates from Gaussian Heatmaps is called **Decoding**. + +In MMPose, we collect Encoding and Decoding processes into a **Codec**, in which `encode()` and `decode()` are implemented. + +Currently we support the following types of Targets. + +- `heatmap`: Gaussian heatmaps +- `keypoint_label`: keypoint representation (e.g. normalized coordinates) +- `keypoint_xy_label`: axis-wise keypoint representation +- `heatmap+keypoint_label`: Gaussian heatmaps and keypoint representation +- `multiscale_heatmap`: multi-scale Gaussian heatmaps + +and the generated targets will be packed as follows. + +- `heatmaps`: Gaussian heatmaps +- `keypoint_labels`: keypoint representation (e.g. 
normalized coordinates) +- `keypoint_x_labels`: keypoint x-axis representation +- `keypoint_y_labels`: keypoint y-axis representation +- `keypoint_weights`: keypoint visibility and weights + +Note that we unify the data format of top-down and bottom-up methods, which means that a new dimension is added to represent different instances from the same image, in shape: + +```Python +[batch_size, num_instances, num_keypoints, dim_coordinates] +``` + +- top-down: `[B, 1, K, D]` + +- Bottom-up: `[B, N, K, D]` + +The provided codecs are stored under `$MMPOSE/mmpose/codecs`. + +```{note} +If you wish to customize a new codec, you can refer to [Codec](./user_guides/codecs.md) for more details. +``` + +#### iv. Packing + +After the data is transformed, you need to pack it using `PackPoseInputs`. + +This method converts the data stored in the dictionary `results` into standard data structures in MMPose, such as `InstanceData`, `PixelData`, `PoseDataSample`, etc. + +Specifically, we divide the data into `gt` (ground-truth) and `pred` (prediction), each of which has the following types: + +- **instances**(numpy.array): instance-level raw annotations or predictions in the original scale space +- **instance_labels**(torch.tensor): instance-level training labels (e.g. normalized coordinates, keypoint visibility) in the output scale space +- **fields**(torch.tensor): pixel-level training labels or predictions (e.g. 
Gaussian Heatmaps) in the output scale space + +The following is an example of how a `PoseDataSample` is constructed under the hood: + +```Python +def get_pose_data_sample(self): + # meta + pose_meta = dict( + img_shape=(600, 900), # [h, w] + crop_size=(256, 192), # [h, w] + heatmap_size=(64, 48), # [h, w] + ) + + # gt_instances + gt_instances = InstanceData() + gt_instances.bboxes = np.random.rand(1, 4) + gt_instances.keypoints = np.random.rand(1, 17, 2) + + # gt_instance_labels + gt_instance_labels = InstanceData() + gt_instance_labels.keypoint_labels = torch.rand(1, 17, 2) + gt_instance_labels.keypoint_weights = torch.rand(1, 17) + + # pred_instances + pred_instances = InstanceData() + pred_instances.keypoints = np.random.rand(1, 17, 2) + pred_instances.keypoint_scores = np.random.rand(1, 17) + + # gt_fields + gt_fields = PixelData() + gt_fields.heatmaps = torch.rand(17, 64, 48) + + # pred_fields + pred_fields = PixelData() + pred_fields.heatmaps = torch.rand(17, 64, 48) + data_sample = PoseDataSample( + gt_instances=gt_instances, + pred_instances=pred_instances, + gt_fields=gt_fields, + pred_fields=pred_fields, + metainfo=pose_meta) + + return data_sample +``` + +## Step3: Model + +In MMPose 1.0, the model consists of the following components: + +- **Data Preprocessor**: perform data normalization and channel transposition + +- **Backbone**: used for feature extraction + +- **Neck**: optional, e.g. GAP, FPN, etc. + +- **Head**: used to implement the core algorithm and loss function + +We define a base class `BasePoseEstimator` for the model in `$MMPOSE/mmpose/models/pose_estimators/base.py`. All models, e.g. `TopdownPoseEstimator`, should inherit from this base class and override the corresponding methods. 
+ +Three modes are provided in `forward()` of the estimator: + +- `mode == 'loss'`: return the result of loss function for model training + +- `mode == 'predict'`: return the prediction result in the input space, used for model inference + +- `mode == 'tensor'`: return the model output in the output space, i.e. model forward propagation only, for model export + +Developers should build the components by calling the corresponding registry. Taking the top-down model as an example: + +```Python +@MODELS.register_module() +class TopdownPoseEstimator(BasePoseEstimator): + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + self.backbone = MODELS.build(backbone) + + if neck is not None: + self.neck = MODELS.build(neck) + + if head is not None: + self.head = MODELS.build(head) +``` + +### Data Preprocessor + +Starting from MMPose 1.0, we have added a new module to the model called data preprocessor, which performs data preprocessings like image normalization and channel transposition. It can benefit from the high computing power of devices like GPU, and improve the integrity in model export and deployment. + +A typical `data_preprocessor` in the config is as follows: + +```Python +data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +``` + +It will transpose the channel order of the input image from `bgr` to `rgb` and normalize the data according to `mean` and `std`. + +### Backbone + +MMPose provides some commonly used backbones under `$MMPOSE/mmpose/models/backbones`. + +In practice, developers often use pre-trained backbone weights for transfer learning, which can improve the performance of the model on small datasets. 
+ +In MMPose, you can use the pre-trained weights by setting `init_cfg` in config: + +```Python +init_cfg=dict( + type='Pretrained', + checkpoint='PATH/TO/YOUR_MODEL_WEIGHTS.pth'), +``` + +If you want to load a checkpoint to your backbone, you should specify the `prefix`: + +```Python +init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='PATH/TO/YOUR_CHECKPOINT.pth'), +``` + +`checkpoint` can be either a local path or a download link. Thus, if you wish to use a pre-trained model provided by Torchvision (e.g. ResNet50), you can simply use: + +```Python +init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50') +``` + +In addition to these commonly used backbones, you can easily use backbones from other repositories in the OpenMMLab family such as MMClassification, which all share the same config system and provide pre-trained weights. + +It should be emphasized that if you add a new backbone, you need to register it by doing: + +```Python +@MODELS.register_module() +class YourBackbone(BaseBackbone): +``` + +Besides, import it in `$MMPOSE/mmpose/models/backbones/__init__.py`, and add it to `__all__`. + +### Neck + +Neck is usually a module between Backbone and Head, which is used in some algorithms. Here are some commonly used Necks: + +- Global Average Pooling (GAP) + +- Feature Pyramid Networks (FPN) + +### Head + +Generally speaking, Head is often the core of an algorithm, which is used to make predictions and perform loss calculation. + +Modules related to Head in MMPose are defined under `$MMPOSE/mmpose/models/heads`, and developers need to inherit the base class `BaseHead` when customizing Head and override the following methods: + +- forward() + +- predict() + +- loss() + +Specifically, the `predict()` method needs to return pose predictions in the image space, which are obtained from the model output through the decoding function provided by the codec. We implement this process in `BaseHead.decode()`. 
+ +On the other hand, we will perform test-time augmentation (TTA) in `predict()`. + +A commonly used TTA is `flip_test`, namely, an image and its flipped version are both fed into the model for inference; the output of the flipped version is flipped back, and the two outputs are then averaged to stabilize the prediction. + +Here is an example of `predict()` in `RegressionHead`: + +```Python +def predict(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from outputs.""" + + if test_cfg.get('flip_test', False): + # TTA: flip test -> feats = [orig, flipped] + assert isinstance(feats, list) and len(feats) == 2 + flip_indices = batch_data_samples[0].metainfo['flip_indices'] + input_size = batch_data_samples[0].metainfo['input_size'] + _feats, _feats_flip = feats + _batch_coords = self.forward(_feats) + _batch_coords_flip = flip_coordinates( + self.forward(_feats_flip), + flip_indices=flip_indices, + shift_coords=test_cfg.get('shift_coords', True), + input_size=input_size) + batch_coords = (_batch_coords + _batch_coords_flip) * 0.5 + else: + batch_coords = self.forward(feats) # (B, K, D) + + batch_coords.unsqueeze_(dim=1) # (B, N, K, D) + preds = self.decode(batch_coords) +``` + +The `loss()` not only performs the calculation of loss functions, but also the calculation of training-time metrics such as pose accuracy. The results are carried by a dictionary `losses`: + +```Python + # calculate accuracy +_, avg_acc, _ = keypoint_pck_accuracy( + pred=to_numpy(pred_coords), + gt=to_numpy(keypoint_labels), + mask=to_numpy(keypoint_weights) > 0, + thr=0.05, + norm_factor=np.ones((pred_coords.size(0), 2), dtype=np.float32)) + +acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) +losses.update(acc_pose=acc_pose) +``` + +The data of each batch is packaged into `batch_data_samples`. 
Taking the Regression-based method as an example, the normalized coordinates and keypoint weights can be obtained as follows: + +```Python +keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) +keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples +]) +``` + +Here is the complete implementation of `loss()` in `RegressionHead`: + +```Python +def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, keypoint_labels, + keypoint_weights.unsqueeze(-1)) + + if isinstance(loss, dict): + losses.update(loss) + else: + losses.update(loss_kpt=loss) + + # calculate accuracy + _, avg_acc, _ = keypoint_pck_accuracy( + pred=to_numpy(pred_outputs), + gt=to_numpy(keypoint_labels), + mask=to_numpy(keypoint_weights) > 0, + thr=0.05, + norm_factor=np.ones((pred_outputs.size(0), 2), dtype=np.float32)) + acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) + losses.update(acc_pose=acc_pose) + + return losses +``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 83d02f9b5e..47f25a8d2e 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -12,6 +12,7 @@ You can change the documentation language at the lower-left corner of the page. overview.md installation.md quick_run.md + guide_to_framework.md .. toctree:: :maxdepth: 1 @@ -23,14 +24,14 @@ You can change the documentation language at the lower-left corner of the page. 
user_guides/inference.md user_guides/train_and_test.md user_guides/visualization.md - user_guides/useful_tools.md - + user_guides/how_to.md .. toctree:: :maxdepth: 1 :caption: Advanced Guides - advanced_guides.md + advanced_guides/advanced_training.md + advanced_guides/mixed_datasets.md .. toctree:: :maxdepth: 1 @@ -71,6 +72,7 @@ You can change the documentation language at the lower-left corner of the page. dataset_zoo/2d_animal_keypoint.md dataset_zoo/3d_body_keypoint.md dataset_zoo/3d_hand_keypoint.md + dataset_zoo/dataset_tools.md .. toctree:: :maxdepth: 1 diff --git a/docs/en/installation.md b/docs/en/installation.md index 0b6ad73ab4..dc70953a95 100644 --- a/docs/en/installation.md +++ b/docs/en/installation.md @@ -20,7 +20,7 @@ In this section we demonstrate how to prepare an environment with PyTorch. -MMPose works on Linux, Windows and macOS. It requires Python 3.6+, CUDA 9.2+ and PyTorch 1.6+. +MMPose works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 9.2+ and PyTorch 1.6+. If you are experienced with PyTorch and have already installed it, you can skip this part and jump to the [MMPose Installation](#install-mmpose). Otherwise, you can follow these steps for the preparation. @@ -62,13 +62,13 @@ We recommend that users follow our best practices to install MMPose. However, th ```shell pip install -U openmim mim install mmengine -mim install "mmcv>=2.0.0rc1" +mim install "mmcv>=2.0.0rc4" ``` Note that some of the demo scripts in MMPose require [MMDetection](https://github.com/open-mmlab/mmdetection) (mmdet) for human detection. If you want to run these demo scripts with mmdet, you can easily install mmdet as a dependency by running: ```shell -mim install "mmdet>=3.0.0rc0" +mim install "mmdet>=3.0.0rc6" ``` **Step 1.** Install MMPose. @@ -89,7 +89,7 @@ pip install -v -e . 
Case B: To use mmpose as a dependency or third-party package, install it with pip: ```shell -mim install "mmpose>=1.0.0b0" +mim install "mmpose>=1.0.0rc1" ``` ### Verify the installation @@ -117,12 +117,10 @@ python demo/image_demo.py \ --draw-heatmap ``` -If everything goes fine, you will get this visualization result: +If everything goes fine, you will be able to get the following visualization result from `vis_results.jpg` in your current folder, which displays the predicted keypoints and heatmaps overlaid on the person in the image. ![image](https://user-images.githubusercontent.com/87690686/187824033-2cce0f55-034a-4127-82e2-52744178bc32.jpg) -And the visualization result will be saved as `vis_results.jpg` on your current folder, where the predicted keypoints and heatmaps are plotted on the person in the image. - Option (B). If you install mmpose with pip, open you python interpreter and copy & paste the following codes. ```python @@ -210,7 +208,7 @@ thus we only need to install MMEngine, MMCV and MMPose with the following comman ```python import mmpose print(mmpose.__version__) -# Example output: 1.0.0b0 +# Example output: 1.0.0rc0 ``` ```{note} diff --git a/docs/en/migration.md b/docs/en/migration.md index e639f031cd..5795bbe571 100644 --- a/docs/en/migration.md +++ b/docs/en/migration.md @@ -1,628 +1,12 @@ -# Migration - -MMPose 1.0 has made significant BC-breaking changes, with modules redesigned and reorganized to reduce code redundancy and improve efficiency. For developers who have some deep-learning knowledge, this tutorial provides a migration guide. - -Whether you are **a user of the previous version of MMPose**, or **a new user wishing to migrate your Pytorch project to MMPose**, you can learn how to build a project based on MMPose 1.0 with this tutorial. 
- -```{note} -This tutorial covers what developers will concern when using MMPose 1.0: - -- Overall code architecture - -- How to manage modules with configs - -- How to use my own custom datasets - -- How to add new modules(backbone, head, loss function, etc.) -``` - -The content of this tutorial is organized as follows: - -- [Migration](#migration) - - [Overall Code Architecture](#overall-code-architecture) - - [Step1: Configs](#step1-configs) - - [Step2: Data](#step2-data) - - [Dataset Meta Information](#dataset-meta-information) - - [Dataset](#dataset) - - [Pipeline](#pipeline) - - [i. Augmentation](#i-augmentation) - - [ii. Transformation](#ii-transformation) - - [iii. Encoding](#iii-encoding) - - [iv. Packing](#iv-packing) - - [Step3: Model](#step3-model) - - [Data Preprocessor](#data-preprocessor) - - [Backbone](#backbone) - - [Neck](#neck) - - [Head](#head) - - [Compatibility of MMPose 0.X](#compatibility-of-mmpose-0x) - - [Data Transformation](#data-transformation) - - [Translation, Rotation and Scaling](#translation-rotation-and-scaling) - - [Target Generation](#target-generation) - - [Data Normalization](#data-normalization) - - [Compatibility of Models](#compatibility-of-models) - - [Heatmap-based Model](#heatmap-based-model) - - [RLE-based Model](#rle-based-model) - -## Overall Code Architecture - -![overall-en](https://user-images.githubusercontent.com/13503330/187372008-2a94bad5-5252-4155-9ae3-3da1c426f569.png) - -Generally speaking, there are **five parts** developers will use during project development: - -- **General:** Environment, Hook, Checkpoint, Logger, etc. - -- **Data:** Dataset, Dataloader, Data Augmentation, etc. - -- **Training:** Optimizer, Learning Rate Scheduler, etc. - -- **Model:** Backbone, Neck, Head, Loss function, etc. - -- **Evaluation:** Metric, Evaluator, etc. 
- -Among them, modules related to **General**, **Training** and **Evaluation** are often provided by the training framework [MMEngine](https://github.com/open-mmlab/mmengine), and developers only need to call APIs and adjust the parameters. Developers mainly focus on implementing the **Data** and **Model** parts. - -## Step1: Configs - -In MMPose, we use a Python file as config for the definition and parameter management of the whole project. Therefore, we strongly recommend the developers who use MMPose for the first time to refer to [Configs](./user_guides/configs.md). - -Note that all new modules need to be registered using `Registry` and imported in `__init__.py` in the corresponding directory before we can create their instances from configs. - -## Step2: Data - -The organization of data in MMPose contains: - -- Dataset Meta Information - -- Dataset - -- Pipeline - -### Dataset Meta Information - -The meta information of a pose dataset usually includes the definition of keypoints and skeleton, symmetrical characteristic, and keypoint properties (e.g. belonging to upper or lower body, weights and sigmas). These information is important in data preprocessing, model training and evaluation. In MMpose, the dataset meta information is stored in configs files under `$MMPOSE/configs/_base_/datasets/`. - -To use a custom dataset in MMPose, you need to add a new config file of the dataset meta information. Take the MPII dataset (`$MMPOSE/configs/_base_/datasets/mpii.py`) as an example. 
Here is its dataset information: - -```Python -dataset_info = dict( - dataset_name='mpii', - paper_info=dict( - author='Mykhaylo Andriluka and Leonid Pishchulin and ' - 'Peter Gehler and Schiele, Bernt', - title='2D Human Pose Estimation: New Benchmark and ' - 'State of the Art Analysis', - container='IEEE Conference on Computer Vision and ' - 'Pattern Recognition (CVPR)', - year='2014', - homepage='http://human-pose.mpi-inf.mpg.de/', - ), - keypoint_info={ - 0: - dict( - name='right_ankle', - id=0, - color=[255, 128, 0], - type='lower', - swap='left_ankle'), - ## omitted - }, - skeleton_info={ - 0: - dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), - ## omitted - }, - joint_weights=[ - 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 - ], - # Adapted from COCO dataset. - sigmas=[ - 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, - 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 - ]) -``` - -### Dataset - -To use custom dataset in MMPose, we recommend converting the annotations into a supported format (e.g. COCO or MPII) and directly using our implementation of the corresponding dataset. If this is not applicable, you may need to implement your own dataset class. - -Most 2D keypoint datasets in MMPose **organize the annotations in a COCO-like style**. Thus we provide a base class [BaseCocoStyleDataset](mmpose/datasets/datasets/base/base_coco_style_dataset.py) for these datasets. We recommend that users subclass `BaseCocoStyleDataset` and override the methods as needed (usually `__init__()` and `_load_annotations()`) to extend to a new custom 2D keypoint dataset. - -```{note} -Please refer to [COCO](./dataset_zoo/2d_body_keypoint.md) for more details about the COCO data format. -``` - -```{note} -The bbox format in MMPose is in `xyxy` instead of `xywh`, which is consistent with the format used in other OpenMMLab projects like [MMDetection](https://github.com/open-mmlab/mmdetection). 
We provide useful utils for bbox format conversion, such as `bbox_xyxy2xywh`, `bbox_xywh2xyxy`, `bbox_xyxy2cs`, etc., which are defined in `$MMPOSE/mmpose/structures/bbox/transforms.py`. -``` - -Let's take the implementation of the MPII dataset (`$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py`) as an example. - -```Python -@DATASETS.register_module() -class MpiiDataset(BaseCocoStyleDataset): - METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii.py') - - def __init__(self, - ## omitted - headbox_file: Optional[str] = None, - ## omitted - ): - - if headbox_file: - if data_mode != 'topdown': - raise ValueError( - f'{self.__class__.__name__} is set to {data_mode}: ' - 'mode, while "headbox_file" is only ' - 'supported in topdown mode.') - - if not test_mode: - raise ValueError( - f'{self.__class__.__name__} has `test_mode==False` ' - 'while "headbox_file" is only ' - 'supported when `test_mode==True`.') - - headbox_file_type = headbox_file[-3:] - allow_headbox_file_type = ['mat'] - if headbox_file_type not in allow_headbox_file_type: - raise KeyError( - f'The head boxes file type {headbox_file_type} is not ' - f'supported. Should be `mat` but got {headbox_file_type}.') - self.headbox_file = headbox_file - - super().__init__( - ## omitted - ) - - def _load_annotations(self) -> List[dict]: - """Load data from annotations in MPII format.""" - check_file_exist(self.ann_file) - with open(self.ann_file) as anno_file: - anns = json.load(anno_file) - - if self.headbox_file: - check_file_exist(self.headbox_file) - headbox_dict = loadmat(self.headbox_file) - headboxes_src = np.transpose(headbox_dict['headboxes_src'], - [2, 0, 1]) - SC_BIAS = 0.6 - - data_list = [] - ann_id = 0 - - # mpii bbox scales are normalized with factor 200. - pixel_std = 200. 
- - for idx, ann in enumerate(anns): - center = np.array(ann['center'], dtype=np.float32) - scale = np.array([ann['scale'], ann['scale']], - dtype=np.float32) * pixel_std - - # Adjust center/scale slightly to avoid cropping limbs - if center[0] != -1: - center[1] = center[1] + 15. / pixel_std * scale[1] - - # MPII uses matlab format, index is 1-based, - # we should first convert to 0-based index - center = center - 1 - - # unify shape with coco datasets - center = center.reshape(1, -1) - scale = scale.reshape(1, -1) - bbox = bbox_cs2xyxy(center, scale) - - # load keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - keypoints = np.array(ann['joints']).reshape(1, -1, 2) - keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) - - data_info = { - 'id': ann_id, - 'img_id': int(ann['image'].split('.')[0]), - 'img_path': osp.join(self.data_prefix['img'], ann['image']), - 'bbox_center': center, - 'bbox_scale': scale, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - } - - if self.headbox_file: - # calculate the diagonal length of head box as norm_factor - headbox = headboxes_src[idx] - head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) - head_size *= SC_BIAS - data_info['head_size'] = head_size.reshape(1, -1) - - data_list.append(data_info) - ann_id = ann_id + 1 - - return data_list -``` - -When supporting MPII dataset, since we need to use `head_size` to calculate `PCKh`, we add `headbox_file` to `__init__()` and override`_load_annotations()`. - -To support a dataset that is beyond the scope of `BaseCocoStyleDataset`, you may need to subclass from the `BaseDataset` provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to the [documents](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) for details. - -### Pipeline - -Data augmentations and transformations during pre-processing are organized as a pipeline. 
Here is an example of typical pipelines: - -```Python -# pipelines -train_pipeline = [ - dict(type='LoadImage', file_client_args=file_client_args), - dict(type='GetBBoxCenterScale'), - dict(type='RandomFlip', direction='horizontal'), - dict(type='RandomHalfBody'), - dict(type='RandomBBoxTransform'), - dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), - dict(type='PackPoseInputs') -] -test_pipeline = [ - dict(type='LoadImage', file_client_args=file_client_args), - dict(type='GetBBoxCenterScale'), - dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='PackPoseInputs') -] -``` - -In a keypoint detection task, data will be transformed among three scale spaces: - -- **Original Image Space**: the space where the images are stored. The sizes of different images are not necessarily the same - -- **Input Image Space**: the image space used for model input. All **images** and **annotations** will be transformed into this space, such as `256x256`, `256x192`, etc. - -- **Output Space**: the scale space where model outputs are located, such as `64x64(Heatmap)`,`1x1(Regression)`, etc. The supervision signal is also in this space during training - -Here is a diagram to show the workflow of data transformation among the three scale spaces: - -![migration-en](https://user-images.githubusercontent.com/13503330/187190213-cad87b5f-0a95-4f1f-b722-15896914ded4.png) - -In MMPose, the modules used for data transformation are under `$MMPOSE/mmpose/datasets/transforms`, and their workflow is shown as follows: - -![transforms-en](https://user-images.githubusercontent.com/13503330/187190352-a7662346-b8da-4256-9192-c7a84b15cbb5.png) - -#### i. Augmentation - -Commonly used transforms are defined in `$MMPOSE/mmpose/datasets/transforms/common_transforms.py`, such as `RandomFlip`, `RandomHalfBody`, etc. 
- -For top-down methods, `Shift`, `Rotate`and `Resize` are implemented by `RandomBBoxTransform`**.** For bottom-up methods, `BottomupRandomAffine` is used. - -```{note} -Most data transforms depend on `bbox_center` and `bbox_scale`, which can be obtained by `GetBBoxCenterScale`. -``` - -#### ii. Transformation - -Affine transformation is used to convert images and annotations from the original image space to the input space. This is done by `TopdownAffine` for top-down methods and `BottomupRandomAffine` for bottom-up methods. - -#### iii. Encoding - -In training phase, after the data is transformed from the original image space into the input space, it is necessary to use `GenerateTarget` to obtain the training target(e.g. Gaussian Heatmaps). We name this process **Encoding**. Conversely, the process of getting the corresponding coordinates from Gaussian Heatmaps is called **Decoding**. - -In MMPose, we collect Encoding and Decoding processes into a **Codec**, in which `encode()` and `decode()` are implemented. - -Currently we support the following types of Targets. - -- `heatmap`: Gaussian heatmaps -- `keypoint_label`: keypoint representation (e.g. normalized coordinates) -- `keypoint_xy_label`: axis-wise keypoint representation -- `heatmap+keypoint_label`: Gaussian heatmaps and keypoint representation -- `multiscale_heatmap`: multi-scale Gaussian heatmaps - -and the generated targets will be packed as follows. - -- `heatmaps`: Gaussian heatmaps -- `keypoint_labels`: keypoint representation (e.g. 
normalized coordinates) -- `keypoint_x_labels`: keypoint x-axis representation -- `keypoint_y_labels`: keypoint y-axis representation -- `keypoint_weights`: keypoint visibility and weights - -Note that we unify the data format of top-down and bottom-up methods, which means that a new dimension is added to represent different instances from the same image, in shape: - -```Python -[batch_size, num_instances, num_keypoints, dim_coordinates] -``` - -- top-down: `[B, 1, K, D]` - -- Bottom-up: `[B, N, K, D]` - -The provided codecs are stored under `$MMPOSE/mmpose/codecs`. - -```{note} -If you wish to customize a new codec, you can refer to [Codec](./user_guides/codecs.md) for more details. -``` - -#### iv. Packing - -After the data is transformed, you need to pack it using `PackPoseInputs`. - -This method converts the data stored in the dictionary `results` into standard data structures in MMPose, such as `InstanceData`, `PixelData`, `PoseDataSample`, etc. - -Specifically, we divide the data into `gt` (ground-truth) and `pred` (prediction), each of which has the following types: - -- **instances**(numpy.array): instance-level raw annotations or predictions in the original scale space -- **instance_labels**(torch.tensor): instance-level training labels (e.g. normalized coordinates, keypoint visibility) in the output scale space -- **fields**(torch.tensor): pixel-level training labels or predictions (e.g. 
Gaussian Heatmaps) in the output scale space - -The following is an example of the implementation of `PoseDataSample` under the hood: - -```Python -def get_pose_data_sample(self): - # meta - pose_meta = dict( - img_shape=(600, 900), # [h, w, c] - crop_size=(256, 192), # [h, w] - heatmap_size=(64, 48), # [h, w] - ) - - # gt_instances - gt_instances = InstanceData() - gt_instances.bboxes = np.random.rand(1, 4) - gt_instances.keypoints = np.random.rand(1, 17, 2) - - # gt_instance_labels - gt_instance_labels = InstanceData() - gt_instance_labels.keypoint_labels = torch.rand(1, 17, 2) - gt_instance_labels.keypoint_weights = torch.rand(1, 17) - - # pred_instances - pred_instances = InstanceData() - pred_instances.keypoints = np.random.rand(1, 17, 2) - pred_instances.keypoint_scores = np.random.rand(1, 17) - - # gt_fields - gt_fields = PixelData() - gt_fields.heatmaps = torch.rand(17, 64, 48) - - # pred_fields - pred_fields = PixelData() - pred_fields.heatmaps = torch.rand(17, 64, 48) - data_sample = PoseDataSample( - gt_instances=gt_instances, - pred_instances=pred_instances, - gt_fields=gt_fields, - pred_fields=pred_fields, - metainfo=pose_meta) - - return data_sample -``` - -## Step3: Model - -In MMPose 1.0, the model consists of the following components: - -- **Data Preprocessor**: perform data normalization and channel transposition - -- **Backbone**: used for feature extraction - -- **Neck**: GAP,FPN, etc. are optional - -- **Head**: used to implement the core algorithm and loss function - -We define a base class `BasePoseEstimator` for the model in `$MMPOSE/models/pose_estimators/base.py`. All models, e.g. `TopdownPoseEstimator`, should inherit from this base class and override the corresponding methods. 
- -Three modes are provided in `forward()` of the estimator: - -- `mode == 'loss'`: return the result of loss function for model training - -- `mode == 'predict'`: return the prediction result in the input space, used for model inference - -- `mode == 'tensor'`: return the model output in the output space, i.e. model forward propagatin only, for model export - -Developers should build the components by calling the corresponding registry. Taking the top-down model as an example: - -```Python -@MODELS.register_module() -class TopdownPoseEstimator(BasePoseEstimator): - def __init__(self, - backbone: ConfigType, - neck: OptConfigType = None, - head: OptConfigType = None, - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None): - super().__init__(data_preprocessor, init_cfg) - - self.backbone = MODELS.build(backbone) - - if neck is not None: - self.neck = MODELS.build(neck) - - if head is not None: - self.head = MODELS.build(head) -``` - -### Data Preprocessor - -Starting from MMPose 1.0, we have added a new module to the model called data preprocessor, which performs data preprocessings like image normalization and channel transposition. It can benefit from the high computing power of devices like GPU, and improve the integrity in model export and deployment. - -A typical `data_preprocessor` in the config is as follows: - -```Python -data_preprocessor=dict( - type='PoseDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True), -``` - -It will transpose the channel order of the input image from `bgr` to `rgb` and normalize the data according to `mean` and `std`. - -### Backbone - -MMPose provides some commonly used backbones under `$MMPOSE/mmpose/models/backbones`. - -In practice, developers often use pre-trained backbone weights for transfer learning, which can improve the performance of the model on small datasets. 
- -In MMPose, you can use the pre-trained weights by setting `init_cfg` in config: - -```Python -init_cfg=dict( - type='Pretrained', - checkpoint='PATH/TO/YOUR_MODEL_WEIGHTS.pth'), -``` - -If you want to load a checkopoint to your backbone, you should specify the `prefix`: - -```Python -init_cfg=dict( - type='Pretrained', - prefix='backbone.', - checkpoint='PATH/TO/YOUR_CHECKPOINT.pth'), -``` - -`checkpoint` can be either a local path or a download link. Thus, if you wish to use a pre-trained model provided by Torchvision(e.g. ResNet50), you can simply use: - -```Python -init_cfg=dict( - type='Pretrained', - checkpoint='torchvision://resnet50') -``` - -In addition to these commonly used backbones, you can easily use backbones from other repositories in the OpenMMLab family such as MMClassification, which all share the same config system and provide pre-trained weights. - -It should be emphasized that if you add a new backbone, you need to register it by doing: - -```Python -@MODELS.register_module() -class YourBackbone(BaseBackbone): -``` - -Besides, import it in `$MMPOSE/mmpose/models/backbones/__init__.py`, and add it to `__all__`. - -### Neck - -Neck is usually a module between Backbone and Head, which is used in some algorithms. Here are some commonly used Neck: - -- Global Average Pooling(GAP) - -- Feature Pyramid Networks(FPN) - -### Head - -Generally speaking, Head is often the core of an algorithm, which is used to make predictions and perform loss calculation. - -Modules related to Head in MMPose are defined under `$MMPOSE/mmpose/models/heads`, and developers need to inherit the base class `BaseHead` when customizing Head and override the following methods: - -- forward() - -- predict() - -- loss() - -Specifically, `predict()` method needs to return pose predictions in the image space, which is obtained from the model output though the decoding function provided by the codec. We implement this process in `BaseHead.decode()`. 
- -On the other hand, we will perform test-time augmentation(TTA) in `predict()`. - -A commonly used TTA is `flip_test`, namely, an image and its flipped version are sent into the model to inference, and the output of the flipped version will be flipped back, then average them to stabilize the prediction. - -Here is an example of `predict()` in `RegressionHead`: - -```Python -def predict(self, - feats: Tuple[Tensor], - batch_data_samples: OptSampleList, - test_cfg: ConfigType = {}) -> Predictions: - """Predict results from outputs.""" - - if test_cfg.get('flip_test', False): - # TTA: flip test -> feats = [orig, flipped] - assert isinstance(feats, list) and len(feats) == 2 - flip_indices = batch_data_samples[0].metainfo['flip_indices'] - input_size = batch_data_samples[0].metainfo['input_size'] - _feats, _feats_flip = feats - _batch_coords = self.forward(_feats) - _batch_coords_flip = flip_coordinates( - self.forward(_feats_flip), - flip_indices=flip_indices, - shift_coords=test_cfg.get('shift_coords', True), - input_size=input_size) - batch_coords = (_batch_coords + _batch_coords_flip) * 0.5 - else: - batch_coords = self.forward(feats) # (B, K, D) - - batch_coords.unsqueeze_(dim=1) # (B, N, K, D) - preds = self.decode(batch_coords) -``` - -The `loss()` not only performs the calculation of loss functions, but also the calculation of training-time metrics such as pose accuracy. The results are carried by a dictionary `losses`: - -```Python - # calculate accuracy -_, avg_acc, _ = keypoint_pck_accuracy( - pred=to_numpy(pred_coords), - gt=to_numpy(keypoint_labels), - mask=to_numpy(keypoint_weights) > 0, - thr=0.05, - norm_factor=np.ones((pred_coords.size(0), 2), dtype=np.float32)) - -acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) -losses.update(acc_pose=acc_pose) -``` - -The data of each batch is packaged into `batch_data_samples`. 
Taking the Regression-based method as an example, the normalized coordinates and keypoint weights can be obtained as follows: - -```Python -keypoint_labels = torch.cat( - [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) -keypoint_weights = torch.cat([ - d.gt_instance_labels.keypoint_weights for d in batch_data_samples -]) -``` - -Here is the complete implementation of `loss()` in `RegressionHead`: - -```Python -def loss(self, - inputs: Tuple[Tensor], - batch_data_samples: OptSampleList, - train_cfg: ConfigType = {}) -> dict: - """Calculate losses from a batch of inputs and data samples.""" - - pred_outputs = self.forward(inputs) - - keypoint_labels = torch.cat( - [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) - keypoint_weights = torch.cat([ - d.gt_instance_labels.keypoint_weights for d in batch_data_samples - ]) - - # calculate losses - losses = dict() - loss = self.loss_module(pred_outputs, keypoint_labels, - keypoint_weights.unsqueeze(-1)) - - if isinstance(loss, dict): - losses.update(loss) - else: - losses.update(loss_kpt=loss) - - # calculate accuracy - _, avg_acc, _ = keypoint_pck_accuracy( - pred=to_numpy(pred_outputs), - gt=to_numpy(keypoint_labels), - mask=to_numpy(keypoint_weights) > 0, - thr=0.05, - norm_factor=np.ones((pred_outputs.size(0), 2), dtype=np.float32)) - acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) - losses.update(acc_pose=acc_pose) - - return losses -``` - -## Compatibility of MMPose 0.X +# Compatibility of MMPose 0.X MMPose 1.0 has been refactored extensively and addressed many legacy issues. Most of the code in MMPose 1.0 will not be compatible with 0.x version. 
To try our best to help you migrate your code and model, here are some major changes: -### Data Transformation +## Data Transformation -#### Translation, Rotation and Scaling +### Translation, Rotation and Scaling The transformation methods `TopDownRandomShiftBboxCenter` and `TopDownGetRandomScaleRotation` in old version, will be merged into `RandomBBoxTransform`. @@ -671,7 +55,7 @@ class RandomBBoxTransform(BaseTransform): rotate_prob: float = 0.6) -> None: ``` -#### Target Generation +### Target Generation The old methods like: @@ -696,45 +80,38 @@ class GenerateTarget(BaseTransform): - keypoints_visible - dataset_keypoint_weights - Added Keys (depends on the args): - - heatmaps - - keypoint_labels - - keypoint_x_labels - - keypoint_y_labels - - keypoint_weights + Added Keys: + + - The keys of the encoded items from the codec will be updated into + the results, e.g. ``'heatmaps'`` or ``'keypoint_weights'``. See + the specific codec for more details. Args: - encoder (dict | list[dict]): The codec config for keypoint encoding - target_type (str): The type of the encoded form of the keypoints. - Should be one of the following options: - - - ``'heatmap'``: The encoded should be instance-irrelevant - heatmaps and will be stored in ``results['heatmaps']`` - - ``'multiscale_heatmap'`` The encoded should be a list of - heatmaps and will be stored in ``results['heatmaps']``. Note - that in this case ``self.encoder`` is also a list, each - encoder for a single scale of heatmaps - - ``'keypoint_label'``: The encoded should be instance-level - labels and will be stored in ``results['keypoint_label']`` - - ``'keypoint_xy_label'``: The encoed should be instance-level - labels in x-axis and y-axis respectively. They will be stored - in ``results['keypoint_x_label']`` and - ``results['keypoint_y_label']`` + encoder (dict | list[dict]): The codec config for keypoint encoding. 
+ Both single encoder and multiple encoders (given as a list) are + supported + multilevel (bool): Determine the method to handle multiple encoders. + If ``multilevel==True``, generate multilevel targets from a group + of encoders of the same type (e.g. multiple :class:`MSRAHeatmap` + encoders with different sigma values); If ``multilevel==False``, + generate combined targets from a group of different encoders. This + argument will have no effect in case of single encoder. Defaults + to ``False`` use_dataset_keypoint_weights (bool): Whether use the keypoint weights from the dataset meta information. Defaults to ``False`` """ def __init__(self, encoder: MultiConfig, - target_type: str, + multilevel: bool = False, use_dataset_keypoint_weights: bool = False) -> None: ``` -#### Data Normalization +### Data Normalization The data normalization operations `NormalizeTensor` and `ToTensor` will be replaced by **DataPreprocessor** module, which will no longer be used as a preprocessing operation, but will be merged as a part of the model forward propagation. -### Compatibility of Models +## Compatibility of Models We have performed compatibility with the model weights provided by model zoo to ensure that the same model weights can get a comparable accuracy in both version. But note that due to the large number of differences in processing details, the inference outputs can be slightly different(less than 0.05% difference in accuracy). @@ -751,7 +128,7 @@ def __init__(self): self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) ``` -#### Heatmap-based Model +### Heatmap-based Model For models based on `SimpleBaseline` approach, developers need to pay attention to the last convolutional layer. 
@@ -802,7 +179,7 @@ def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args, state_dict[prefix + k_new] = v ``` -#### RLE-based Model +### RLE-based Model For the RLE-based models, since the loss module is renamed to `loss_module` in MMPose 1.0, and the flow model is subsumed under the loss module, changes need to be made to the keys in `state_dict`: diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index 244a680ff3..c91aab1188 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,61 @@ # Changelog +## **v1.0.0rc1 (14/3/2023)** + +**Highlights** + +- Release RTMPose, a high-performance real-time pose estimation algorithm with cross-platform deployment and inference support. See details at the [project page](/projects/rtmpose/) +- Support several new algorithms: ViTPose (arXiv'2022), CID (CVPR'2022), DEKR (CVPR'2021) +- Add Inferencer, a convenient inference interface that performs pose estimation and visualization on images, videos and webcam streams with only one line of code +- Introduce *Project*, a new form for rapid and easy implementation of new algorithms and features in MMPose, which is more handy for community contributors + +**New Features** + +- Support RTMPose ([#1971](https://github.com/open-mmlab/mmpose/pull/1971), [#2024](https://github.com/open-mmlab/mmpose/pull/2024), [#2028](https://github.com/open-mmlab/mmpose/pull/2028), [#2030](https://github.com/open-mmlab/mmpose/pull/2030), [#2040](https://github.com/open-mmlab/mmpose/pull/2040), [#2057](https://github.com/open-mmlab/mmpose/pull/2057)) +- Support Inferencer ([#1969](https://github.com/open-mmlab/mmpose/pull/1969)) +- Support ViTPose ([#1876](https://github.com/open-mmlab/mmpose/pull/1876), [#2056](https://github.com/open-mmlab/mmpose/pull/2056), [#2058](https://github.com/open-mmlab/mmpose/pull/2058), [#2065](https://github.com/open-mmlab/mmpose/pull/2065)) +- Support CID
([#1907](https://github.com/open-mmlab/mmpose/pull/1907)) +- Support DEKR ([#1834](https://github.com/open-mmlab/mmpose/pull/1834), [#1901](https://github.com/open-mmlab/mmpose/pull/1901)) +- Support training with multiple datasets ([#1767](https://github.com/open-mmlab/mmpose/pull/1767), [#1930](https://github.com/open-mmlab/mmpose/pull/1930), [#1938](https://github.com/open-mmlab/mmpose/pull/1938), [#2025](https://github.com/open-mmlab/mmpose/pull/2025)) +- Add *project* to allow rapid and easy implementation of new models and features ([#1914](https://github.com/open-mmlab/mmpose/pull/1914)) + +**Improvements** + +- Improve documentation quality ([#1846](https://github.com/open-mmlab/mmpose/pull/1846), [#1858](https://github.com/open-mmlab/mmpose/pull/1858), [#1872](https://github.com/open-mmlab/mmpose/pull/1872), [#1899](https://github.com/open-mmlab/mmpose/pull/1899), [#1925](https://github.com/open-mmlab/mmpose/pull/1925), [#1945](https://github.com/open-mmlab/mmpose/pull/1945), [#1952](https://github.com/open-mmlab/mmpose/pull/1952), [#1990](https://github.com/open-mmlab/mmpose/pull/1990), [#2023](https://github.com/open-mmlab/mmpose/pull/2023), [#2042](https://github.com/open-mmlab/mmpose/pull/2042)) +- Support visualizing keypoint indices ([#2051](https://github.com/open-mmlab/mmpose/pull/2051)) +- Support OpenPose style visualization ([#2055](https://github.com/open-mmlab/mmpose/pull/2055)) +- Accelerate image transpose in data pipelines with tensor operation ([#1976](https://github.com/open-mmlab/mmpose/pull/1976)) +- Support auto-import modules from registry ([#1961](https://github.com/open-mmlab/mmpose/pull/1961)) +- Support keypoint partition metric ([#1944](https://github.com/open-mmlab/mmpose/pull/1944)) +- Support SimCC 1D-heatmap visualization ([#1912](https://github.com/open-mmlab/mmpose/pull/1912)) +- Support saving predictions and data metainfo in demos ([#1814](https://github.com/open-mmlab/mmpose/pull/1814), 
[#1879](https://github.com/open-mmlab/mmpose/pull/1879)) +- Support SimCC with DARK ([#1870](https://github.com/open-mmlab/mmpose/pull/1870)) +- Remove Gaussian blur for offset maps in UDP-regress ([#1815](https://github.com/open-mmlab/mmpose/pull/1815)) +- Refactor encoding interface of Codec for better extendibility and easier configuration ([#1781](https://github.com/open-mmlab/mmpose/pull/1781)) +- Support evaluating CocoMetric without annotation file ([#1722](https://github.com/open-mmlab/mmpose/pull/1722)) +- Improve unit tests ([#1765](https://github.com/open-mmlab/mmpose/pull/1765)) + +**Bug Fixes** + +- Fix repeated warnings from different ranks ([#2053](https://github.com/open-mmlab/mmpose/pull/2053)) +- Avoid frequent scope switching when using mmdet inference api ([#2039](https://github.com/open-mmlab/mmpose/pull/2039)) +- Remove EMA parameters and message hub data when publishing model checkpoints ([#2036](https://github.com/open-mmlab/mmpose/pull/2036)) +- Fix metainfo copying in dataset class ([#2017](https://github.com/open-mmlab/mmpose/pull/2017)) +- Fix top-down demo bug when there is no object detected ([#2007](https://github.com/open-mmlab/mmpose/pull/2007)) +- Fix config errors ([#1882](https://github.com/open-mmlab/mmpose/pull/1882), [#1906](https://github.com/open-mmlab/mmpose/pull/1906), [#1995](https://github.com/open-mmlab/mmpose/pull/1995)) +- Fix image demo failure when GUI is unavailable ([#1968](https://github.com/open-mmlab/mmpose/pull/1968)) +- Fix bug in AdaptiveWingLoss ([#1953](https://github.com/open-mmlab/mmpose/pull/1953)) +- Fix incorrect importing of RepeatDataset which is deprecated ([#1943](https://github.com/open-mmlab/mmpose/pull/1943)) +- Fix bug in bottom-up datasets that ignores images without instances ([#1752](https://github.com/open-mmlab/mmpose/pull/1752), [#1936](https://github.com/open-mmlab/mmpose/pull/1936)) +- Fix upstream dependency issues ([#1867](https://github.com/open-mmlab/mmpose/pull/1867), 
[#1921](https://github.com/open-mmlab/mmpose/pull/1921)) +- Fix evaluation issues and update results ([#1763](https://github.com/open-mmlab/mmpose/pull/1763), [#1773](https://github.com/open-mmlab/mmpose/pull/1773), [#1780](https://github.com/open-mmlab/mmpose/pull/1780), [#1850](https://github.com/open-mmlab/mmpose/pull/1850), [#1868](https://github.com/open-mmlab/mmpose/pull/1868)) +- Fix local registry missing warnings ([#1849](https://github.com/open-mmlab/mmpose/pull/1849)) +- Remove deprecated scripts for model deployment ([#1845](https://github.com/open-mmlab/mmpose/pull/1845)) +- Fix a bug in input transformation in BaseHead ([#1843](https://github.com/open-mmlab/mmpose/pull/1843)) +- Fix an interface mismatch with MMDetection in webcam demo ([#1813](https://github.com/open-mmlab/mmpose/pull/1813)) +- Fix a bug in heatmap visualization that causes incorrect scale ([#1800](https://github.com/open-mmlab/mmpose/pull/1800)) +- Add model metafiles ([#1768](https://github.com/open-mmlab/mmpose/pull/1768)) + ## **v1.0.0rc0 (14/10/2022)** **New Features** diff --git a/docs/en/notes/contribution_guide.md b/docs/en/notes/contribution_guide.md index 23bc90cd52..525ca9a7e1 100644 --- a/docs/en/notes/contribution_guide.md +++ b/docs/en/notes/contribution_guide.md @@ -1,31 +1,173 @@ -# Contributing to MMPose +# How to Contribute to MMPose -All kinds of contributions are welcome, including but not limited to the following. +Welcome to join the MMPose community, we are committed to building cutting-edge computer vision foundational library. All kinds of contributions are welcomed, including but not limited to: -- Fix typo or bugs -- Add documentation or translate the documentation into other languages -- Add new features and components +- **Fix bugs** + 1. If the modification involves significant changes, it's recommended to create an issue first that describes the error information and how to trigger the bug. 
Other developers will discuss it with you and propose a proper solution. + 2. Fix the bug and add the corresponding unit test, submit the PR. +- **Add new features or components** + 1. If the new feature or module involves a large amount of code changes, we suggest that you submit an issue first, and we will confirm the necessity of the function with you. + 2. Implement the new feature and add unit tests, submit the PR. +- **Improve documentation or translation** + - If you find errors or incomplete documentation, please submit a PR directly. -## Workflow +```{note} +- If you hope to contribute to MMPose 1.0, please create a new branch from dev-1.x and submit a PR to the dev-1.x branch. +- If you are the author of papers in this field and would like to include your work to MMPose, please contact us. We will much appreciate your contribution. +- If you hope to share your MMPose-based projects with the community at once, consider creating a PR to the `Projects` directory, which will simplify the review process and bring in the projects as soon as possible. Check out our [example project](/projects/example_project) +- If you wish to join the MMPose developers, please feel free to contact us and we will invite you to join the MMPose developers group. +``` + +## Preparation + +The commands for processing pull requests are implemented using Git, and this chapter details Git Configuration and associated GitHub. + +### Git Configuration + +First, you need to install Git and configure your Git username and email. + +```shell +# view the Git version +git --version +``` + +Second, check your Git config and ensure that `user.name` and `user.email` are properly configured. -1. Fork and pull the latest MMPose repository -2. Checkout a new branch (do not use master branch for PRs) -3. Commit your changes -4.
Create a PR +```shell +# view the Git config +git config --global --list +# configure the user name and email +git config --global user.name "Change your user name here" +git config --global user.email "Change your user email here" +``` + +## Pull Request Workflow + +If you’re not familiar with Pull Request, don’t worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the development mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests). + +### 1. Fork and Clone + +If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile. + +![](https://user-images.githubusercontent.com/13503330/223318144-a49c6cef-b1fb-45b8-aa2b-0833d0e3fd5c.png) + +Then you need to clone the forked repository to your local machine. + +```shell +# clone the forked repository +git clone https://github.com/username/mmpose.git + +# Add official repository as upstream remote +cd mmpose +git remote add upstream https://github.com/open-mmlab/mmpose.git +``` + +Enter the following command in the terminal to see if the remote repository was successfully added. + +```shell +git remote -v +``` + +If the following message appears, you have successfully added a remote repository. + +```Shell +origin https://github.com/{username}/mmpose.git (fetch) +origin https://github.com/{username}/mmpose.git (push) +upstream https://github.com/open-mmlab/mmpose.git (fetch) +upstream https://github.com/open-mmlab/mmpose.git (push) +``` ```{note} -- If you wish to PR to MMPose 1.0, please create your branch from `dev-1.x` -- If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. 
-- If you are the author of papers in this field and would like to include your work to MMPose, please contact us. We will much appreciate your contribution. +Here’s a brief introduction to the origin and upstream. When we use “git clone”, we create an “origin” remote by default, which points to the repository cloned from. As for “upstream”, we add it ourselves to point to the target repository. Of course, if you don’t like the name “upstream”, you could name it as you wish. Usually, we’ll push the code to “origin”. If the pushed code conflicts with the latest code in official(“upstream”), we should pull the latest code from upstream to resolve the conflicts, and then push to “origin” again. The posted Pull Request will be updated automatically. ``` -## Code style +### 2. Configure pre-commit -### Python +You should configure pre-commit in the local development environment to make sure the code style matches that of OpenMMLab. Note: The following code should be executed under the MMPOSE directory. -We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. +```Shell +pip install -U pre-commit +pre-commit install +``` + +Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`. + +```Shell +pre-commit run --all-files +``` -We use the following tools for linting and formatting: +![](https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png) + +```{note} +Chinese users may fail to download the pre-commit hooks due to the network issue. In this case, you could download these hooks from: + +pip install -U pre-commit -i https://pypi.tuna.tsinghua.edu.cn/simple + +or: + +pip install -U pre-commit -i https://pypi.mirrors.ustc.edu.cn/simple +``` + +If the installation process is interrupted, you can repeatedly run `pre-commit run ...` to continue the installation. 
+ +If the code does not conform to the code style specification, pre-commit will raise a warning and fix some of the errors automatically. + +![](https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png) + +### 3. Create a development branch + +After configuring the pre-commit, we should create a branch based on the dev branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name`. + +```Shell +git checkout -b username/refactor_contributing_doc +``` + +In subsequent development, if the dev branch of the local repository lags behind the dev branch of the official repository, you need to pull the upstream dev branch first and then rebase the local development branch onto it. + +```Shell +git checkout username/refactor_contributing_doc +git fetch upstream +git rebase upstream/dev-1.x +``` + +When rebasing, if a conflict arises, you need to resolve the conflict manually, then execute the `git add` command, and then execute the `git rebase --continue` command until the rebase is complete. + +### 4. Commit the code and pass the unit test + +After the local development is done, we need to pass the unit tests locally and then commit the code. + +```shell +# run unit test +pytest tests/ + +# commit the code +git add . +git commit -m "commit message" +``` + +### 5. Push the code to the remote repository + +After the local development is done, we need to push the code to the remote repository. + +```Shell +git push origin username/refactor_contributing_doc +``` + +### 6.
Create a Pull Request + +#### (1) Create a Pull Request on GitHub + +![](https://user-images.githubusercontent.com/13503330/223321382-e6068e18-1d91-4458-8328-b1c7c907b3b2.png) + +#### (2) Fill in the Pull Request template + +![](https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png) + +## Code Style + +### Python + +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style, and use the following tools for linting and formatting: - [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. - [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. @@ -40,22 +182,10 @@ We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `f fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. The config for a pre-commit hook is stored in [.pre-commit-config](/.pre-commit-config.yaml). -After you clone the repository, you will need to install initialize pre-commit hook. - -```shell -pip install -U pre-commit -``` - -From the repository folder - -```shell -pre-commit install +```{note} +Before you create a PR, make sure that your code lints and is formatted by yapf. ``` -After this on every commit check code linters and formatter will be enforced. - -> Before you create a PR, make sure that your code lints and is formatted by yapf. - ### C++ and CUDA We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md index e05a695adc..54d4f7b707 100644 --- a/docs/en/notes/faq.md +++ b/docs/en/notes/faq.md @@ -12,15 +12,19 @@ Compatible MMPose and MMCV versions are shown as below. 
Please choose the correc ### MMPose 1.x -| MMPose version | MMCV version | +| MMPose version | MMCV/MMEngine version | | :------------: | :-----------------------------: | -| 1.0.0b0 | mmcv>=2.0.0rc1, mmengine>=0.0.1 | +| 1.0.0rc1 | mmcv>=2.0.0rc4, mmengine>=0.6.0 | +| 1.0.0rc0 | mmcv>=2.0.0rc0, mmengine>=0.0.1 | +| 1.0.0b0 | mmcv>=2.0.0rc0, mmengine>=0.0.1 | ### MMPose 0.x | MMPose version | MMCV version | | :------------: | :-----------------------: | -| master | mmcv-full>=1.3.8, \<1.7.0 | +| master | mmcv-full>=1.3.8, \<1.8.0 | +| 0.29.0 | mmcv-full>=1.3.8, \<1.7.0 | +| 0.28.1 | mmcv-full>=1.3.8, \<1.7.0 | | 0.28.0 | mmcv-full>=1.3.8, \<1.6.0 | | 0.27.0 | mmcv-full>=1.3.8, \<1.6.0 | | 0.26.0 | mmcv-full>=1.3.8, \<1.6.0 | diff --git a/docs/en/overview.md b/docs/en/overview.md index fbeaac13f5..9c5fef6e04 100644 --- a/docs/en/overview.md +++ b/docs/en/overview.md @@ -42,7 +42,7 @@ We have prepared detailed guidelines for all types of users: - [Codecs](./user_guides/codecs.md) - [Train & Test](./user_guides/train_and_test.md) - [Visualization](./user_guides/visualization.md) - - [Useful Tools](./user_guides/useful_tools.md) + - [How to](./user_guides/how_to.md) 4. For developers who wish to develop based on MMPose: - [Migration Guide](./migration.md) 5. 
For researchers and developers who are willing to contribute to MMPose: diff --git a/docs/en/user_guides/codecs.md b/docs/en/user_guides/codecs.md index 308c0154b1..ca6ebccf63 100644 --- a/docs/en/user_guides/codecs.md +++ b/docs/en/user_guides/codecs.md @@ -26,11 +26,9 @@ The encoder transforms the coordinates in the input image space into the needed For example, in the Regression-based method, the encoder will be: ```Python -def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None -) -> Tuple[np.ndarray, np.ndarray]: +def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints from input image space to normalized space. Args: @@ -39,13 +37,12 @@ def encode( (N, K) Returns: - tuple: - - reg_labels (np.ndarray): The normalized regression labels in + dict: + - keypoint_labels (np.ndarray): The normalized regression labels in shape (N, K, D) where D is 2 for 2d coordinates - keypoint_weights (np.ndarray): The target weights in shape (N, K) """ - if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) @@ -54,10 +51,39 @@ def encode( (keypoints <= [w - 1, h - 1])).all(axis=-1) & ( keypoints_visible > 0.5) - reg_labels = (keypoints / np.array([w, h])).astype(np.float32) + keypoint_labels = (keypoints / np.array([w, h])).astype(np.float32) keypoint_weights = np.where(valid, 1., 0.).astype(np.float32) - return reg_labels, keypoint_weights + encoded = dict( + keypoint_labels=keypoint_labels, keypoint_weights=keypoint_weights) + + return encoded +``` + +The encoded data is converted to Tensor format in `PackPoseInputs` and packed in `data_sample.gt_instance_labels` for model calls, which is generally used for loss calculation, as demonstrated by `loss()` in `RegressionHead`. 
+ +```Python +def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, keypoint_labels, + keypoint_weights.unsqueeze(-1)) + + losses.update(loss_kpt=loss) + ### Omitted ### ``` ### Decoder @@ -125,7 +151,7 @@ codec = dict(type='RegressionLabel', input_size=(192, 256)) In pipelines, A codec should be passed into `GenerateTarget` to work as the `encoder`: ```Python -dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec) +dict(type='GenerateTarget', encoder=codec) ``` ### Head @@ -189,7 +215,7 @@ train_pipeline = [ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), ## Generate Target ## + dict(type='GenerateTarget', encoder=codec), ## Generate Target ## dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/docs/en/user_guides/configs.md b/docs/en/user_guides/configs.md index be73312e20..92ed4a157d 100644 --- a/docs/en/user_guides/configs.md +++ b/docs/en/user_guides/configs.md @@ -50,6 +50,10 @@ loss_cfg = dict( loss = MODELS.build(loss_cfg) # equals to `loss = Loss_A(param1=1.0, param2=True)` ``` +```{note} +Note that all new modules need to be registered using `Registry` and imported in `__init__.py` in the corresponding directory before we can create their instances from configs. 
+``` + Here is a list of pre-defined registries in MMPose: - `DATASETS`: data-related modules @@ -129,16 +133,14 @@ _base_ = ['../../../_base_/default_runtime.py'] # take the config file as the st ``` ```{note} -**Tips** - CheckpointHook: -- save_best: `'coco/AP'` for `CocoMetric`, `'pck/PCK@0.05'` for `PCKAccuracy` +- save_best: `'coco/AP'` for `CocoMetric`, `'PCK'` for `PCKAccuracy` - max_keep_ckpts: the maximum checkpoints to keep. Defaults to -1, which means unlimited. Example: -`default_hooks = dict(checkpoint=dict(save_best='pck/PCK@0.05', rule='greater', max_keep_ckpts=1))` +`default_hooks = dict(checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))` ``` ### Data @@ -174,7 +176,7 @@ train_pipeline = [ # data aug in training dict(type='TopdownAffine', input_size=codec['input_size']), # update inputs via transform matrix dict( type='GenerateTarget', # generate targets via transformed inputs - target_type='heatmap', # typeof targets + # typeof targets encoder=codec, # get encoder from codec dict(type='PackPoseInputs') # pack targets ] @@ -218,9 +220,10 @@ test_dataloader = val_dataloader # use val as test by default ``` ```{note} -**Tips** - -You can set the random seed by doing: `randomness=dict(seed=0)` +Common Usages: +- [Resume training](../common_usages/resume_training.md) +- [Automatic mixed precision (AMP) training](../common_usages/amp_training.md) +- [Set the random seed](../common_usages/set_random_seed.md) ``` diff --git a/docs/en/user_guides/how_to.md b/docs/en/user_guides/how_to.md new file mode 100644 index 0000000000..fe97ee6539 --- /dev/null +++ b/docs/en/user_guides/how_to.md @@ -0,0 +1,110 @@ +# How to + +## Log Analysis + +MMPose provides `tools/analysis_tools/analyze_logs.py` to analyze the training log. The log file can be either a json file or a text file. The json file is recommended, because it is more convenient to parse and visualize. 
+ +Currently, the following functions are supported: + +- Plot loss/accuracy curves +- Calculate training time + +### Plot Loss/Accuracy Curves + +The function depends on `seaborn`, please install it first by running `pip install seaborn`. + +![log_curve](https://user-images.githubusercontent.com/87690686/188538215-5d985aaa-59f8-44cf-b6f9-10890d599e9c.png) + +```shell +python tools/analysis_tools/analyze_logs.py plot_curve ${JSON_LOGS} [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] +``` + +Examples: + +- Plot loss curve + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_kpt --legend loss_kpt + ``` + +- Plot accuracy curve and export to PDF file + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys acc_pose --out results.pdf + ``` + +- Plot multiple log files on the same figure + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys loss_kpt --legend run1 run2 --title loss_kpt --out loss_kpt.png + ``` + +### Calculate Training Time + +```shell +python tools/analysis_tools/analyze_logs.py cal_train_time ${JSON_LOGS} [--include-outliers] +``` + +Examples: + +```shell +python tools/analysis_tools/analyze_logs.py cal_train_time log.json +``` + +The result is as follows: + +```text +-----Analyze train time of hrnet_w32_256x192.json----- +slowest epoch 56, average time is 0.6924 +fastest epoch 1, average time is 0.6502 +time std over epochs is 0.0085 +average iter time: 0.6688 s/iter +``` + +## Get Model Params & FLOPs + +MMPose provides `tools/analysis_tools/get_flops.py` to get model parameters and FLOPs. + +```shell +python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] [--cfg-options ${CFG_OPTIONS}] +``` + +Description of all arguments: + +`CONFIG_FILE` : The path of a model config file. + +`--shape`: The input shape to the model. 
+ +`--input-constructor`: If specified as batch, it will generate a batch tensor to calculate FLOPs. + +`--batch-size`: If `--input-constructor` is specified as batch, it will generate a random tensor with shape `(batch_size, 3, **input_shape)` to calculate FLOPs. + +`--cfg-options`: If specified, the given key-value pairs will be merged into the config file. + +Example: + +```shell +python tools/analysis_tools/get_flops.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py +``` + +We will get the following results: + +```text +============================== +Input shape: (1, 3, 256, 192) +Flops: 7.7 GFLOPs +Params: 28.54 M +============================== +``` + +```{note} +This tool is still experimental and we do not guarantee that the number is absolutely correct. Some operators, such as GN and custom operators, are not counted in the FLOPs. +``` + +## Print Entire Config + +Officially provided config files inherit multiple config files, which can facilitate management and reduce redundant code. But sometimes we want to know the default values of parameters that are not explicitly written in the configuration file. MMPose provides `tools/analysis_tools/print_config.py` to print the entire configuration information verbatim. + +```shell +python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] +``` diff --git a/docs/en/user_guides/inference.md b/docs/en/user_guides/inference.md index b247d819fb..e9af2adee1 100644 --- a/docs/en/user_guides/inference.md +++ b/docs/en/user_guides/inference.md @@ -9,6 +9,122 @@ In MMPose, a model is defined by a configuration file and existing model paramet To start with, we recommend HRNet model with [this configuration file](/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py) and [this checkpoint file](https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth).
It is recommended to download the checkpoint file to `checkpoints` directory. +## Out-of-the-box inferencer + +MMPose offers a comprehensive API for inference, known as `MMPoseInferencer`. This API enables users to perform inference on both images and videos using all the models supported by MMPose. Furthermore, the API provides automatic visualization of inference results and allows for the convenient saving of predictions. + +Here is an example of inference on a given image using the pre-trained human pose estimator. + +```python +from mmpose.apis import MMPoseInferencer + +img_path = 'tests/data/coco/000000000785.jpg' # you can specify your own picture path + +# build the inferencer with model alias +inferencer = MMPoseInferencer('human') + +# The MMPoseInferencer API utilizes a lazy inference strategy, +# whereby it generates a prediction generator when provided with input +result_generator = inferencer(img_path, show=True) +result = next(result_generator) +``` + +If everything works fine, you will see the following image in a new window. +![inferencer_result_coco](https://user-images.githubusercontent.com/26127467/220008302-4a57fd44-0978-408e-8351-600e5513316a.jpg) + +The variable `result` is a dictionary that contains two keys, `'visualization'` and `'predictions'`. The key `'visualization'` is intended to contain the visualization results. However, as the `return_vis` argument was not specified, this list remains blank. On the other hand, the key `'predictions'` is a list that contains the estimated keypoints for each individual instance. + +### CLI tool + +A command-line interface (CLI) tool for the inferencer is also available: `demo/inferencer_demo.py`. 
This tool enables users to perform inference with the same model and inputs using the following command: + +```bash +python demo/inferencer_demo.py 'tests/data/coco/000000000785.jpg' \ + --pose2d 'human' --show --pred-out-dir 'predictions' +``` + +The predictions will be saved in `predictions/000000000785.json`. + +### Custom pose estimation models + +The inferencer provides several methods that can be used to customize the models employed: + +```python + +# build the inferencer with model alias +# the available aliases include 'human', 'hand', 'face' and 'animal' +inferencer = MMPoseInferencer('human') + +# build the inferencer with model config name +inferencer = MMPoseInferencer('td-hm_hrnet-w32_8xb64-210e_coco-256x192') + +# build the inferencer with model config path and checkpoint path/URL +inferencer = MMPoseInferencer( + pose2d='configs/body_2d_keypoint/topdown_heatmap/coco/' \ + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192.py', + pose2d_weights='https://download.openmmlab.com/mmpose/top_down/' \ + 'hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' +) +``` + +In addition, top-down pose estimators also require an object detection model. The inferencer is capable of inferring the instance type for models trained with datasets supported in MMPose, and subsequently constructing the necessary object detection model.
Alternatively, users may also manually specify the detection model using the following methods: + +```python + +# specify detection model by alias +# the available aliases include 'human', 'hand', 'face', 'animal', +# as well as any additional aliases defined in mmdet +inferencer = MMPoseInferencer( + # suppose the pose estimator is trained on custom dataset + pose2d='custom_human_pose_estimator.py', + pose2d_weights='custom_human_pose_estimator.pth', + det_model='human' +) + +# specify detection model with model config name +inferencer = MMPoseInferencer( + pose2d='human', + det_model='yolox_l_8x8_300e_coco', + det_cat_ids=[0], # the category id of 'human' class +) + +# specify detection model with config path and checkpoint path/URL +inferencer = MMPoseInferencer( + pose2d='human', + det_model=f'{PATH_TO_MMDET}/configs/yolox/yolox_l_8x8_300e_coco.py', + det_weights='https://download.openmmlab.com/mmdetection/v2.0/' \ + 'yolox/yolox_l_8x8_300e_coco/' \ + 'yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth', + det_cat_ids=[0], # the category id of 'human' class +) +``` + +### Input format + +The inferencer is capable of processing a range of input types, which includes the following: + +- A path to an image +- A path to a video +- A path to a folder (which will cause all images in that folder to be inferred) +- An image array +- A list of image arrays +- A webcam (in which case the `input` parameter should be set to either `'webcam'` or `'webcam:{CAMERA_ID}'`) + +### Output settings + +The inferencer is capable of both visualizing and saving predictions. The relevant arguments are as follows: + +| Argument | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `show` | Determines whether the image or video should be displayed in a pop-up window. | +| `radius` | Sets the keypoint radius for visualization. 
| +| `thickness` | Sets the link thickness for visualization. | +| `return_vis` | Determines whether visualization images should be included in the results. | +| `vis_out_dir` | Specifies the folder path for saving the visualization images. If not set, the visualization images will not be saved. | +| `return_datasample` | Determines whether to return the prediction in the format of `PoseDataSample`. | +| `pred_out_dir` | Specifies the folder path for saving the predictions. If not set, the predictions will not be saved. | +| `out_dir` | If `vis_out_dir` or `pred_out_dir` is not set, the values will be set to `f'{out_dir}/visualization'` or `f'{out_dir}/predictions'`, respectively. | + ## High-level APIs for inference MMPose provides high-level Python APIs for inference on a given image: diff --git a/docs/en/user_guides/prepare_datasets.md b/docs/en/user_guides/prepare_datasets.md index 1d2c93e37d..8695243f7d 100644 --- a/docs/en/user_guides/prepare_datasets.md +++ b/docs/en/user_guides/prepare_datasets.md @@ -224,4 +224,48 @@ Make sure you have provided all the paths correctly. The following dataset wrappers are supported in [MMEngine](https://github.com/open-mmlab/mmengine), you can refer to [MMEngine tutorial](https://mmengine.readthedocs.io/en/latest) to learn how to use it. +- [ConcatDataset](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html#concatdataset) - [RepeatDataset](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html#repeatdataset) + +### CombinedDataset + +MMPose provides `CombinedDataset` to combine multiple datasets with different annotations. 
A combined dataset can be defined in config files as: + +```python +dataset_1 = dict( + type='dataset_type_1', + data_root='root/of/your/dataset1', + data_prefix=dict(img_path='path/to/your/img'), + ann_file='annotations/train.json', + pipeline=[ + # the converter transforms convert data into a unified format + converter_transform_1 + ]) + +dataset_2 = dict( + type='dataset_type_2', + data_root='root/of/your/dataset2', + data_prefix=dict(img_path='path/to/your/img'), + ann_file='annotations/train.json', + pipeline=[ + converter_transform_2 + ]) + +shared_pipeline = [ + LoadImage(), + ParseImage(), +] + +combined_dataset = dict( + type='CombinedDataset', + metainfo=dict(from_file='path/to/your/metainfo'), + datasets=[dataset_1, dataset_2], + pipeline=shared_pipeline, +) +``` + +- **MetaInfo of combined dataset** determines the annotation format. Either metainfo of a sub-dataset or a custom dataset metainfo is valid here. To customize a dataset metainfo, please refer to [Create a custom dataset_info config file for the dataset](#create-a-custom-datasetinfo-config-file-for-the-dataset). + +- **Converter transforms of sub-datasets** are applied when there exist mismatches of annotation format between sub-datasets and the combined dataset. For example, the number and order of keypoints might be different in the combined dataset and the sub-datasets. Then `KeypointConverter` can be used to unify the keypoints number and order. + +- More details about `CombinedDataset` and `KeypointConverter` can be found in Advanced Guides-[Training with Mixed Datasets](../advanced_guides/mixed_datasets.md). diff --git a/docs/en/user_guides/train_and_test.md b/docs/en/user_guides/train_and_test.md index 66057d6cb6..ae0d459da2 100644 --- a/docs/en/user_guides/train_and_test.md +++ b/docs/en/user_guides/train_and_test.md @@ -47,6 +47,10 @@ CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [ARGS] | `--no-validate` | **Not suggested**. Disable checkpoint evaluation during training. 
| | `--auto-scale-lr` | Automatically rescale the learning rate according to the actual batch size and the original batch size. | | `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that quotation marks are necessary and that **no white space is allowed**. | +| `--show-dir SHOW_DIR` | The directory to save the result visualization images generated during validation. | +| `--show` | Visualize the prediction result in a window. | +| `--interval INTERVAL` | The interval of samples to visualize. | +| `--wait-time WAIT_TIME` | The display time of every window (in seconds). Defaults to 1. | | `--launcher {none,pytorch,slurm,mpi}` | Options for job launcher. | ### Train with multiple GPUs diff --git a/docs/en/user_guides/visualization.md b/docs/en/user_guides/visualization.md index cdb41f059e..2dd39c6f65 100644 --- a/docs/en/user_guides/visualization.md +++ b/docs/en/user_guides/visualization.md @@ -1,8 +1,30 @@ # Visualization +- [Single Image](#single-image) - [Browse Dataset](#browse-dataset) - [Visualizer Hook](#visualizer-hook) +## Single Image + +`demo/image_demo.py` helps the user to visualize the prediction result of a single image, including the skeleton and heatmaps. + +```shell +python demo/image_demo.py ${IMG} ${CONFIG} ${CHECKPOINT} [-h] [--out-file OUT_FILE] [--device DEVICE] [--draw-heatmap] +``` + +| ARGS | Description | +| --------------------- | -------------------------------- | +| `IMG` | The path to the test image. | +| `CONFIG` | The path to the config file. | +| `CHECKPOINT` | The path to the checkpoint file. | +| `--out-file OUT_FILE` | Path to output file. | +| `--device DEVICE` | Device used for inference. | +| `--draw-heatmap` | Visualize the predicted heatmap. 
| + +Here is an example of Heatmap visualization: + +![000000196141](https://user-images.githubusercontent.com/13503330/222373580-88d93603-e00e-45e9-abdd-f504a62b4ca5.jpg) + ## Browse Dataset `tools/analysis_tools/browse_dataset.py` helps the user to browse a pose dataset visually, or save the image to a designated directory. @@ -71,7 +93,7 @@ During validation: python tools/train.py ${CONFIG} --work-dir ${WORK_DIR} --show-dir=${SHOW_DIR} ``` -More details about visualization hook arguments can be found in [train_and_test](./train_and_test.md). +More details about visualization arguments can be found in [train_and_test](./train_and_test.md). If you use a heatmap-based method and want to visualize predicted heatmaps, you can manually specify `output_heatmaps=True` for `model.test_cfg` in config file. Another way is to add `--cfg-options='model.test_cfg.output_heatmaps=True'` at the end of your command. diff --git a/docs/src/papers/algorithms/cid.md b/docs/src/papers/algorithms/cid.md new file mode 100644 index 0000000000..4366f95504 --- /dev/null +++ b/docs/src/papers/algorithms/cid.md @@ -0,0 +1,31 @@ +# Contextual Instance Decoupling for Robust Multi-Person Pose Estimation + + + +
+CID (CVPR'2022) + +```bibtex +@InProceedings{Wang_2022_CVPR, + author = {Wang, Dongkai and Zhang, Shiliang}, + title = {Contextual Instance Decoupling for Robust Multi-Person Pose Estimation}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2022}, + pages = {11060-11068} +} +``` + +
+ +## Abstract + + + +Crowded scenes make it challenging to differentiate persons and locate their pose keypoints. This paper proposes the Contextual Instance Decoupling (CID), which presents a new pipeline for multi-person pose estimation. Instead of relying on person bounding boxes to spatially differentiate persons, CID decouples persons in an image into multiple instance-aware feature maps. Each of those feature maps is hence adopted to infer keypoints for a specific person. Compared with bounding box detection, CID is differentiable and robust to detection errors. Decoupling persons into different feature maps allows to isolate distractions from other persons, and explore context cues at scales larger than the bounding box size. Experiments show that CID outperforms previous multi-person pose estimation pipelines on crowded scenes pose estimation benchmarks in both accuracy and efficiency. For instance, it achieves 71.3% AP on CrowdPose, outperforming the recent single-stage DEKR by 5.6%, the bottom-up CenterAttention by 3.7%, and the top-down JCSPPE by 5.3%. This advantage sustains on the commonly used COCO benchmark. + + + +
+ +
diff --git a/docs/src/papers/algorithms/dekr.md b/docs/src/papers/algorithms/dekr.md new file mode 100644 index 0000000000..ee19a3315b --- /dev/null +++ b/docs/src/papers/algorithms/dekr.md @@ -0,0 +1,31 @@ +# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression + + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ +## Abstract + + + +In this paper, we are interested in the bottom-up paradigm of estimating human poses from an image. We study the dense keypoint regression framework that is previously inferior to the keypoint detection and grouping framework. Our motivation is that regressing keypoint positions accurately needs to learn representations that focus on the keypoint regions. +We present a simple yet effective approach, named disentangled keypoint regression (DEKR). We adopt adaptive convolutions through pixel-wise spatial transformer to activate the pixels in the keypoint regions and accordingly learn representations from them. We use a multi-branch structure for separate regression: each branch learns a representation with dedicated adaptive convolutions and regresses one keypoint. The resulting disentangled representations are able to attend to the keypoint regions, respectively, and thus the keypoint regression is spatially more accurate. We empirically show that the proposed direct regression method outperforms keypoint detection and grouping methods and achieves superior bottom-up pose estimation results on two benchmark datasets, COCO and CrowdPose. The code and models are available at [this https URL](https://github.com/HRNet/DEKR). + + + +
+ +
diff --git a/docs/src/papers/algorithms/rtmpose.md b/docs/src/papers/algorithms/rtmpose.md new file mode 100644 index 0000000000..a2a285fe20 --- /dev/null +++ b/docs/src/papers/algorithms/rtmpose.md @@ -0,0 +1,34 @@ +# RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose + + + +
+RTMPose (arXiv'2023) + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +``` + +
+ +## Abstract + + + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. In order to bridge this gap, we empirically explore key factors in pose estimation including paradigm, model architecture, training strategy, and deployment, and present a high-performance real-time multi-person pose estimation framework, RTMPose, based on MMPose. Our RTMPose-m achieves 75.8% AP on COCO with 90+ FPS on an Intel i7-11700 CPU and 430+ FPS on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves 67.0% AP on COCO-WholeBody with 130+ FPS. To further evaluate RTMPose’s capability in critical real-time applications, we also report the performance after deploying on the mobile device. Our RTMPose-s achieves 72.2% AP on COCO with 70+ FPS on a Snapdragon 865 chip, outperforming existing open-source libraries. Code and models are released at https://github.com/open-mmlab/mmpose/tree/1.x/projects/rtmpose. + + + +
+ +
diff --git a/docs/src/papers/algorithms/vitpose.md b/docs/src/papers/algorithms/vitpose.md new file mode 100644 index 0000000000..3c74233dfa --- /dev/null +++ b/docs/src/papers/algorithms/vitpose.md @@ -0,0 +1,30 @@ +# ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation + + + +
+ViTPose (NeurIPS'2022) + +```bibtex +@inproceedings{ + xu2022vitpose, + title={Vi{TP}ose: Simple Vision Transformer Baselines for Human Pose Estimation}, + author={Yufei Xu and Jing Zhang and Qiming Zhang and Dacheng Tao}, + booktitle={Advances in Neural Information Processing Systems}, + year={2022}, +} +``` + +
+ +## Abstract + + + +Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art. + + + +
+ +
diff --git a/docs/src/papers/techniques/rle.md b/docs/src/papers/techniques/rle.md new file mode 100644 index 0000000000..cdc59d57ec --- /dev/null +++ b/docs/src/papers/techniques/rle.md @@ -0,0 +1,30 @@ +# Human pose regression with residual log-likelihood estimation + + + +
+RLE (ICCV'2021) + +```bibtex +@inproceedings{li2021human, + title={Human pose regression with residual log-likelihood estimation}, + author={Li, Jiefeng and Bian, Siyuan and Zeng, Ailing and Wang, Can and Pang, Bo and Liu, Wentao and Lu, Cewu}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={11025--11034}, + year={2021} +} +``` + +
+ +## Abstract + + + +Heatmap-based methods dominate in the field of human pose estimation by modelling the output distribution through likelihood heatmaps. In contrast, regression-based methods are more efficient but suffer from inferior performance. In this work, we explore maximum likelihood estimation (MLE) to develop an efficient and effective regression-based methods. From the perspective of MLE, adopting different regression losses is making different assumptions about the output density function. A density function closer to the true distribution leads to a better regression performance. In light of this, we propose a novel regression paradigm with Residual Log-likelihood Estimation (RLE) to capture the underlying output distribution. Concretely, RLE learns the change of the distribution instead of the unreferenced underlying distribution to facilitate the training process. With the proposed reparameterization design, our method is compatible with off-the-shelf flow models. The proposed method is effective, efficient and flexible. We show its potential in various human pose estimation tasks with comprehensive experiments. Compared to the conventional regression paradigm, regression with RLE bring 12.4 mAP improvement on MSCOCO without any test-time overhead. Moreover, for the first time, especially on multi-person pose estimation, our regression method is superior to the heatmap-based methods. + + + +
+ +
diff --git a/docs/zh_cn/advanced_guides.md b/docs/zh_cn/advanced_guides.md deleted file mode 100644 index 25fcefe33e..0000000000 --- a/docs/zh_cn/advanced_guides.md +++ /dev/null @@ -1,3 +0,0 @@ -# Advanced Guides - -Work in progress... diff --git a/docs/zh_cn/advanced_guides/advanced_training.md b/docs/zh_cn/advanced_guides/advanced_training.md new file mode 100644 index 0000000000..dd02a7661f --- /dev/null +++ b/docs/zh_cn/advanced_guides/advanced_training.md @@ -0,0 +1,104 @@ +# 高级训练设置 + +## 恢复训练 + +恢复训练是指从之前某次训练保存下来的状态开始继续训练,这里的状态包括模型的权重、优化器和优化器参数调整策略的状态。 + +### 自动恢复训练 + +用户可以在训练命令最后加上 `--resume` 恢复训练,程序会自动从 `work_dirs` 中加载最新的权重文件恢复训练。如果 `work_dir` 中有最新的 `checkpoint`(例如该训练在上一次训练时被中断),则会从该 `checkpoint` 恢复训练,否则(例如上一次训练还没来得及保存 `checkpoint` 或者启动了新的训练任务)会重新开始训练。 + +下面是一个恢复训练的示例: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --resume +``` + +### 指定 Checkpoint 恢复训练 + +你也可以对 `--resume` 指定 `checkpoint` 路径,MMPose 会自动读取该 `checkpoint` 并从中恢复训练,命令如下: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py \ + --resume work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth +``` + +如果你希望手动在配置文件中指定 `checkpoint` 路径,除了设置 `resume=True`,还需要设置 `load_from` 参数。需要注意的是,如果只设置了 `load_from` 而没有设置 `resume=True`,则只会加载 `checkpoint` 中的权重并重新开始训练,而不是接着之前的状态继续训练。 + +下面的例子与上面指定 `--resume` 参数的例子等价: + +```python +resume = True +load_from = 'work_dirs/td-hm_res50_8xb64-210e_coco-256x192/latest.pth' +# model settings +model = dict( + ## 内容省略 ## + ) +``` + +## 自动混合精度(AMP)训练 + +混合精度训练在不改变模型、不降低模型训练精度的前提下,可以缩短训练时间,降低存储需求,因而能支持更大的 batch size、更大模型和尺寸更大的输入的训练。 + +如果要开启自动混合精度(AMP)训练,在训练命令最后加上 --amp 即可, 命令如下: + +```shell +python tools/train.py ${CONFIG_FILE} --amp +``` + +具体例子如下: + +```shell +python tools/train.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_res50_8xb64-210e_coco-256x192.py --amp +``` + +## 设置随机种子 + +如果想要在训练时指定随机种子,可以使用以下命令: + +```shell +python 
./tools/train.py \ + ${CONFIG} \ # 配置文件路径 + --cfg-options randomness.seed=2023 \ # 设置随机种子为 2023 + [randomness.diff_rank_seed=True] \ # 根据 rank 来设置不同的种子。 + [randomness.deterministic=True] # 把 cuDNN 后端确定性选项设置为 True +# [] 代表可选参数,实际输入命令行时,不用输入 [] +``` + +randomness 有三个参数可设置,具体含义如下: + +- `randomness.seed=2023` ,设置随机种子为 `2023`。 + +- `randomness.diff_rank_seed=True`,根据 `rank` 来设置不同的种子,`diff_rank_seed` 默认为 `False`。 + +- `randomness.deterministic=True`,把 `cuDNN` 后端确定性选项设置为 `True`,即把 `torch.backends.cudnn.deterministic` 设为 `True`,把 `torch.backends.cudnn.benchmark` 设为 `False`。`deterministic` 默认为 `False`。更多细节见 [Pytorch Randomness](https://pytorch.org/docs/stable/notes/randomness.html)。 + +如果你希望手动在配置文件中指定随机种子,可以在配置文件中设置 `randomness` 参数,具体如下: + +```python +randomness = dict(seed=2023) +# model settings +model = dict( + ## 内容省略 ## + ) +``` + +## 使用 Tensorboard 可视化训练过程 + +安装 Tensorboard 环境 + +```shell +pip install tensorboard +``` + +在 config 文件中添加 tensorboard 配置 + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) +``` + +运行训练命令后,tensorboard 文件会生成在可视化文件夹 `work_dir/${CONFIG}/${TIMESTAMP}/vis_data` 下,运行下面的命令就可以在网页链接使用 tensorboard 查看 loss、学习率和精度等信息。 + +```shell +tensorboard --logdir work_dir/${CONFIG}/${TIMESTAMP}/vis_data +``` diff --git a/docs/zh_cn/advanced_guides/mixed_datasets.md b/docs/zh_cn/advanced_guides/mixed_datasets.md new file mode 100644 index 0000000000..fac38e3338 --- /dev/null +++ b/docs/zh_cn/advanced_guides/mixed_datasets.md @@ -0,0 +1,159 @@ +# 混合数据集训练 + +MMPose 提供了一个灵活、便捷的工具 `CombinedDataset` 来进行混合数据集训练。它作为一个封装器,可以包含多个子数据集,并将来自不同子数据集的数据转换成一个统一的格式,以用于模型训练。使用 `CombinedDataset` 的数据处理流程如下图所示。 + +![combined_dataset_pipeline](https://user-images.githubusercontent.com/26127467/223333154-fb88e511-810a-423c-b755-c791d296bc43.jpg) + +本篇教程的后续部分将通过一个结合 COCO 和 AI Challenger (AIC) 数据集的例子详细介绍如何配置 `CombinedDataset`。 + +## COCO & AIC 数据集混合案例 + +COCO 和 AIC 都是 2D 人体姿态数据集。但是,这两个数据集在关键点的数量和排列顺序上有所不同。下面是分别来自这两个数据集的图片及关键点: + +
+ +有些关键点(例如“左手”)在两个数据集中都有定义,但它们具有不同的序号。具体来说,“左手”关键点在 COCO 数据集中的序号为 9,在AIC数据集中的序号为 5。此外,每个数据集都包含独特的关键点,另一个数据集中不存在。例如,面部关键点(序号为0〜4)仅在 COCO 数据集中定义,而“头顶”(序号为 12)和“颈部”(序号为 13)关键点仅在 AIC 数据集中存在。以下的维恩图显示了两个数据集中关键点之间的关系。 + +
+ +接下来,我们会介绍两种混合数据集的方式: + +- [将 AIC 合入 COCO 数据集](#将-aic-合入-coco-数据集) +- [合并 AIC 和 COCO 数据集](#合并-aic-和-coco-数据集) + +### 将 AIC 合入 COCO 数据集 + +如果用户想提高其模型在 COCO 或类似数据集上的性能,可以将 AIC 数据集作为辅助数据。此时应该仅选择 AIC 数据集中与 COCO 数据集共享的关键点,忽略其余关键点。此外,还需要将这些被选择的关键点在 AIC 数据集中的序号进行转换,以匹配在 COCO 数据集中对应关键点的序号。 + +
+ +在这种情况下,来自 COCO 的数据不需要进行转换。此时 COCO 数据集可通过如下方式配置: + +```python +dataset_coco = dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[], # `pipeline` 应为空列表,因为 COCO 数据不需要转换 +) +``` + +对于 AIC 数据集,需要转换关键点的顺序。MMPose 提供了一个 `KeypointConverter` 转换器来实现这一点。以下是配置 AIC 子数据集的示例: + +```python +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=17, # 与 COCO 数据集关键点数一致 + mapping=[ # 需要列出所有待转换关键点的序号 + (0, 6), # 0 (AIC 中的序号) -> 6 (COCO 中的序号) + (1, 8), + (2, 10), + (3, 5), + (4, 7), + (5, 9), + (6, 12), + (7, 14), + (8, 16), + (9, 11), + (10, 13), + (11, 15), + ]) + ], +) +``` + +`KeypointConverter` 会将原序号在 0 到 11 之间的关键点的序号转换为在 5 到 16 之间的对应序号。同时,在 AIC 中序号为 12 和 13 的关键点将被删除。另外,目标序号在 0 到 4 之间的关键点在 `mapping` 参数中没有定义,这些点将被设为不可见,并且不会在训练中使用。 + +子数据集都完成配置后, 混合数据集 `CombinedDataset` 可以通过如下方式配置: + +```python +dataset = dict( + type='CombinedDataset', + # 混合数据集关键点顺序和 COCO 数据集相同, + # 所以使用 COCO 数据集的描述信息 + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[dataset_coco, dataset_aic], + # `train_pipeline` 包含了常用的数据预处理, + # 比如图片读取、数据增广等 + pipeline=train_pipeline, +) +``` + +MMPose 提供了一份完整的 [配置文件](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-merge.py) 来将 AIC 合入 COCO 数据集并用于训练网络。用户可以查阅这个文件以获取更多细节,或者参考这个文件来构建新的混合数据集。 + +### 合并 AIC 和 COCO 数据集 + +将 AIC 合入 COCO 数据集的过程中丢弃了部分 AIC 数据集中的标注信息。如果用户想要使用两个数据集中的所有信息,可以将两个数据集合并,即在两个数据集中取关键点的并集。 + +
+ +在这种情况下,COCO 和 AIC 数据集都需要使用 `KeypointConverter` 来调整它们关键点的顺序: + +```python +dataset_coco = dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, # 并集中有 19 个关键点 + mapping=[ + (0, 0), + (1, 1), + # 省略 + (16, 16), + ]) + ]) + +dataset_aic = dict( + type='AicDataset', + data_root='data/aic/', + ann_file='annotations/aic_train.json', + data_prefix=dict(img='ai_challenger_keypoint_train_20170902/' + 'keypoint_train_images_20170902/'), + pipeline=[ + dict( + type='KeypointConverter', + num_keypoints=19, # 并集中有 19 个关键点 + mapping=[ + (0, 6), + # 省略 + (12, 17), + (13, 18), + ]) + ], +) +``` + +合并后的数据集有 19 个关键点,这与 COCO 或 AIC 数据集都不同,因此需要一个新的数据集描述信息文件。[coco_aic.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/_base_/datasets/coco_aic.py) 是一个描述信息文件的示例,它基于 [coco.py](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/_base_/datasets/coco.py) 并进行了以下几点修改: + +- 添加了 AIC 数据集的文章信息; +- 在 `keypoint_info` 中添加了“头顶”和“颈部”这两个只在 AIC 中定义的关键点; +- 在 `skeleton_info` 中添加了“头顶”和“颈部”间的连线; +- 拓展 `joint_weights` 和 `sigmas` 以添加新增关键点的信息。 + +完成以上步骤后,合并数据集 `CombinedDataset` 可以通过以下方式配置: + +```python +dataset = dict( + type='CombinedDataset', + # 使用新的描述信息文件 + metainfo=dict(from_file='configs/_base_/datasets/coco_aic.py'), + datasets=[dataset_coco, dataset_aic], + # `train_pipeline` 包含了常用的数据预处理, + # 比如图片读取、数据增广等 + pipeline=train_pipeline, +) +``` + +此外,在使用混合数据集时,由于关键点数量的变化,模型的输出通道数也要做相应调整。如果用户用混合数据集训练了模型,但是要在 COCO 数据集上评估模型,就需要从模型输出的关键点中取出一个子集来匹配 COCO 中的关键点格式。可以通过 `test_cfg` 中的 `output_keypoint_indices` 参数自定义此子集。这个 [配置文件](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-aic-256x192-combine.py) 展示了如何用 AIC 和 COCO 合并后的数据集训练模型并在 COCO 数据集上进行测试。用户可以查阅这个文件以获取更多细节,或者参考这个文件来构建新的混合数据集。 diff --git a/docs/zh_cn/dataset_zoo/dataset_tools.md 
b/docs/zh_cn/dataset_zoo/dataset_tools.md new file mode 100644 index 0000000000..ab30fc5604 --- /dev/null +++ b/docs/zh_cn/dataset_zoo/dataset_tools.md @@ -0,0 +1,378 @@ +# 数据集格式转换脚本 + +MMPose 提供了一些工具来帮助用户处理数据集。 + +## Animal Pose 数据集 + +
+Animal-Pose (ICCV'2019) + +```bibtex +@InProceedings{Cao_2019_ICCV, + author = {Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing}, + title = {Cross-Domain Adaptation for Animal Pose Estimation}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2019} +} +``` + +
+ +对于 [Animal-Pose](https://sites.google.com/view/animal-pose/),可以从[官方网站](https://sites.google.com/view/animal-pose/)下载图像和标注。脚本 `tools/dataset_converters/parse_animalpose_dataset.py` 将原始标注转换为 MMPose 兼容的格式。预处理的[标注文件](https://download.openmmlab.com/mmpose/datasets/animalpose_annotations.tar)可用。如果您想自己生成标注,请按照以下步骤操作: + +1. 下载图片与标注信息并解压到 `$MMPOSE/data`,按照以下格式组织: + + ```text + mmpose + ├── mmpose + ├── docs + ├── tests + ├── tools + ├── configs + `── data + │── animalpose + │ + │-- VOC2012 + │ │-- Annotations + │ │-- ImageSets + │ │-- JPEGImages + │ │-- SegmentationClass + │ │-- SegmentationObject + │ + │-- animalpose_image_part2 + │ │-- cat + │ │-- cow + │ │-- dog + │ │-- horse + │ │-- sheep + │ + │-- PASCAL2011_animal_annotation + │ │-- cat + │ │ |-- 2007_000528_1.xml + │ │ |-- 2007_000549_1.xml + │ │ │-- ... + │ │-- cow + │ │-- dog + │ │-- horse + │ │-- sheep + │ + │-- annimalpose_anno2 + │ │-- cat + │ │ |-- ca1.xml + │ │ |-- ca2.xml + │ │ │-- ... + │ │-- cow + │ │-- dog + │ │-- horse + │ │-- sheep + ``` + +2. 运行脚本 + + ```bash + python tools/dataset_converters/parse_animalpose_dataset.py + ``` + + 生成的标注文件将保存在 `$MMPOSE/data/animalpose/annotations` 中。 + +开源作者没有提供官方的 train/val/test 划分,我们选择来自 PascalVOC 的图片作为 train & val,train+val 一共 3600 张图片,5117 个标注。其中 2798 张图片,4000 个标注用于训练,810 张图片,1117 个标注用于验证。测试集包含 1000 张图片,1000 个标注用于评估。 + +## COFW 数据集 + +
+COFW (ICCV'2013) + +```bibtex +@inproceedings{burgos2013robust, + title={Robust face landmark estimation under occlusion}, + author={Burgos-Artizzu, Xavier P and Perona, Pietro and Doll{\'a}r, Piotr}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + pages={1513--1520}, + year={2013} +} +``` + +
+ +对于 COFW 数据集,请从 [COFW Dataset (Color Images)](https://data.caltech.edu/records/20099) 进行下载。 + +将 `COFW_train_color.mat` 和 `COFW_test_color.mat` 移动到 `$MMPOSE/data/cofw/`,确保它们按照以下格式组织: + +```text +mmpose +├── mmpose +├── docs +├── tests +├── tools +├── configs +`── data + │── cofw + |── COFW_train_color.mat + |── COFW_test_color.mat +``` + +运行 `pip install h5py` 安装依赖,然后在 `$MMPOSE` 下运行脚本: + +```bash +python tools/dataset_converters/parse_cofw_dataset.py +``` + +最终结果为: + +```text +mmpose +├── mmpose +├── docs +├── tests +├── tools +├── configs +`── data + │── cofw + |── COFW_train_color.mat + |── COFW_test_color.mat + |── annotations + | |── cofw_train.json + | |── cofw_test.json + |── images + |── 000001.jpg + |── 000002.jpg +``` + +## DeepposeKit 数据集 + +
+Desert Locust (Elife'2019) + +```bibtex +@article{graving2019deepposekit, + title={DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning}, + author={Graving, Jacob M and Chae, Daniel and Naik, Hemal and Li, Liang and Koger, Benjamin and Costelloe, Blair R and Couzin, Iain D}, + journal={Elife}, + volume={8}, + pages={e47994}, + year={2019}, + publisher={eLife Sciences Publications Limited} +} +``` + +
+ +对于 [Vinegar Fly](https://github.com/jgraving/DeepPoseKit-Data),[Desert Locust](https://github.com/jgraving/DeepPoseKit-Data), 和 [Grévy’s Zebra](https://github.com/jgraving/DeepPoseKit-Data) 数据集,请从 [DeepPoseKit-Data](https://github.com/jgraving/DeepPoseKit-Data) 下载数据。 + +`tools/dataset_converters/parse_deepposekit_dataset.py` 脚本可以将原始标注转换为 MMPose 支持的格式。我们已经转换好的标注文件可以在这里下载: + +- [vinegar_fly_annotations](https://download.openmmlab.com/mmpose/datasets/vinegar_fly_annotations.tar) +- [locust_annotations](https://download.openmmlab.com/mmpose/datasets/locust_annotations.tar) +- [zebra_annotations](https://download.openmmlab.com/mmpose/datasets/zebra_annotations.tar) + +如果你希望自己转换数据,请按照以下步骤操作: + +1. 下载原始图片和标注,并解压到 `$MMPOSE/data`,将它们按照以下格式组织: + + ```text + mmpose + ├── mmpose + ├── docs + ├── tests + ├── tools + ├── configs + `── data + | + |── DeepPoseKit-Data + | `── datasets + | |── fly + | | |── annotation_data_release.h5 + | | |── skeleton.csv + | | |── ... + | | + | |── locust + | | |── annotation_data_release.h5 + | | |── skeleton.csv + | | |── ... + | | + | `── zebra + | |── annotation_data_release.h5 + | |── skeleton.csv + | |── ... + | + │── fly + `-- images + │-- 0.jpg + │-- 1.jpg + │-- ... + ``` + + 图片也可以在 [vinegar_fly_images](https://download.openmmlab.com/mmpose/datasets/vinegar_fly_images.tar),[locust_images](https://download.openmmlab.com/mmpose/datasets/locust_images.tar) 和[zebra_images](https://download.openmmlab.com/mmpose/datasets/zebra_images.tar) 下载。 + +2. 运行脚本: + + ```bash + python tools/dataset_converters/parse_deepposekit_dataset.py + ``` + + 生成的标注文件将保存在 `$MMPOSE/data/fly/annotations`,`$MMPOSE/data/locust/annotations` 和 `$MMPOSE/data/zebra/annotations` 中。 + +由于官方数据集中没有提供测试集,我们随机选择了 90% 的图片用于训练,剩下的 10% 用于测试。 + +## Macaque 数据集 + +
+MacaquePose (bioRxiv'2020) + +```bibtex +@article{labuguen2020macaquepose, + title={MacaquePose: A novel ‘in the wild’macaque monkey pose dataset for markerless motion capture}, + author={Labuguen, Rollyn and Matsumoto, Jumpei and Negrete, Salvador and Nishimaru, Hiroshi and Nishijo, Hisao and Takada, Masahiko and Go, Yasuhiro and Inoue, Ken-ichi and Shibata, Tomohiro}, + journal={bioRxiv}, + year={2020}, + publisher={Cold Spring Harbor Laboratory} +} +``` + +
+ +对于 [MacaquePose](http://www2.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html) 数据集,请从 [这里](http://www2.ehub.kyoto-u.ac.jp/datasets/macaquepose/index.html) 下载数据。 + +`tools/dataset_converters/parse_macaquepose_dataset.py` 脚本可以将原始标注转换为 MMPose 支持的格式。我们已经转换好的标注文件可以在 [这里](https://download.openmmlab.com/mmpose/datasets/macaque_annotations.tar) 下载。 + +如果你希望自己转换数据,请按照以下步骤操作: + +1. 下载原始图片和标注,并解压到 `$MMPOSE/data`,将它们按照以下格式组织: + + ```text + mmpose + ├── mmpose + ├── docs + ├── tests + ├── tools + ├── configs + `── data + │── macaque + │-- annotations.csv + │-- images + │ │-- 01418849d54b3005.jpg + │ │-- 0142d1d1a6904a70.jpg + │ │-- 01ef2c4c260321b7.jpg + │ │-- 020a1c75c8c85238.jpg + │ │-- 020b1506eef2557d.jpg + │ │-- ... + ``` + +2. 运行脚本: + + ```bash + python tools/dataset_converters/parse_macaquepose_dataset.py + ``` + + 生成的标注文件将保存在 `$MMPOSE/data/macaque/annotations` 中。 + +由于官方数据集中没有提供测试集,我们随机选择了 90% 的图片用于训练,剩下的 10% 用于测试。 + +## Human3.6M 数据集 + +
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, + author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, + title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher = {IEEE Computer Society}, + volume = {36}, + number = {7}, + pages = {1325-1339}, + month = {jul}, + year = {2014} +} +``` + +
+ +对于 [Human3.6M](http://vision.imar.ro/human3.6m/description.php) 数据集,请从官网下载数据,放置到 `$MMPOSE/data/h36m` 下。 + +然后执行 [预处理脚本](/tools/dataset_converters/preprocess_h36m.py)。 + +```bash +python tools/dataset_converters/preprocess_h36m.py --metadata {path to metadata.xml} --original data/h36m +``` + +这将在全帧率(50 FPS)和降频帧率(10 FPS)下提取相机参数和姿势注释。处理后的数据应具有以下结构: + +```text +mmpose +├── mmpose +├── docs +├── tests +├── tools +├── configs +`── data + ├── h36m + ├── annotation_body3d + | ├── cameras.pkl + | ├── fps50 + | | ├── h36m_test.npz + | | ├── h36m_train.npz + | | ├── joint2d_rel_stats.pkl + | | ├── joint2d_stats.pkl + | | ├── joint3d_rel_stats.pkl + | | `── joint3d_stats.pkl + | `── fps10 + | ├── h36m_test.npz + | ├── h36m_train.npz + | ├── joint2d_rel_stats.pkl + | ├── joint2d_stats.pkl + | ├── joint3d_rel_stats.pkl + | `── joint3d_stats.pkl + `── images + ├── S1 + | ├── S1_Directions_1.54138969 + | | ├── S1_Directions_1.54138969_00001.jpg + | | ├── S1_Directions_1.54138969_00002.jpg + | | ├── ... + | ├── ... + ├── S5 + ├── S6 + ├── S7 + ├── S8 + ├── S9 + `── S11 +``` + +然后,标注信息需要转换为 MMPose 支持的 COCO 格式。这可以通过运行以下命令完成: + +```bash +python tools/dataset_converters/h36m_to_coco.py +``` + +## MPII 数据集 + +
+MPII (CVPR'2014) + +```bibtex +@inproceedings{andriluka14cvpr, + author = {Mykhaylo Andriluka and Leonid Pishchulin and Peter Gehler and Schiele, Bernt}, + title = {2D Human Pose Estimation: New Benchmark and State of the Art Analysis}, + booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2014}, + month = {June} +} +``` + +
+ +对于 [MPII](http://human-pose.mpi-inf.mpg.de/) 数据集,请从官网下载数据,放置到 `$MMPOSE/data/mpii` 下。 + +我们提供了一个脚本来将 `.mat` 格式的标注文件转换为 `.json` 格式。这可以通过运行以下命令完成: + +```shell +python tools/dataset_converters/mat2json.py ${PRED_MAT_FILE} ${GT_JSON_FILE} ${OUTPUT_PRED_JSON_FILE} +``` + +例如: + +```shell +python tools/dataset_converters/mat2json.py work_dirs/res50_mpii_256x256/pred.mat data/mpii/annotations/mpii_val.json pred.json +``` diff --git a/docs/zh_cn/guide_to_framework.md b/docs/zh_cn/guide_to_framework.md new file mode 100644 index 0000000000..b1991da7f0 --- /dev/null +++ b/docs/zh_cn/guide_to_framework.md @@ -0,0 +1,652 @@ +# 20 分钟了解 MMPose 架构设计 + +MMPose 1.0 与之前的版本有较大改动,对部分模块进行了重新设计和组织,降低代码冗余度,提升运行效率,降低学习难度。 + +MMPose 1.0 采用了全新的模块结构设计以精简代码,提升运行效率,降低学习难度。对于有一定深度学习基础的用户,本章节提供了对 MMPose 架构设计的总体介绍。不论你是**旧版 MMPose 的用户**,还是**希望直接从 MMPose 1.0 上手的新用户**,都可以通过本教程了解如何构建一个基于 MMPose 1.0 的项目。 + +```{note} +本教程包含了使用 MMPose 1.0 时开发者会关心的内容: + +- 整体代码架构与设计逻辑 + +- 如何用config文件管理模块 + +- 如何使用自定义数据集 + +- 如何添加新的模块(骨干网络、模型头部、损失函数等) +``` + +以下是这篇教程的目录: + +- [20 分钟了解 MMPose 架构设计](#20-分钟了解-mmpose-架构设计) + - [总览](#总览) + - [Step1:配置文件](#step1配置文件) + - [Step2:数据](#step2数据) + - [数据集元信息](#数据集元信息) + - [数据集](#数据集) + - [数据流水线](#数据流水线) + - [i. 数据增强](#i-数据增强) + - [ii. 数据变换](#ii-数据变换) + - [iii. 数据编码](#iii-数据编码) + - [iv. 
数据打包](#iv-数据打包) + - [Step3: 模型](#step3-模型) + - [前处理器(DataPreprocessor)](#前处理器datapreprocessor) + - [主干网络(Backbone)](#主干网络backbone) + - [颈部模块(Neck)](#颈部模块neck) + - [预测头(Head)](#预测头head) + +## 总览 + +![overall-cn](https://user-images.githubusercontent.com/13503330/187830967-f2d7bf40-6261-42f3-91a5-ae045fa0dc0c.png) + +一般来说,开发者在项目开发过程中经常接触的内容主要有**五个**方面: + +- **通用**:环境、钩子(Hook)、模型权重存取(Checkpoint)、日志(Logger)等 + +- **数据**:数据集、数据读取(Dataloader)、数据增强等 + +- **训练**:优化器、学习率调整等 + +- **模型**:主干网络、颈部模块(Neck)、预测头模块(Head)、损失函数等 + +- **评测**:评测指标(Metric)、评测器(Evaluator)等 + +其中**通用**、**训练**和**评测**相关的模块往往由训练框架提供,开发者只需要调用和调整参数,不需要自行实现,开发者主要实现的是**数据**和**模型**部分。 + +## Step1:配置文件 + +在 MMPose 中,我们通常使用 python 格式的配置文件,用于整个项目的定义、参数管理,因此我们强烈建议第一次接触 MMPose 的开发者,查阅 [配置文件](./user_guides/configs.md) 学习配置文件的定义。 + +需要注意的是,所有新增的模块都需要使用注册器(Registry)进行注册,并在对应目录的 `__init__.py` 中进行 `import`,以便能够使用配置文件构建其实例。 + +## Step2:数据 + +MMPose 数据的组织主要包含三个方面: + +- 数据集元信息 + +- 数据集 + +- 数据流水线 + +### 数据集元信息 + +元信息指具体标注之外的数据集信息。姿态估计数据集的元信息通常包括:关键点和骨骼连接的定义、对称性、关键点性质(如关键点权重、标注标准差、所属上下半身)等。这些信息在数据处理、模型训练和测试中有重要作用。在 MMPose 中,数据集的元信息使用 python 格式的配置文件保存,位于 `$MMPOSE/configs/_base_/datasets` 目录下。 + +在 MMPose 中使用自定义数据集时,你需要增加对应的元信息配置文件。以 MPII 数据集(`$MMPOSE/configs/_base_/datasets/mpii.py`)为例: + +```Python +dataset_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + ## 内容省略 + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + ## 内容省略 + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # 
使用 COCO 数据集中提供的 sigmas 值 + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) +``` + +在模型配置文件中,你需要为自定义数据集指定对应的元信息配置文件。假如该元信息配置文件路径为 `$MMPOSE/configs/_base_/datasets/custom.py`,指定方式如下: + +```python +# dataset and dataloader settings +dataset_type = 'MyCustomDataset' # or 'CocoDataset' +train_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/train/data', + ann_file='path/to/your/train/json', + data_prefix=dict(img='path/to/your/train/img'), + # 指定对应的元信息配置文件 + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) +val_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/val/data', + ann_file='path/to/your/val/json', + data_prefix=dict(img='path/to/your/val/img'), + # 指定对应的元信息配置文件 + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) +test_dataloader = val_dataloader +``` + +### 数据集 + +在 MMPose 中使用自定义数据集时,我们推荐将数据转化为已支持的格式(如 COCO 或 MPII),并直接使用我们提供的对应数据集实现。如果这种方式不可行,则用户需要实现自己的数据集类。 + +MMPose 中的大部分 2D 关键点数据集**以 COCO 形式组织**,为此我们提供了基类 [BaseCocoStyleDataset](/mmpose/datasets/datasets/base/base_coco_style_dataset.py)。我们推荐用户继承该基类,并按需重写它的方法(通常是 `__init__()` 和 `_load_annotations()` 方法),以扩展到新的 2D 关键点数据集。 + +```{note} +关于COCO数据格式的详细说明请参考 [COCO](./dataset_zoo/2d_body_keypoint.md) 。 +``` + +```{note} +在 MMPose 中 bbox 的数据格式采用 `xyxy`,而不是 `xywh`,这与 [MMDetection](https://github.com/open-mmlab/mmdetection) 等其他 OpenMMLab 成员保持一致。为了实现不同 bbox 格式之间的转换,我们提供了丰富的函数:`bbox_xyxy2xywh`、`bbox_xywh2xyxy`、`bbox_xyxy2cs`等。这些函数定义在`$MMPOSE/mmpose/structures/bbox/transforms.py`。 +``` + +下面我们以MPII数据集的实现(`$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py`)为例: + +```Python +@DATASETS.register_module() +class MpiiDataset(BaseCocoStyleDataset): + METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii.py') + + def __init__(self, + ## 内容省略 + headbox_file: Optional[str] = None, + ## 内容省略): + + 
if headbox_file: + if data_mode != 'topdown': + raise ValueError( + f'{self.__class__.__name__} is set to {data_mode}: ' + 'mode, while "headbox_file" is only ' + 'supported in topdown mode.') + + if not test_mode: + raise ValueError( + f'{self.__class__.__name__} has `test_mode==False` ' + 'while "headbox_file" is only ' + 'supported when `test_mode==True`.') + + headbox_file_type = headbox_file[-3:] + allow_headbox_file_type = ['mat'] + if headbox_file_type not in allow_headbox_file_type: + raise KeyError( + f'The head boxes file type {headbox_file_type} is not ' + f'supported. Should be `mat` but got {headbox_file_type}.') + self.headbox_file = headbox_file + + super().__init__( + ## 内容省略 + ) + + def _load_annotations(self) -> List[dict]: + """Load data from annotations in MPII format.""" + check_file_exist(self.ann_file) + with open(self.ann_file) as anno_file: + anns = json.load(anno_file) + + if self.headbox_file: + check_file_exist(self.headbox_file) + headbox_dict = loadmat(self.headbox_file) + headboxes_src = np.transpose(headbox_dict['headboxes_src'], + [2, 0, 1]) + SC_BIAS = 0.6 + + data_list = [] + ann_id = 0 + + # mpii bbox scales are normalized with factor 200. + pixel_std = 200. + + for idx, ann in enumerate(anns): + center = np.array(ann['center'], dtype=np.float32) + scale = np.array([ann['scale'], ann['scale']], + dtype=np.float32) * pixel_std + + # Adjust center/scale slightly to avoid cropping limbs + if center[0] != -1: + center[1] = center[1] + 15. 
/ pixel_std * scale[1] + + # MPII uses matlab format, index is 1-based, + # we should first convert to 0-based index + center = center - 1 + + # unify shape with coco datasets + center = center.reshape(1, -1) + scale = scale.reshape(1, -1) + bbox = bbox_cs2xyxy(center, scale) + + # load keypoints in shape [1, K, 2] and keypoints_visible in [1, K] + keypoints = np.array(ann['joints']).reshape(1, -1, 2) + keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) + + data_info = { + 'id': ann_id, + 'img_id': int(ann['image'].split('.')[0]), + 'img_path': osp.join(self.data_prefix['img'], ann['image']), + 'bbox_center': center, + 'bbox_scale': scale, + 'bbox': bbox, + 'bbox_score': np.ones(1, dtype=np.float32), + 'keypoints': keypoints, + 'keypoints_visible': keypoints_visible, + } + + if self.headbox_file: + # calculate the diagonal length of head box as norm_factor + headbox = headboxes_src[idx] + head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) + head_size *= SC_BIAS + data_info['head_size'] = head_size.reshape(1, -1) + + data_list.append(data_info) + ann_id = ann_id + 1 + + return data_list +``` + +在对MPII数据集进行支持时,由于MPII需要读入 `head_size` 信息来计算 `PCKh`,因此我们在`__init__()`中增加了 `headbox_file`,并重载了 `_load_annotations()` 来完成数据组织。 + +如果自定义数据集无法被 `BaseCocoStyleDataset` 支持,你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 + +### 数据流水线 + +一个典型的数据流水线配置如下: + +```Python +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +test_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + 
dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] +``` + +在关键点检测任务中,数据一般会在三个尺度空间中变换: + +- **原始图片空间**:图片存储时的原始空间,不同图片的尺寸不一定相同 + +- **输入图片空间**:模型输入的图片尺度空间,所有**图片**和**标注**被缩放到输入尺度,如 `256x256`,`256x192` 等 + +- **输出尺度空间**:模型输出和训练监督信息所在的尺度空间,如`64x64(热力图)`,`1x1(回归坐标值)`等 + +数据在三个空间中变换的流程如图所示: + +![migration-cn](https://user-images.githubusercontent.com/13503330/187831574-13804daf-f498-47c2-ba43-64b8e6ffe3dd.png) + +在MMPose中,数据变换所需要的模块在`$MMPOSE/mmpose/datasets/transforms`目录下,它们的工作流程如图所示: + +![transforms-cn](https://user-images.githubusercontent.com/13503330/187831611-8db89e20-95c7-42bc-8b0d-700fadf60328.png) + +#### i. 数据增强 + +数据增强中常用的变换存放在 `$MMPOSE/mmpose/datasets/transforms/common_transforms.py` 中,如 `RandomFlip`、`RandomHalfBody` 等。 + +对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 `RandomBBoxTransform`来实现;对于 bottom-up 方法,这些则是由 `BottomupRandomAffine` 实现。 + +```{note} +值得注意的是,大部分数据变换都依赖于 `bbox_center` 和 `bbox_scale`,它们可以通过 `GetBBoxCenterScale` 来得到。 +``` + +#### ii. 数据变换 + +我们使用仿射变换,将图像和坐标标注从原始图片空间变换到输入图片空间。这一操作在 top-down 方法中由 `TopdownAffine` 完成,在 bottom-up 方法中则由 `BottomupRandomAffine` 完成。 + +#### iii. 数据编码 + +在模型训练时,数据从原始空间变换到输入图片空间后,需要使用 `GenerateTarget` 来生成训练所需的监督目标(比如用坐标值生成高斯热图),我们将这一过程称为编码(Encode),反之,通过高斯热图得到对应坐标值的过程称为解码(Decode)。 + +在 MMPose 中,我们将编码和解码过程集合成一个编解码器(Codec),在其中实现 `encode()` 和 `decode()`。 + +目前 MMPose 支持生成以下类型的监督目标: + +- `heatmap`: 高斯热图 + +- `keypoint_label`: 关键点标签(如归一化的坐标值) + +- `keypoint_xy_label`: 单个坐标轴关键点标签 + +- `heatmap+keypoint_label`: 同时生成高斯热图和关键点标签 + +- `multiscale_heatmap`: 多尺度高斯热图 + +生成的监督目标会按以下关键字进行封装: + +- `heatmaps`:高斯热图 + +- `keypoint_labels`:关键点标签(如归一化的坐标值) + +- `keypoint_x_labels`:x 轴关键点标签 + +- `keypoint_y_labels`:y 轴关键点标签 + +- `keypoint_weights`:关键点权重 + +```Python +@TRANSFORMS.register_module() +class GenerateTarget(BaseTransform): + """Encode keypoints into Target. 
+ + Added Keys (depends on the args): + - heatmaps + - keypoint_labels + - keypoint_x_labels + - keypoint_y_labels + - keypoint_weights + """ +``` + +值得注意的是,我们对 top-down 和 bottom-up 的数据格式进行了统一,这意味着标注信息中会新增一个维度来代表同一张图里的不同目标(如人),格式为: + +```Python +[batch_size, num_instances, num_keypoints, dim_coordinates] +``` + +- top-down:`[B, 1, K, D]` + +- Bottom-up: `[B, N, K, D]` + +当前已经支持的编解码器定义在 `$MMPOSE/mmpose/codecs` 目录下,如果你需要自定义新的编解码器,可以前往[编解码器](./user_guides/codecs.md)了解更多详情。 + +#### iv. 数据打包 + +数据经过前处理变换后,最终需要通过 `PackPoseInputs` 打包成数据样本。该操作定义在 `$MMPOSE/mmpose/datasets/transforms/formatting.py` 中。 + +打包过程会将数据流水线中用字典 `results` 存储的数据转换成 MMPose 所需的标准数据结构,如 `InstanceData`,`PixelData`,`PoseDataSample` 等。 + +具体而言,我们将数据样本内容分为 `gt`(标注真值) 和 `pred`(模型预测)两部分,它们都包含以下数据项: + +- **instances**(numpy.array):实例级别的原始标注或预测结果,属于原始尺度空间 + +- **instance_labels**(torch.tensor):实例级别的训练标签(如归一化的坐标值、关键点可见性),属于输出尺度空间 + +- **fields**(torch.tensor):像素级别的训练标签(如高斯热图)或预测结果,属于输出尺度空间 + +下面是 `PoseDataSample` 底层实现的例子: + +```Python +def get_pose_data_sample(self): + # meta + pose_meta = dict( + img_shape=(600, 900), # [h, w] + crop_size=(256, 192), # [h, w] + heatmap_size=(64, 48), # [h, w] + ) + + # gt_instances + gt_instances = InstanceData() + gt_instances.bboxes = np.random.rand(1, 4) + gt_instances.keypoints = np.random.rand(1, 17, 2) + + # gt_instance_labels + gt_instance_labels = InstanceData() + gt_instance_labels.keypoint_labels = torch.rand(1, 17, 2) + gt_instance_labels.keypoint_weights = torch.rand(1, 17) + + # pred_instances + pred_instances = InstanceData() + pred_instances.keypoints = np.random.rand(1, 17, 2) + pred_instances.keypoint_scores = np.random.rand(1, 17) + + # gt_fields + gt_fields = PixelData() + gt_fields.heatmaps = torch.rand(17, 64, 48) + + # pred_fields + pred_fields = PixelData() + pred_fields.heatmaps = torch.rand(17, 64, 48) + data_sample = PoseDataSample( + gt_instances=gt_instances, + pred_instances=pred_instances, + gt_fields=gt_fields, + pred_fields=pred_fields, + 
metainfo=pose_meta) + + return data_sample +``` + +## Step3: 模型 + +在 MMPose 1.0 中,模型由以下几部分构成: + +- **前处理器(DataPreprocessor)**:完成图像归一化和通道转换等前处理 + +- **主干网络 (Backbone)**:用于特征提取 + +- **颈部模块(Neck)**:GAP,FPN 等可选项 + +- **预测头(Head)**:用于实现核心算法功能和损失函数定义 + +我们在 `$MMPOSE/mmpose/models/pose_estimators/base.py` 下为姿态估计模型定义了一个基类 `BasePoseEstimator`,所有的模型(如 `TopdownPoseEstimator`)都需要继承这个基类,并重载对应的方法。 + +在模型的 `forward()` 方法中提供了三种不同的模式: + +- `mode == 'loss'`:返回损失函数计算的结果,用于模型训练 + +- `mode == 'predict'`:返回输入尺度下的预测结果,用于模型推理 + +- `mode == 'tensor'`:返回输出尺度下的模型输出,即只进行模型前向传播,用于模型导出 + +开发者需要在 `PoseEstimator` 中按照模型结构调用对应的 `Registry` ,对模块进行实例化。以 top-down 模型为例: + +```Python +@MODELS.register_module() +class TopdownPoseEstimator(BasePoseEstimator): + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + self.backbone = MODELS.build(backbone) + + if neck is not None: + self.neck = MODELS.build(neck) + + if head is not None: + self.head = MODELS.build(head) +``` + +### 前处理器(DataPreprocessor) + +从 MMPose 1.0 开始,我们在模型中添加了新的前处理器模块,用以完成图像归一化、通道顺序变换等操作。这样做的好处是可以利用 GPU 等设备的计算能力加快计算,并使模型在导出和部署时更具完整性。 + +在配置文件中,一个常见的 `data_preprocessor` 如下: + +```Python +data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), +``` + +它会将输入图片的通道顺序从 `bgr` 转换为 `rgb`,并根据 `mean` 和 `std` 进行数据归一化。 + +### 主干网络(Backbone) + +MMPose 实现的主干网络存放在 `$MMPOSE/mmpose/models/backbones` 目录下。 + +在实际开发中,开发者经常会使用预训练的网络权重进行迁移学习,这能有效提升模型在小数据集上的性能。 在 MMPose 中,只需要在配置文件 `backbone` 的 `init_cfg` 中设置: + +```Python +init_cfg=dict( + type='Pretrained', + checkpoint='PATH/TO/YOUR_MODEL_WEIGHTS.pth'), +``` + +如果你想只加载一个训练好的 checkpoint 的 backbone 部分,你需要指明一下前缀 `prefix`: + +```Python +init_cfg=dict( + type='Pretrained', + prefix='backbone.', 
+ checkpoint='PATH/TO/YOUR_CHECKPOINT.pth'), +``` + +其中 `checkpoint` 既可以是本地路径,也可以是下载链接。因此,如果你想使用 Torchvision 提供的预训练模型(比如ResNet50),可以使用: + +```Python +init_cfg=dict( + type='Pretrained', + checkpoint='torchvision://resnet50') +``` + +除了这些常用的主干网络以外,你还可以从 MMClassification 等其他 OpenMMLab 项目中方便地迁移主干网络,它们都遵循同一套配置文件格式,并提供了预训练权重可供使用。 + +需要强调的是,如果你加入了新的主干网络,需要在模型定义时进行注册: + +```Python +@MODELS.register_module() +class YourBackbone(BaseBackbone): +``` + +同时在 `$MMPOSE/mmpose/models/backbones/__init__.py` 下进行 `import`,并加入到 `__all__` 中,才能被配置文件正确地调用。 + +### 颈部模块(Neck) + +颈部模块通常是介于主干网络和预测头之间的模块,在部分模型算法中会用到,常见的颈部模块有: + +- Global Average Pooling (GAP) + +- Feature Pyramid Networks (FPN) + +### 预测头(Head) + +通常来说,预测头是模型算法实现的核心,用于控制模型的输出,并进行损失函数计算。 + +MMPose 中 Head 相关的模块定义在 `$MMPOSE/mmpose/models/heads` 目录下,开发者在自定义预测头时需要继承我们提供的基类 `BaseHead`,并重载以下三个方法对应模型推理的三种模式: + +- forward() + +- predict() + +- loss() + +具体而言,`predict()` 返回的应是输入图片尺度下的结果,因此需要调用 `self.decode()` 对网络输出进行解码,这一过程在 `BaseHead` 中已经实现,它会调用编解码器提供的 `decode()` 方法来完成解码。 + +另一方面,我们会在 `predict()` 中进行测试时增强。在进行预测时,一个常见的测试时增强技巧是进行翻转集成。即,将一张图片先进行一次推理,再将图片水平翻转进行一次推理,推理的结果再次水平翻转回去,对两次推理的结果进行平均。这个技巧能有效提升模型的预测稳定性。 + +下面是在 `RegressionHead` 中定义 `predict()` 的例子: + +```Python +def predict(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from outputs.""" + + if test_cfg.get('flip_test', False): + # TTA: flip test -> feats = [orig, flipped] + assert isinstance(feats, list) and len(feats) == 2 + flip_indices = batch_data_samples[0].metainfo['flip_indices'] + input_size = batch_data_samples[0].metainfo['input_size'] + _feats, _feats_flip = feats + _batch_coords = self.forward(_feats) + _batch_coords_flip = flip_coordinates( + self.forward(_feats_flip), + flip_indices=flip_indices, + shift_coords=test_cfg.get('shift_coords', True), + input_size=input_size) + batch_coords = (_batch_coords + _batch_coords_flip) * 0.5 + else: + batch_coords = self.forward(feats) # (B, 
K, D) + + batch_coords.unsqueeze_(dim=1) # (B, N, K, D) + preds = self.decode(batch_coords) +``` + +`loss()`除了进行损失函数的计算,还会进行 accuracy 等训练时指标的计算,并通过一个字典 `losses` 来传递: + +```Python + # calculate accuracy +_, avg_acc, _ = keypoint_pck_accuracy( + pred=to_numpy(pred_coords), + gt=to_numpy(keypoint_labels), + mask=to_numpy(keypoint_weights) > 0, + thr=0.05, + norm_factor=np.ones((pred_coords.size(0), 2), dtype=np.float32)) + +acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) +losses.update(acc_pose=acc_pose) +``` + +每个 batch 的数据都打包成了 `batch_data_samples`。以 Regression-based 方法为例,训练所需的归一化的坐标值和关键点权重可以用如下方式获取: + +```Python +keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) +keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples +]) +``` + +以下为 `RegressionHead` 中完整的 `loss()` 实现: + +```Python +def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, keypoint_labels, + keypoint_weights.unsqueeze(-1)) + + if isinstance(loss, dict): + losses.update(loss) + else: + losses.update(loss_kpt=loss) + + # calculate accuracy + _, avg_acc, _ = keypoint_pck_accuracy( + pred=to_numpy(pred_outputs), + gt=to_numpy(keypoint_labels), + mask=to_numpy(keypoint_weights) > 0, + thr=0.05, + norm_factor=np.ones((pred_outputs.size(0), 2), dtype=np.float32)) + acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) + losses.update(acc_pose=acc_pose) + + return losses +``` diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 
f0652ce364..957a6db81e 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -12,6 +12,7 @@ You can change the documentation language at the lower-left corner of the page. overview.md installation.md quick_run.md + guide_to_framework.md .. toctree:: :maxdepth: 1 @@ -23,14 +24,14 @@ You can change the documentation language at the lower-left corner of the page. user_guides/inference.md user_guides/train_and_test.md user_guides/visualization.md - user_guides/useful_tools.md - + user_guides/how_to.md .. toctree:: :maxdepth: 1 :caption: 进阶教程 - advanced_guides.md + advanced_guides/advanced_training.md + advanced_guides/mixed_datasets.md .. toctree:: :maxdepth: 1 @@ -71,6 +72,7 @@ You can change the documentation language at the lower-left corner of the page. dataset_zoo/2d_animal_keypoint.md dataset_zoo/3d_body_keypoint.md dataset_zoo/3d_hand_keypoint.md + dataset_zoo/dataset_tools.md .. toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/installation.md b/docs/zh_cn/installation.md index 65e6bcd0bf..1a14a10746 100644 --- a/docs/zh_cn/installation.md +++ b/docs/zh_cn/installation.md @@ -18,7 +18,7 @@ 在本节中,我们将演示如何准备 PyTorch 相关的依赖环境。 -MMPose 适用于 Linux、Windows 和 macOS。它需要 Python 3.6+、CUDA 9.2+ 和 PyTorch 1.6+。 +MMPose 适用于 Linux、Windows 和 macOS。它需要 Python 3.7+、CUDA 9.2+ 和 PyTorch 1.6+。 如果您对配置 PyTorch 环境已经很熟悉,并且已经完成了配置,可以直接进入下一节:[安装](#安装-mmpose)。否则,请依照以下步骤完成配置。 @@ -93,7 +93,7 @@ pip install -v -e . 
直接使用 mim 安装即可。 ```shell -mim install "mmpose>=1.0.0b0" +mim install "mmpose>=1.0.0rc0" ``` ### 验证安装 @@ -219,7 +219,7 @@ MMPose 可以仅在 CPU 环境中安装,在 CPU 模式下,您可以完成训 ```python import mmpose print(mmpose.__version__) -# 预期输出: 1.0.0b0 +# 预期输出: 1.0.0rc0 ``` ```{note} diff --git a/docs/zh_cn/migration.md b/docs/zh_cn/migration.md index 188c5a4af5..9a591dfcc9 100644 --- a/docs/zh_cn/migration.md +++ b/docs/zh_cn/migration.md @@ -1,632 +1,10 @@ -# 迁移指南 - -重构之后的 MMPose 1.0 与之前的版本有较大改动,对部分模块进行了重新设计和组织,降低代码冗余度,提升运行效率,降低学习难度。 - -对于有一定基础的开发者,本章节提供了一份迁移指南。不论你是**旧版 MMPose 的用户**,还是**希望将自己的 PyTorch 项目迁移到 MMPose 的新用户**,都可以通过本教程了解如何构建一个基于 MMPose 1.0 的项目。 - -```{note} -本教程包含了使用 MMPose 1.0 时开发者会关心的内容: - -- 整体代码架构与设计逻辑 - -- 如何用config文件管理模块 - -- 如何使用自定义数据集 - -- 如何添加新的模块(骨干网络、模型头部、损失函数等) -``` - -以下是这篇教程的目录: - -- [迁移指南](#迁移指南) - - [整体架构与设计](#整体架构与设计) - - [Step1:配置文件](#step1配置文件) - - [Step2:数据](#step2数据) - - [数据集元信息](#数据集元信息) - - [数据集](#数据集) - - [数据流水线](#数据流水线) - - [i. 数据增强](#i-数据增强) - - [ii. 数据变换](#ii-数据变换) - - [iii. 数据编码](#iii-数据编码) - - [iv. 
数据打包](#iv-数据打包) - - [Step3: 模型](#step3-模型) - - [前处理器(DataPreprocessor)](#前处理器datapreprocessor) - - [主干网络(Backbone)](#主干网络backbone) - - [颈部模块(Neck)](#颈部模块neck) - - [预测头(Head)](#预测头head) - - [MMPose 0.X 兼容性说明](#mmpose-0x-兼容性说明) - - [数据变换](#数据变换) - - [平移、旋转和缩放](#平移旋转和缩放) - - [标签生成](#标签生成) - - [数据归一化](#数据归一化) - - [模型兼容](#模型兼容) - - [Heatmap-based 方法](#heatmap-based-方法) - - [RLE-based 方法](#rle-based-方法) - -## 整体架构与设计 - -![overall-cn](https://user-images.githubusercontent.com/13503330/187830967-f2d7bf40-6261-42f3-91a5-ae045fa0dc0c.png) - -一般来说,开发者在项目开发过程中经常接触内容的主要有**五个**方面: - -- **通用**:环境、钩子(Hook)、模型权重存取(Checkpoint)、日志(Logger)等 - -- **数据**:数据集、数据读取(Dataloader)、数据增强等 - -- **训练**:优化器、学习率调整等 - -- **模型**:主干网络、颈部模块(Neck)、预测头模块(Head)、损失函数等 - -- **评测**:评测指标(Metric)、评测器(Evaluator)等 - -其中**通用**、**训练**和**评测**相关的模块往往由训练框架提供,开发者只需要调用和调整参数,不需要自行实现,开发者主要实现的是**数据**和**模型**部分。 - -## Step1:配置文件 - -在MMPose中,我们通常 python 格式的配置文件,用于整个项目的定义、参数管理,因此我们强烈建议第一次接触 MMPose 的开发者,查阅 [配置文件](./user_guides/configs.md) 学习配置文件的定义。 - -需要注意的是,所有新增的模块都需要使用注册器(Registry)进行注册,并在对应目录的 `__init__.py` 中进行 `import`,以便能够使用配置文件构建其实例。 - -## Step2:数据 - -MMPose 数据的组织主要包含三个方面: - -- 数据集元信息 - -- 数据集 - -- 数据流水线 - -### 数据集元信息 - -元信息指具体标注之外的数据集信息。姿态估计数据集的元信息通常包括:关键点和骨骼连接的定义、对称性、关键点性质(如关键点权重、标注标准差、所属上下半身)等。这些信息在数据在数据处理、模型训练和测试中有重要作用。在 MMPose 中,数据集的元信息使用 python 格式的配置文件保存,位于 `$MMPOSE/configs/_base_/datasets` 目录下。 - -在 MMPose 中使用自定义数据集时,你需要增加对应的元信息配置文件。以 MPII 数据集(`$MMPOSE/configs/_base_/datasets/mpii.py`)为例: - -```Python -dataset_info = dict( - dataset_name='mpii', - paper_info=dict( - author='Mykhaylo Andriluka and Leonid Pishchulin and ' - 'Peter Gehler and Schiele, Bernt', - title='2D Human Pose Estimation: New Benchmark and ' - 'State of the Art Analysis', - container='IEEE Conference on Computer Vision and ' - 'Pattern Recognition (CVPR)', - year='2014', - homepage='http://human-pose.mpi-inf.mpg.de/', - ), - keypoint_info={ - 0: - dict( - name='right_ankle', - id=0, - color=[255, 128, 0], - type='lower', - swap='left_ankle'), - 
## 内容省略 - }, - skeleton_info={ - 0: - dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), - ## 内容省略 - }, - joint_weights=[ - 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 - ], - # 使用 COCO 数据集中提供的 sigmas 值 - sigmas=[ - 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, - 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 - ]) -``` - -### 数据集 - -在 MMPose 中使用自定义数据集时,我们推荐将数据转化为已支持的格式(如 COCO 或 MPII),并直接使用我们提供的对应数据集实现。如果这种方式不可行,则用户需要实现自己的数据集类。 - -MMPose 中的大部分 2D 关键点数据集**以 COCO 形式组织**,为此我们提供了基类 [BaseCocoStyleDataset](/mmpose/datasets/datasets/base/base_coco_style_dataset.py)。我们推荐用户继承该基类,并按需重写它的方法(通常是 `__init__()` 和 `_load_annotations()` 方法),以扩展到新的 2D 关键点数据集。 - -```{note} -关于COCO数据格式的详细说明请参考 [COCO](./dataset_zoo/2d_body_keypoint.md) 。 -``` - -```{note} -在 MMPose 中 bbox 的数据格式采用 `xyxy`,而不是 `xywh`,这与 [MMDetection](https://github.com/open-mmlab/mmdetection) 等其他 OpenMMLab 成员保持一致。为了实现不同 bbox 格式之间的转换,我们提供了丰富的函数:`bbox_xyxy2xywh`、`bbox_xywh2xyxy`、`bbox_xyxy2cs`等。这些函数定义在`$MMPOSE/mmpose/structures/bbox/transforms.py`。 -``` - -下面我们以MPII数据集的实现(`$MMPOSE/mmpose/datasets/datasets/body/mpii_dataset.py`)为例: - -```Python -@DATASETS.register_module() -class MpiiDataset(BaseCocoStyleDataset): - METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii.py') - - def __init__(self, - ## 内容省略 - headbox_file: Optional[str] = None, - ## 内容省略): - - if headbox_file: - if data_mode != 'topdown': - raise ValueError( - f'{self.__class__.__name__} is set to {data_mode}: ' - 'mode, while "headbox_file" is only ' - 'supported in topdown mode.') - - if not test_mode: - raise ValueError( - f'{self.__class__.__name__} has `test_mode==False` ' - 'while "headbox_file" is only ' - 'supported when `test_mode==True`.') - - headbox_file_type = headbox_file[-3:] - allow_headbox_file_type = ['mat'] - if headbox_file_type not in allow_headbox_file_type: - raise KeyError( - f'The head boxes file type {headbox_file_type} is not ' - f'supported. 
Should be `mat` but got {headbox_file_type}.') - self.headbox_file = headbox_file - - super().__init__( - ## 内容省略 - ) - - def _load_annotations(self) -> List[dict]: - """Load data from annotations in MPII format.""" - check_file_exist(self.ann_file) - with open(self.ann_file) as anno_file: - anns = json.load(anno_file) - - if self.headbox_file: - check_file_exist(self.headbox_file) - headbox_dict = loadmat(self.headbox_file) - headboxes_src = np.transpose(headbox_dict['headboxes_src'], - [2, 0, 1]) - SC_BIAS = 0.6 - - data_list = [] - ann_id = 0 - - # mpii bbox scales are normalized with factor 200. - pixel_std = 200. - - for idx, ann in enumerate(anns): - center = np.array(ann['center'], dtype=np.float32) - scale = np.array([ann['scale'], ann['scale']], - dtype=np.float32) * pixel_std - - # Adjust center/scale slightly to avoid cropping limbs - if center[0] != -1: - center[1] = center[1] + 15. / pixel_std * scale[1] - - # MPII uses matlab format, index is 1-based, - # we should first convert to 0-based index - center = center - 1 - - # unify shape with coco datasets - center = center.reshape(1, -1) - scale = scale.reshape(1, -1) - bbox = bbox_cs2xyxy(center, scale) - - # load keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - keypoints = np.array(ann['joints']).reshape(1, -1, 2) - keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) - - data_info = { - 'id': ann_id, - 'img_id': int(ann['image'].split('.')[0]), - 'img_path': osp.join(self.data_prefix['img'], ann['image']), - 'bbox_center': center, - 'bbox_scale': scale, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - } - - if self.headbox_file: - # calculate the diagonal length of head box as norm_factor - headbox = headboxes_src[idx] - head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) - head_size *= SC_BIAS - data_info['head_size'] = head_size.reshape(1, -1) - - data_list.append(data_info) - ann_id = 
ann_id + 1 - - return data_list -``` - -在对MPII数据集进行支持时,由于MPII需要读入 `head_size` 信息来计算 `PCKh`,因此我们在`__init__()`中增加了 `headbox_file`,并重载了 `_load_annotations()` 来完成数据组织。 - -如果自定义数据集无法被 `BaseCocoStyleDataset` 支持,你需要直接继承 [MMEngine](https://github.com/open-mmlab/mmengine) 中提供的 `BaseDataset` 基类。具体方法请参考相关[文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 - -### 数据流水线 - -一个典型的数据流水线配置如下: - -```Python -# pipelines -train_pipeline = [ - dict(type='LoadImage', file_client_args=file_client_args), - dict(type='GetBBoxCenterScale'), - dict(type='RandomFlip', direction='horizontal'), - dict(type='RandomHalfBody'), - dict(type='RandomBBoxTransform'), - dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='heatmap', encoder=codec), - dict(type='PackPoseInputs') -] -test_pipeline = [ - dict(type='LoadImage', file_client_args=file_client_args), - dict(type='GetBBoxCenterScale'), - dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='PackPoseInputs') -] -``` - -在关键点检测任务中,数据一般会在三个尺度空间中变换: - -- **原始图片空间**:图片存储时的原始空间,不同图片的尺寸不一定相同 - -- **输入图片空间**:模型输入的图片尺度空间,所有**图片**和**标注**被缩放到输入尺度,如 `256x256`,`256x192` 等 - -- **输出尺度空间**:模型输出和训练监督信息所在的尺度空间,如`64x64(热力图)`,`1x1(回归坐标值)`等 - -数据在三个空间中变换的流程如图所示: - -![migration-cn](https://user-images.githubusercontent.com/13503330/187831574-13804daf-f498-47c2-ba43-64b8e6ffe3dd.png) - -在MMPose中,数据变换所需要的模块在`$MMPOSE/mmpose/datasets/transforms`目录下,它们的工作流程如图所示: - -![transforms-cn](https://user-images.githubusercontent.com/13503330/187831611-8db89e20-95c7-42bc-8b0d-700fadf60328.png) - -#### i. 数据增强 - -数据增强中常用的变换存放在 `$MMPOSE/mmpose/datasets/transforms/common_transforms.py` 中,如 `RandomFlip`、`RandomHalfBody` 等。 - -对于 top-down 方法,`Shift`、`Rotate`、`Resize` 操作由 `RandomBBoxTransform`来实现;对于 bottom-up 方法,这些则是由 `BottomupRandomAffine` 实现。 - -```{note} -值得注意的是,大部分数据变换都依赖于 `bbox_center` 和 `bbox_scale`,它们可以通过 `GetBBoxCenterScale` 来得到。 -``` - -#### ii. 
数据变换 - -我们使用仿射变换,将图像和坐标标注从原始图片空间变换到输入图片空间。这一操作在 top-down 方法中由 `TopdownAffine` 完成,在 bottom-up 方法中则由 `BottomupRandomAffine` 完成。 - -#### iii. 数据编码 - -在模型训练时,数据从原始空间变换到输入图片空间后,需要使用 `GenerateTarget` 来生成训练所需的监督目标(比如用坐标值生成高斯热图),我们将这一过程称为编码(Encode),反之,通过高斯热图得到对应坐标值的过程称为解码(Decode)。 - -在 MMPose 中,我们将编码和解码过程集合成一个编解码器(Codec),在其中实现 `encode()` 和 `decode()`。 - -目前 MMPose 支持生成以下类型的监督目标: - -- `heatmap`: 高斯热图 - -- `keypoint_label`: 关键点标签(如归一化的坐标值) - -- `keypoint_xy_label`: 单个坐标轴关键点标签 - -- `heatmap+keypoint_label`: 同时生成高斯热图和关键点标签 - -- `multiscale_heatmap`: 多尺度高斯热图 - -生成的监督目标会按以下关键字进行封装: - -- `heatmaps`:高斯热图 - -- `keypoint_labels`:关键点标签(如归一化的坐标值) - -- `keypoint_x_labels`:x 轴关键点标签 - -- `keypoint_y_labels`:y 轴关键点标签 - -- `keypoint_weights`:关键点权重 - -```Python -@TRANSFORMS.register_module() -class GenerateTarget(BaseTransform): - """Encode keypoints into Target. - - Added Keys (depends on the args): - - heatmaps - - keypoint_labels - - keypoint_x_labels - - keypoint_y_labels - - keypoint_weights - """ -``` - -值得注意的是,我们对 top-down 和 bottom-up 的数据格式进行了统一,这意味着标注信息中会新增一个维度来代表同一张图里的不同目标(如人),格式为: - -```Python -[batch_size, num_instances, num_keypoints, dim_coordinates] -``` - -- top-down:`[B, 1, K, D]` - -- Bottom-up: `[B, N, K, D]` - -当前已经支持的编解码器定义在 `$MMPOSE/mmpose/codecs` 目录下,如果你需要自定新的编解码器,可以前往[编解码器](./user_guides/codecs.md)了解更多详情。 - -#### iv. 
数据打包 - -数据经过前处理变换后,最终需要通过 `PackPoseInputs` 打包成数据样本。该操作定义在 `$MMPOSE/mmpose/datasets/transforms/formatting.py` 中。 - -打包过程会将数据流水线中用字典 `results` 存储的数据转换成用 MMPose 所需的标准数据结构, 如 `InstanceData`,`PixelData`,`PoseDataSample` 等。 - -具体而言,我们将数据样本内容分为 `gt`(标注真值) 和 `pred`(模型预测)两部分,它们都包含以下数据项: - -- **instances**(numpy.array):实例级别的原始标注或预测结果,属于原始尺度空间 - -- **instance_labels**(torch.tensor):实例级别的训练标签(如归一化的坐标值、关键点可见性),属于输出尺度空间 - -- **fields**(torch.tensor):像素级别的训练标签(如高斯热图)或预测结果,属于输出尺度空间 - -下面是 `PoseDataSample` 底层实现的例子: - -```Python -def get_pose_data_sample(self): - # meta - pose_meta = dict( - img_shape=(600, 900), # [h, w, c] - crop_size=(256, 192), # [h, w] - heatmap_size=(64, 48), # [h, w] - ) - - # gt_instances - gt_instances = InstanceData() - gt_instances.bboxes = np.random.rand(1, 4) - gt_instances.keypoints = np.random.rand(1, 17, 2) - - # gt_instance_labels - gt_instance_labels = InstanceData() - gt_instance_labels.keypoint_labels = torch.rand(1, 17, 2) - gt_instance_labels.keypoint_weights = torch.rand(1, 17) - - # pred_instances - pred_instances = InstanceData() - pred_instances.keypoints = np.random.rand(1, 17, 2) - pred_instances.keypoint_scores = np.random.rand(1, 17) - - # gt_fields - gt_fields = PixelData() - gt_fields.heatmaps = torch.rand(17, 64, 48) - - # pred_fields - pred_fields = PixelData() - pred_fields.heatmaps = torch.rand(17, 64, 48) - data_sample = PoseDataSample( - gt_instances=gt_instances, - pred_instances=pred_instances, - gt_fields=gt_fields, - pred_fields=pred_fields, - metainfo=pose_meta) - - return data_sample -``` - -## Step3: 模型 - -在 MMPose 1.0中,模型由以下几部分构成: - -- **预处理器(DataPreprocessor)**:完成图像归一化和通道转换等前处理 - -- **主干网络 (Backbone)**:用于特征提取 - -- **颈部模块(Neck)**:GAP,FPN 等可选项 - -- **预测头(Head)**:用于实现核心算法功能和损失函数定义 - -我们在 `$MMPOSE/models/pose_estimators/base.py` 下为姿态估计模型定义了一个基类 `BasePoseEstimator`,所有的模型(如 `TopdownPoseEstimator`)都需要继承这个基类,并重载对应的方法。 - -在模型的 `forward()` 方法中提供了三种不同的模式: - -- `mode == 'loss'`:返回损失函数计算的结果,用于模型训练 - -- `mode == 
'predict'`:返回输入尺度下的预测结果,用于模型推理 - -- `mode == 'tensor'`:返回输出尺度下的模型输出,即只进行模型前向传播,用于模型导出 - -开发者需要在 `PoseEstimator` 中按照模型结构调用对应的 `Registry` ,对模块进行实例化。以 top-down 模型为例: - -```Python -@MODELS.register_module() -class TopdownPoseEstimator(BasePoseEstimator): - def __init__(self, - backbone: ConfigType, - neck: OptConfigType = None, - head: OptConfigType = None, - train_cfg: OptConfigType = None, - test_cfg: OptConfigType = None, - data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None): - super().__init__(data_preprocessor, init_cfg) - - self.backbone = MODELS.build(backbone) - - if neck is not None: - self.neck = MODELS.build(neck) - - if head is not None: - self.head = MODELS.build(head) -``` - -### 前处理器(DataPreprocessor) - -从 MMPose 1.0 开始,我们在模型中添加了新的前处理器模块,用以完成图像归一化、通道顺序变换等操作。这样做的好处是可以利用 GPU 等设备的计算能力加快计算,并使模型在导出和部署时更具完整性。 - -在配置文件中,一个常见的 `data_preprocessor` 如下: - -```Python -data_preprocessor=dict( - type='PoseDataPreprocessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True), -``` - -它会将输入图片的通道顺序从 `bgr` 转换为 `rgb`,并根据 `mean` 和 `std` 进行数据归一化。 - -### 主干网络(Backbone) - -MMPose 实现的主干网络存放在 `$MMPOSE/mmpose/models/backbones` 目录下。 - -在实际开发中,开发者经常会使用预训练的网络权重进行迁移学习,这能有效提升模型在小数据集上的性能。 在 MMPose 中,只需要在配置文件 `backbone` 的 `init_cfg` 中设置: - -```Python -init_cfg=dict( - type='Pretrained', - checkpoint='PATH/TO/YOUR_MODEL_WEIGHTS.pth'), -``` - -其中 `checkpoint` 既可以是本地路径,也可以是下载链接。因此,如果你想使用 Torchvision 提供的预训练模型(比如ResNet50),可以使用: - -```Python -init_cfg=dict( - type='Pretrained', - checkpoint='torchvision://resnet50') -``` - -除了这些常用的主干网络以外,你还可以从 MMClassification 等其他 OpenMMLab 项目中方便地迁移主干网络,它们都遵循同一套配置文件格式,并提供了预训练权重可供使用。 - -需要强调的是,如果你加入了新的主干网络,需要在模型定义时进行注册: - -```Python -@MODELS.register_module() -class YourBackbone(BaseBackbone): -``` - -同时在 `$MMPOSE/mmpose/models/backbones/__init__.py` 下进行 `import`,并加入到 `__all__` 中,才能被配置文件正确地调用。 - -### 颈部模块(Neck) - -颈部模块通常是介于主干网络和预测头之间的模块,在部分模型算法中会用到,常见的颈部模块有: - -- Global Average Pooling (GAP) - -- Feature 
Pyramid Networks (FPN) - -### 预测头(Head) - -通常来说,预测头是模型算法实现的核心,用于控制模型的输出,并进行损失函数计算。 - -MMPose 中 Head 相关的模块定义在 `$MMPOSE/mmpose/models/heads` 目录下,开发者在自定义预测头时需要继承我们提供的基类 `BaseHead`,并重载以下三个方法对应模型推理的三种模式: - -- forward() - -- predict() - -- loss() - -具体而言,`predict()` 返回的应是输入图片尺度下的结果,因此需要调用 `self.decode()` 对网络输出进行解码,这一过程实现在 `BaseHead` 中已经实现,它会调用编解码器提供的 `decode()` 方法来完成解码。 - -另一方面,我们会在 `predict()` 中进行测试时增强。在进行预测时,一个常见的测试时增强技巧是进行翻转集成。即,将一张图片先进行一次推理,再将图片水平翻转进行一次推理,推理的结果再次水平翻转回去,对两次推理的结果进行平均。这个技巧能有效提升模型的预测稳定性。 - -下面是在 `RegressionHead` 中定义 `predict()` 的例子: - -```Python -def predict(self, - feats: Tuple[Tensor], - batch_data_samples: OptSampleList, - test_cfg: ConfigType = {}) -> Predictions: - """Predict results from outputs.""" - - if test_cfg.get('flip_test', False): - # TTA: flip test -> feats = [orig, flipped] - assert isinstance(feats, list) and len(feats) == 2 - flip_indices = batch_data_samples[0].metainfo['flip_indices'] - input_size = batch_data_samples[0].metainfo['input_size'] - _feats, _feats_flip = feats - _batch_coords = self.forward(_feats) - _batch_coords_flip = flip_coordinates( - self.forward(_feats_flip), - flip_indices=flip_indices, - shift_coords=test_cfg.get('shift_coords', True), - input_size=input_size) - batch_coords = (_batch_coords + _batch_coords_flip) * 0.5 - else: - batch_coords = self.forward(feats) # (B, K, D) - - batch_coords.unsqueeze_(dim=1) # (B, N, K, D) - preds = self.decode(batch_coords) -``` - -`loss()`除了进行损失函数的计算,还会进行 accuracy 等训练时指标的计算,并通过一个字典 `losses` 来传递: - -```Python - # calculate accuracy -_, avg_acc, _ = keypoint_pck_accuracy( - pred=to_numpy(pred_coords), - gt=to_numpy(keypoint_labels), - mask=to_numpy(keypoint_weights) > 0, - thr=0.05, - norm_factor=np.ones((pred_coords.size(0), 2), dtype=np.float32)) - -acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) -losses.update(acc_pose=acc_pose) -``` - -每个 batch 的数据都打包成了 `batch_data_samples`。以 Regression-based 方法为例,训练所需的归一化的坐标值和关键点权重可以用如下方式获取: - -```Python -keypoint_labels = 
torch.cat( - [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) -keypoint_weights = torch.cat([ - d.gt_instance_labels.keypoint_weights for d in batch_data_samples -]) -``` - -以下为 `RegressionHead` 中完整的 `loss()` 实现: - -```Python -def loss(self, - inputs: Tuple[Tensor], - batch_data_samples: OptSampleList, - train_cfg: ConfigType = {}) -> dict: - """Calculate losses from a batch of inputs and data samples.""" - - pred_outputs = self.forward(inputs) - - keypoint_labels = torch.cat( - [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) - keypoint_weights = torch.cat([ - d.gt_instance_labels.keypoint_weights for d in batch_data_samples - ]) - - # calculate losses - losses = dict() - loss = self.loss_module(pred_outputs, keypoint_labels, - keypoint_weights.unsqueeze(-1)) - - if isinstance(loss, dict): - losses.update(loss) - else: - losses.update(loss_kpt=loss) - - # calculate accuracy - _, avg_acc, _ = keypoint_pck_accuracy( - pred=to_numpy(pred_outputs), - gt=to_numpy(keypoint_labels), - mask=to_numpy(keypoint_weights) > 0, - thr=0.05, - norm_factor=np.ones((pred_outputs.size(0), 2), dtype=np.float32)) - acc_pose = torch.tensor(avg_acc, device=keypoint_labels.device) - losses.update(acc_pose=acc_pose) - - return losses -``` - -## MMPose 0.X 兼容性说明 +# MMPose 0.X 兼容性说明 MMPose 1.0 经过了大规模重构并解决了许多遗留问题,对于 0.x 版本的大部分代码 MMPose 1.0 将不兼容。 -### 数据变换 +## 数据变换 -#### 平移、旋转和缩放 +### 平移、旋转和缩放 旧版的数据变换方法 `TopDownRandomShiftBboxCenter` 和 `TopDownGetRandomScaleRotation`,将被合并为 `RandomBBoxTransform`: @@ -675,7 +53,7 @@ class RandomBBoxTransform(BaseTransform): rotate_prob: float = 0.6) -> None: ``` -#### 标签生成 +### 标签生成 旧版用于训练标签生成的方法 `TopDownGenerateTarget` 、`TopDownGenerateTargetRegression`、`BottomUpGenerateHeatmapTarget`、`BottomUpGenerateTarget` 等将被合并为 `GenerateTarget`,而实际的生成方法由[编解码器](./user_guides/codecs.md) 提供: @@ -693,45 +71,38 @@ class GenerateTarget(BaseTransform): - keypoints_visible - dataset_keypoint_weights - Added Keys (depends on the args): - - 
heatmaps - - keypoint_labels - - keypoint_x_labels - - keypoint_y_labels - - keypoint_weights + Added Keys: + + - The keys of the encoded items from the codec will be updated into + the results, e.g. ``'heatmaps'`` or ``'keypoint_weights'``. See + the specific codec for more details. Args: - encoder (dict | list[dict]): The codec config for keypoint encoding - target_type (str): The type of the encoded form of the keypoints. - Should be one of the following options: - - - ``'heatmap'``: The encoded should be instance-irrelevant - heatmaps and will be stored in ``results['heatmaps']`` - - ``'multiscale_heatmap'`` The encoded should be a list of - heatmaps and will be stored in ``results['heatmaps']``. Note - that in this case ``self.encoder`` is also a list, each - encoder for a single scale of heatmaps - - ``'keypoint_label'``: The encoded should be instance-level - labels and will be stored in ``results['keypoint_label']`` - - ``'keypoint_xy_label'``: The encoed should be instance-level - labels in x-axis and y-axis respectively. They will be stored - in ``results['keypoint_x_label']`` and - ``results['keypoint_y_label']`` + encoder (dict | list[dict]): The codec config for keypoint encoding. + Both single encoder and multiple encoders (given as a list) are + supported + multilevel (bool): Determine the method to handle multiple encoders. + If ``multilevel==True``, generate multilevel targets from a group + of encoders of the same type (e.g. multiple :class:`MSRAHeatmap` + encoders with different sigma values); If ``multilevel==False``, + generate combined targets from a group of different encoders. This + argument will have no effect in case of single encoder. Defaults + to ``False`` use_dataset_keypoint_weights (bool): Whether use the keypoint weights from the dataset meta information. 
Defaults to ``False`` """ def __init__(self, encoder: MultiConfig, - target_type: str, + multilevel: bool = False, use_dataset_keypoint_weights: bool = False) -> None: ``` -#### 数据归一化 +### 数据归一化 旧版的数据归一化操作 `NormalizeTensor` 和 `ToTensor` 方法将由 **DataPreprocessor** 模块替代,不再作为流水线的一部分,而是作为模块加入到模型前向传播中。 -### 模型兼容 +## 模型兼容 我们对 model zoo 提供的模型权重进行了兼容性处理,确保相同的模型权重测试精度能够与 0.x 版本保持同等水平,但由于在这两个版本中存在大量处理细节的差异,推理结果可能会产生轻微的不同(精度误差小于 0.05%)。 @@ -748,7 +119,7 @@ def __init__(self): self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) ``` -#### Heatmap-based 方法 +### Heatmap-based 方法 对于基于SimpleBaseline方法的模型,主要需要注意最后一层卷积层的兼容: @@ -799,7 +170,7 @@ def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args, state_dict[prefix + k_new] = v ``` -#### RLE-based 方法 +### RLE-based 方法 对于基于 RLE 的模型,由于新版的 `loss` 模块更名为 `loss_module`,且 flow 模型归属在 `loss` 模块下,因此需要对权重字典中 `loss` 字段进行更改: diff --git a/docs/zh_cn/notes/contribution_guide.md b/docs/zh_cn/notes/contribution_guide.md index 0d83a56a0a..96be7d1723 100644 --- a/docs/zh_cn/notes/contribution_guide.md +++ b/docs/zh_cn/notes/contribution_guide.md @@ -1,24 +1,178 @@ -# 参与贡献代码 +# 如何给 MMPose 贡献代码 -我们欢迎任何形式的贡献,包括但不限于: +欢迎加入 MMPose 社区,我们致力于打造最前沿的计算机视觉基础库,我们欢迎任何形式的贡献,包括但不限于: -- 修复错别字、bug -- 增加文档内容或翻译 -- 添加新功能和组件 +- **修复错误** + 1. 如果提交的代码改动较大,我们鼓励你先开一个 issue 并正确描述现象、原因和复现方式,讨论后确认修复方案。 + 2. 修复错误并补充相应的单元测试,提交 PR 。 +- **新增功能或组件** + 1. 如果新功能或模块涉及较大的代码改动,我们建议先提交 issue,与我们确认功能的必要性。 + 2. 
实现新增功能并添加单元测试,提交 PR 。 +- **文档补充或翻译** + - 如果发现文档有错误或不完善的地方,欢迎直接提交 PR 。 -## 流程 +```{note} +- 如果你希望向 MMPose 1.0 贡献代码,请从 dev-1.x 上创建新分支,并提交 PR 到 dev-1.x 分支上。 +- 如果你是论文作者,并希望将你的方法加入到 MMPose 中,欢迎联系我们,我们将非常感谢你的贡献。 +- 如果你希望尽快将你的项目分享到 MMPose 开源社区,欢迎将 PR 提到 Projects 目录下,该目录下的项目将简化 Review 流程并尽快合入。 +- 如果你希望加入 MMPose 的维护者,欢迎联系我们,我们将邀请你加入 MMPose 的维护者群。 +``` + +## 准备工作 + +PR 操作所使用的命令都是用 Git 去实现的,该章节将介绍如何进行 Git 配置与 GitHub 绑定。 + +### Git 配置 + +首先,你需要在本地安装 Git,然后配置你的 Git 用户名和邮箱: + +```Shell +# 在命令提示符(cmd)或终端(terminal)中输入以下命令,查看 Git 版本 +git --version +``` + +然后,你需要检查自己的 Git Config 是否正确配置,如果 `user.name` 和 `user.email` 为空,你需要配置你的 Git 用户名和邮箱: + +```Shell +# 在命令提示符(cmd)或终端(terminal)中输入以下命令,查看 Git 配置 +git config --global --list +# 设置 Git 用户名和邮箱 +git config --global user.name "这里填入你的用户名" +git config --global user.email "这里填入你的邮箱" +``` + +## PR 流程 + +如果你对 PR 流程不熟悉,接下来将会从零开始,一步一步地教你如何提交 PR。如果你想深入了解 PR 开发模式,可以参考 [GitHub 官方文档](https://docs.github.com/cn/github/collaborating-with-issues-and-pull-requests/about-pull-requests)。 + +### 1. Fork 项目 + +当你第一次提交 PR 时,需要先 Fork 项目到自己的 GitHub 账号下。点击项目右上角的 Fork 按钮,将项目 Fork 到自己的 GitHub 账号下。 + +![](https://user-images.githubusercontent.com/13503330/223318144-a49c6cef-b1fb-45b8-aa2b-0833d0e3fd5c.png) + +接着,你需要将你的 Fork 仓库 Clone 到本地,然后添加官方仓库作为远程仓库: + +```Shell + +# Clone 你的 Fork 仓库到本地 +git clone https://github.com/username/mmpose.git + +# 添加官方仓库作为远程仓库 +cd mmpose +git remote add upstream https://github.com/open-mmlab/mmpose.git +``` + +在终端中输入以下命令,查看远程仓库是否成功添加: + +```Shell +git remote -v +``` + +如果出现以下信息,说明你已经成功添加了远程仓库: -1. Fork MMPose官方代码仓库,并Pull最新的代码 -2. 创建一个新的分支(请不要直接在 master 分支上进行开发) -3. 提交你的改动 -4. 
创建一个PR +```Shell +origin https://github.com/{username}/mmpose.git (fetch) +origin https://github.com/{username}/mmpose.git (push) +upstream https://github.com/open-mmlab/mmpose.git (fetch) +upstream https://github.com/open-mmlab/mmpose.git (push) +``` ```{note} -- 如果你希望向 MMPose 1.0 贡献代码,请从 dev-1.x 上创建新分支,并提交 PR 到 dev-1.x 分支上 -- 如果你打算添加的新功能涉及的改动较大,我们鼓励你先开一个 issue 与我们进行讨论。 -- 如果你是论文作者,并希望将你的方法加入到 MMPose 中,欢迎联系我们,我们将非常感谢你的贡献。 +这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 git clone 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 open-mmlab。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 +``` + +### 2. 配置 pre-commit + +在本地开发环境中,我们使用 pre-commit 来检查代码风格,以确保代码风格的统一。在提交代码前,你需要先安装 pre-commit: + +```Shell +pip install -U pre-commit + +# 在 mmpose 根目录下安装 pre-commit +pre-commit install +``` + +检查 pre-commit 是否配置成功,并安装 `.pre-commit-config.yaml` 中的钩子: + +```Shell +pre-commit run --all-files ``` +![](https://user-images.githubusercontent.com/57566630/202368856-0465a90d-8fce-4345-918e-67b8b9c82614.png) + +```{note} +如果你是中国大陆用户,由于网络原因,可能会出现 pre-commit 安装失败的情况。 + +这时你可以使用清华源来安装 pre-commit: +pip install -U pre-commit -i https://pypi.tuna.tsinghua.edu.cn/simple + +或者使用国内镜像来安装 pre-commit: +pip install -U pre-commit -i https://pypi.mirrors.ustc.edu.cn/simple +``` + +如果安装过程被中断,可以重复执行上述命令,直到安装成功。 + +如果你提交的代码中有不符合规范的地方,pre-commit 会发出警告,并自动修复部分错误。 + +![](https://user-images.githubusercontent.com/57566630/202369176-67642454-0025-4023-a095-263529107aa3.png) + +### 3. 
创建开发分支 + +安装完 pre-commit 之后,我们需要基于 dev-1.x 分支创建一个新的开发分支,建议以 `username/pr_name` 的形式命名,例如: + +```Shell +git checkout -b username/refactor_contributing_doc +``` + +在后续的开发中,如果本地仓库的 dev-1.x 分支落后于官方仓库的 dev-1.x 分支,需要先拉取 upstream 的 dev-1.x 分支,然后 rebase 到本地的开发分支上: + +```Shell +git checkout username/refactor_contributing_doc +git fetch upstream +git rebase upstream/dev-1.x +``` + +在 rebase 时,如果出现冲突,需要手动解决冲突,然后执行 `git add` 命令,再执行 `git rebase --continue` 命令,直到 rebase 完成。 + +### 4. 提交代码并在本地通过单元测试 + +在本地开发完成后,我们需要在本地通过单元测试,然后提交代码。 + +```shell +# 运行单元测试 +pytest tests/ + +# 提交代码 +git add . +git commit -m "commit message" +``` + +### 5. 推送代码到远程仓库 + +在本地开发完成后,我们需要将代码推送到远程仓库。 + +```Shell +git push origin username/refactor_contributing_doc +``` + +### 6. 提交 Pull Request (PR) + +#### (1) 在 GitHub 上创建 PR + +![](https://user-images.githubusercontent.com/13503330/223321382-e6068e18-1d91-4458-8328-b1c7c907b3b2.png) + +#### (2) 在 PR 中根据指引修改描述,添加必要的信息 + +![](https://user-images.githubusercontent.com/13503330/223322447-94ad4b8c-21bf-4ca7-b3d6-0568cace6eee.png) + +```{note} +- 在 PR branch 左侧选择 `dev-1.x` 分支,否则 PR 会被拒绝。 +- 如果你是第一次向 OpenMMLab 提交 PR,需要签署 CLA。 +``` + +![](https://user-images.githubusercontent.com/57566630/167307569-a794b967-6e28-4eac-a942-00deb657815f.png) + ## 代码风格 ### Python @@ -44,22 +198,8 @@ `pre-commit`的配置存储在[.pre-commit-config](/.pre-commit-config.yaml)中。 -在clone代码仓库后,你需要安装并初始化 `pre-commit`: - -```Shell -pip install -U pre-commit -``` - -并在 MMPose 仓库目录下运行: - -```shell -pre-commit install -``` - -在顺利安装后,你每次提交代码时都会自动执行代码格式检查与自动格式化。 - ```{note} -在你创建PR之前,请确保你的代码格式符合规范,且经过了yapf格式化 +在你创建PR之前,请确保你的代码格式符合规范,且经过了 yapf 格式化。 ``` ### C++与CUDA diff --git a/docs/zh_cn/notes/faq.md b/docs/zh_cn/notes/faq.md index 57461bb249..15e3fbb98d 100644 --- a/docs/zh_cn/notes/faq.md +++ b/docs/zh_cn/notes/faq.md @@ -12,15 +12,19 @@ Compatible MMPose and MMCV versions are shown as below.
Please choose the correc ### MMPose 1.x -| MMPose version | MMCV version | -| :------------: | :-------------------------------: | -| dev-1.x | mmcv-full>=2.0.0, mmengine>=0.0.1 | +| MMPose version | MMCV/MMEngine version | +| :------------: | :-----------------------------: | +| 1.0.0rc1 | mmcv>=2.0.0rc4, mmengine>=0.6.0 | +| 1.0.0rc0 | mmcv>=2.0.0rc0, mmengine>=0.0.1 | +| 1.0.0b0 | mmcv>=2.0.0rc0, mmengine>=0.0.1 | ### MMPose 0.x | MMPose version | MMCV version | | :------------: | :-----------------------: | -| master | mmcv-full>=1.3.8, \<1.7.0 | +| master | mmcv-full>=1.3.8, \<1.8.0 | +| 0.29.0 | mmcv-full>=1.3.8, \<1.7.0 | +| 0.28.1 | mmcv-full>=1.3.8, \<1.7.0 | | 0.28.0 | mmcv-full>=1.3.8, \<1.6.0 | | 0.27.0 | mmcv-full>=1.3.8, \<1.6.0 | | 0.26.0 | mmcv-full>=1.3.8, \<1.6.0 | diff --git a/docs/zh_cn/overview.md b/docs/zh_cn/overview.md index edfc9e7247..05c2b4e6b3 100644 --- a/docs/zh_cn/overview.md +++ b/docs/zh_cn/overview.md @@ -57,7 +57,7 @@ MMPose 由 **8** 个主要部分组成,apis、structures、datasets、codecs - [编解码器](./user_guides/codecs.md) - [训练与测试](./user_guides/train_and_test.md) - [可视化](./user_guides/visualization.md) - - [常用工具](./user_guides/useful_tools.md) + - [How to](./user_guides/how_to.md) 4. 
对于希望将自己的项目迁移到 MMPose 的开发者: diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md deleted file mode 100644 index a85f7a1e45..0000000000 --- a/docs/zh_cn/useful_tools.md +++ /dev/null @@ -1,3 +0,0 @@ -# 常用工具 - -内容建设中…… diff --git a/docs/zh_cn/user_guides/codecs.md b/docs/zh_cn/user_guides/codecs.md index 9fe97ee893..d758b478ee 100644 --- a/docs/zh_cn/user_guides/codecs.md +++ b/docs/zh_cn/user_guides/codecs.md @@ -26,12 +26,9 @@ MMPose 1.0 中引入了新模块 **编解码器(Codec)** ,将关键点数 以 Regression-based 方法的编码器为例: ```Python -@abstractmethod -def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None -) -> Tuple[np.ndarray, np.ndarray]: +def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints from input image space to normalized space. Args: @@ -40,13 +37,12 @@ def encode( (N, K) Returns: - tuple: - - reg_labels (np.ndarray): The normalized regression labels in + dict: + - keypoint_labels (np.ndarray): The normalized regression labels in shape (N, K, D) where D is 2 for 2d coordinates - keypoint_weights (np.ndarray): The target weights in shape (N, K) """ - if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) @@ -55,10 +51,39 @@ def encode( (keypoints <= [w - 1, h - 1])).all(axis=-1) & ( keypoints_visible > 0.5) - reg_labels = (keypoints / np.array([w, h])).astype(np.float32) + keypoint_labels = (keypoints / np.array([w, h])).astype(np.float32) keypoint_weights = np.where(valid, 1., 0.).astype(np.float32) - return reg_labels, keypoint_weights + encoded = dict( + keypoint_labels=keypoint_labels, keypoint_weights=keypoint_weights) + + return encoded +``` + +编码后的数据会在 `PackPoseInputs` 中被转换为 Tensor 格式,并封装到 `data_sample.gt_instance_labels` 中供模型调用,一般主要用于 loss 计算,下面以 `RegressionHead` 中的 `loss()` 为例: + +```Python +def loss(self, + inputs: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + 
"""Calculate losses from a batch of inputs and data samples.""" + + pred_outputs = self.forward(inputs) + + keypoint_labels = torch.cat( + [d.gt_instance_labels.keypoint_labels for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_outputs, keypoint_labels, + keypoint_weights.unsqueeze(-1)) + + losses.update(loss_kpt=loss) + ### 后续内容省略 ### ``` ### 解码器 @@ -126,7 +151,7 @@ codec = dict(type='RegressionLabel', input_size=(192, 256)) 在数据处理阶段生成训练目标时,需要传入编解码器用于编码: ```Python -dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec) +dict(type='GenerateTarget', encoder=codec) ``` ### 模型头部 @@ -190,7 +215,7 @@ train_pipeline = [ dict(type='RandomHalfBody'), dict(type='RandomBBoxTransform'), dict(type='TopdownAffine', input_size=codec['input_size']), - dict(type='GenerateTarget', target_type='keypoint_label', encoder=codec), ## 生成训练目标 ## + dict(type='GenerateTarget', encoder=codec), ## 生成训练目标 ## dict(type='PackPoseInputs') ] test_pipeline = [ diff --git a/docs/zh_cn/user_guides/configs.md b/docs/zh_cn/user_guides/configs.md index c57a486321..9c9faab4c3 100644 --- a/docs/zh_cn/user_guides/configs.md +++ b/docs/zh_cn/user_guides/configs.md @@ -68,6 +68,10 @@ MMPose 预定义的 Registry 在 `$MMPOSE/mmpose/registry.py` 中,目前支持 - `HOOKS`:钩子类 +```{note} +需要注意的是,所有新增的模块都需要使用注册器(Registry)进行注册,并在对应目录的 `__init__.py` 中进行 `import`,以便能够使用配置文件构建其实例。 +``` + ## 配置系统 具体而言,一个配置文件主要包含如下五个部分: @@ -134,16 +138,14 @@ _base_ = ['../../../_base_/default_runtime.py'] # 以运行时的config文件位 ``` ```{note} -**Tips** - CheckpointHook: -- save_best: `'coco/AP'` 用于 `CocoMetric`, `'pck/PCK@0.05'` 用于 `PCKAccuracy` +- save_best: `'coco/AP'` 用于 `CocoMetric`, `'PCK'` 用于 `PCKAccuracy` - max_keep_ckpts: 最大保留ckpt数量,默认为-1,代表不限制 样例: -`default_hooks = dict(checkpoint=dict(save_best='pck/PCK@0.05', rule='greater', max_keep_ckpts=1))` +`default_hooks = 
dict(checkpoint=dict(save_best='PCK', rule='greater', max_keep_ckpts=1))` ``` ### 数据配置 @@ -179,7 +181,7 @@ train_pipeline = [ # 训练时数据增强 dict(type='TopdownAffine', input_size=codec['input_size']), # 根据变换矩阵更新目标数据 dict( type='GenerateTarget', # 根据目标数据生成监督信息 - target_type='heatmap', # 监督信息类型 + # 监督信息类型 encoder=codec, # 传入编解码器,用于数据编码,生成特定格式的监督信息 dict(type='PackPoseInputs') # 对target进行打包用于训练 ] @@ -223,9 +225,11 @@ test_dataloader = val_dataloader # 默认情况下不区分验证集和测试集 ``` ```{note} -**Tips** -设置随机种子: `randomness=dict(seed=0)` +常用功能可以参考以下教程: +- [恢复训练](../common_usages/resume_training.md) +- [自动混合精度训练](../common_usages/amp_training.md) +- [设置随机种子](../common_usages/set_random_seed.md) ``` diff --git a/docs/zh_cn/user_guides/how_to.md b/docs/zh_cn/user_guides/how_to.md new file mode 100644 index 0000000000..a2824f977b --- /dev/null +++ b/docs/zh_cn/user_guides/how_to.md @@ -0,0 +1,108 @@ +# How to + +## 分析训练日志 + +MMPose 提供了 `tools/analysis_tools/analyze_logs.py` 来对训练日志进行简单的分析,包括: + +- 将日志绘制成损失和精度曲线图 +- 统计训练速度 + +### 绘制损失和精度曲线图 + +该功能依赖于 `seaborn`,请先运行 `pip install seaborn` 安装依赖包。 + +![log_curve](https://user-images.githubusercontent.com/87690686/188538215-5d985aaa-59f8-44cf-b6f9-10890d599e9c.png) + +```shell +python tools/analysis_tools/analyze_logs.py plot_curve ${JSON_LOGS} [--keys ${KEYS}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] +``` + +示例: + +- 绘制损失曲线 + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_kpt --legend loss_kpt + ``` + +- 绘制精度曲线并导出为 PDF 文件 + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys acc_pose --out results.pdf + ``` + +- 将多个日志文件绘制在同一张图上 + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys loss_kpt --legend run1 run2 --title loss_kpt --out loss_kpt.png + ``` + +### 统计训练速度 + +```shell +python tools/analysis_tools/analyze_logs.py cal_train_time ${JSON_LOGS} [--include-outliers] +``` 
+ +示例: + +```shell +python tools/analysis_tools/analyze_logs.py cal_train_time log.json +``` + +结果如下: + +```text +-----Analyze train time of hrnet_w32_256x192.json----- +slowest epoch 56, average time is 0.6924 +fastest epoch 1, average time is 0.6502 +time std over epochs is 0.0085 +average iter time: 0.6688 s/iter +``` + +## 统计模型参数量与计算量 + +MMPose 提供了 `tools/analysis_tools/get_flops.py` 来统计模型的参数量与计算量。 + +```shell +python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] [--cfg-options ${CFG_OPTIONS}] +``` + +参数说明: + +`CONFIG_FILE` : 模型配置文件的路径。 + +`--shape`: 模型的输入张量形状。 + +`--input-constructor`: 如果指定为 `batch`,将会生成一个 `batch tensor` 来计算 FLOPs。 + +`--batch-size`:如果 `--input-constructor` 指定为 `batch`,将会生成一个随机 `tensor`,形状为 `(batch_size, 3, **input_shape)` 来计算 FLOPs。 + +`--cfg-options`: 如果指定,可选的 `cfg` 的键值对将会被合并到配置文件中。 + +示例: + +```shell +python tools/analysis_tools/get_flops.py configs/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192.py +``` + +结果如下: + +```text +============================== +Input shape: (1, 3, 256, 192) +Flops: 7.7 GFLOPs +Params: 28.54 M +============================== +``` + +```{note} +目前该工具仍处于实验阶段,我们不能保证统计结果绝对正确,一些算子(比如 GN 或自定义算子)没有被统计到 FLOPs 中。 +``` + +## 打印全部配置信息 + +官方提供的配置文件,有时候继承了多个配置文件,这样做可以方便管理,减少冗余代码。但有时候我们希望知道配置文件中没有写明的默认参数值是什么,MMPose 提供了 `tools/analysis_tools/print_config.py` 来逐字逐句打印全部的配置信息。 + +```shell +python tools/analysis_tools/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] +``` diff --git a/docs/zh_cn/user_guides/inference.md b/docs/zh_cn/user_guides/inference.md index efe84ff524..d77ff2185c 100644 --- a/docs/zh_cn/user_guides/inference.md +++ b/docs/zh_cn/user_guides/inference.md @@ -1,3 +1,3 @@ # 模型推理 -内容建设中 +中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/inference.md) diff --git a/docs/zh_cn/user_guides/prepare_datasets.md b/docs/zh_cn/user_guides/prepare_datasets.md index 1a9a764580..892b3fc5e9 100644 --- a/docs/zh_cn/user_guides/prepare_datasets.md +++ 
b/docs/zh_cn/user_guides/prepare_datasets.md @@ -1,3 +1,264 @@ # 准备数据集 -内容建设中 +MMPose 目前已支持了多个任务和相应的数据集。您可以在 [数据集](https://mmpose.readthedocs.io/zh_CN/1.x/dataset_zoo.html) 找到它们。请按照相应的指南准备数据。 + + + +- [自定义数据集-将数据组织为 COCO 格式](#自定义数据集-将数据组织为-coco-格式) +- [创建自定义数据集的元信息文件](#创建自定义数据集的元信息文件) +- [创建自定义数据集类](#创建自定义数据集类) +- [创建自定义配置文件](#创建自定义配置文件) +- [数据集封装](#数据集封装) + + + +## 自定义数据集-将数据组织为 COCO 格式 + +最简单的使用自定义数据集的方法是将您的注释格式转换为 COCO 数据集格式。 + +COCO 格式的注释 JSON 文件具有以下必要键: + +```python +'images': [ + { + 'file_name': '000000001268.jpg', + 'height': 427, + 'width': 640, + 'id': 1268 + }, + ... +], +'annotations': [ + { + 'segmentation': [[426.36, + ... + 424.34, + 223.3]], + 'keypoints': [0,0,0, + 0,0,0, + 0,0,0, + 427,220,2, + 443,222,2, + 414,228,2, + 449,232,2, + 408,248,1, + 454,261,2, + 0,0,0, + 0,0,0, + 411,287,2, + 431,287,2, + 0,0,0, + 458,265,2, + 0,0,0, + 466,300,1], + 'num_keypoints': 10, + 'area': 3894.5826, + 'iscrowd': 0, + 'image_id': 1268, + 'bbox': [402.34, 205.02, 65.26, 88.45], + 'category_id': 1, + 'id': 215218 + }, + ... +], +'categories': [ + {'id': 1, 'name': 'person'}, + ] +``` + +JSON 标注文件中有三个关键词是必需的: + +- `images`:包含所有图像信息的列表,每个图像都有一个 `file_name`、`height`、`width` 和 `id` 键。 +- `annotations`:包含所有实例标注信息的列表,每个实例都有一个 `segmentation`、`keypoints`、`num_keypoints`、`area`、`iscrowd`、`image_id`、`bbox`、`category_id` 和 `id` 键。 +- `categories`:包含所有类别信息的列表,每个类别都有一个 `id` 和 `name` 键。以人体姿态估计为例,`id` 为 1,`name` 为 `person`。 + +如果您的数据集已经是 COCO 格式的,那么您可以直接使用 `CocoDataset` 类来读取该数据集。 + +## 创建自定义数据集的元信息文件 + +对于一个新的数据集而言,您需要创建一个新的数据集元信息文件。该文件包含了数据集的基本信息,如关键点个数、排列顺序、可视化颜色、骨架连接关系等。元信息文件通常存放在 `config/_base_/datasets/` 目录下,例如: + +``` +config/_base_/datasets/custom.py +``` + +元信息文件中需要包含以下信息: + +- `keypoint_info`:每个关键点的信息: + 1. `name`: 关键点名称,必须是唯一的,例如 `nose`、`left_eye` 等。 + 2. `id`: 关键点 ID,必须是唯一的,从 0 开始。 + 3. `color`: 关键点可视化时的颜色,以 (\[B, G, R\]) 格式组织起来,用于可视化。 + 4. `type`: 关键点类型,可以是 `upper`、`lower` 或 \`\`,用于数据增强。 + 5. 
`swap`: 关键点交换关系,用于水平翻转数据增强。 +- `skeleton_info`:骨架连接关系,用于可视化。 +- `joint_weights`:每个关键点的权重,用于损失函数计算。 +- `sigmas`:标准差,用于计算 OKS 分数,详细信息请参考 [keypoints-eval](https://cocodataset.org/#keypoints-eval)。 + +下面是一个简化版本的元信息文件([完整版](/configs/_base_/datasets/coco.py)): + +```python +dataset_info = dict( +    dataset_name='coco', +    paper_info=dict( +        author='Lin, Tsung-Yi and Maire, Michael and ' +        'Belongie, Serge and Hays, James and ' +        'Perona, Pietro and Ramanan, Deva and ' +        r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', +        title='Microsoft coco: Common objects in context', +        container='European conference on computer vision', +        year='2014', +        homepage='http://cocodataset.org/', +    ), +    keypoint_info={ +        0: +        dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), +        1: +        dict( +            name='left_eye', +            id=1, +            color=[51, 153, 255], +            type='upper', +            swap='right_eye'), +        ... +        16: +        dict( +            name='right_ankle', +            id=16, +            color=[255, 128, 0], +            type='lower', +            swap='left_ankle') +    }, +    skeleton_info={ +        0: +        dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), +        ... +        18: +        dict( +            link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) +    }, +    joint_weights=[ +        1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, +        1.5 +    ], +    sigmas=[ +        0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, +        0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 +    ]) +``` + +## 创建自定义数据集类 + +如果标注信息不是用 COCO 格式存储的,那么您需要创建一个新的数据集类。数据集类需要继承自 `BaseDataset` 类,并且需要按照以下步骤实现: + +1. 在 `mmpose/datasets/datasets` 目录下找到该数据集符合的 package,如果没有符合的,则创建一个新的 package。 + +2.
在该 package 下创建一个新的数据集类,在对应的注册器中进行注册: + + ```python + from mmengine.dataset import BaseDataset + from mmpose.registry import DATASETS + + @DATASETS.register_module(name='MyCustomDataset') + class MyCustomDataset(BaseDataset): + ``` + + 如果未注册,你会在运行时遇到 `KeyError: 'XXXXX is not in the dataset registry'`。 + 关于 `mmengine.BaseDataset` 的更多信息,请参考 [这个文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html)。 + +3. 确保你在 package 的 `__init__.py` 中导入了该数据集类。 + +4. 确保你在 `mmpose/datasets/__init__.py` 中导入了该 package。 + +## 创建自定义配置文件 + +在配置文件中,你需要修改跟数据集有关的部分,例如: + +```python +... +# 自定义数据集类 +dataset_type = 'MyCustomDataset' # or 'CocoDataset' + +train_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/train/data', + ann_file='path/to/your/train/json', + data_prefix=dict(img='path/to/your/train/img'), + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) + +val_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/val/data', + ann_file='path/to/your/val/json', + data_prefix=dict(img='path/to/your/val/img'), + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) + +test_dataloader = dict( + batch_size=2, + dataset=dict( + type=dataset_type, + data_root='root/of/your/test/data', + ann_file='path/to/your/test/json', + data_prefix=dict(img='path/to/your/test/img'), + metainfo=dict(from_file='configs/_base_/datasets/custom.py'), + ...), + ) +... 
+``` + +请确保所有的路径都是正确的。 + +## 数据集封装 + +目前 [MMEngine](https://github.com/open-mmlab/mmengine) 支持以下数据集封装: + +- [ConcatDataset](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/basedataset.html#concatdataset) +- [RepeatDataset](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/basedataset.html#repeatdataset) + +### CombinedDataset + +MMPose 提供了一个 `CombinedDataset` 类,它可以将多个数据集封装成一个数据集。它的使用方法如下: + +```python +dataset_1 = dict( + type='dataset_type_1', + data_root='root/of/your/dataset1', + data_prefix=dict(img_path='path/to/your/img'), + ann_file='annotations/train.json', + pipeline=[ + # 使用转换器将标注信息统一为需要的格式 + converter_transform_1 + ]) + +dataset_2 = dict( + type='dataset_type_2', + data_root='root/of/your/dataset2', + data_prefix=dict(img_path='path/to/your/img'), + ann_file='annotations/train.json', + pipeline=[ + converter_transform_2 + ]) + +shared_pipeline = [ + LoadImage(), + ParseImage(), +] + +combined_dataset = dict( + type='CombinedDataset', + metainfo=dict(from_file='path/to/your/metainfo'), + datasets=[dataset_1, dataset_2], + pipeline=shared_pipeline, +) +``` + +- **合并数据集的元信息** 决定了标注格式,可以是子数据集的元信息,也可以是自定义的元信息。如果要自定义元信息,可以参考 [创建自定义数据集的元信息文件](#创建自定义数据集的元信息文件)。 +- **KeypointConverter** 用于将不同的标注格式转换成统一的格式。比如将关键点个数不同、关键点排列顺序不同的数据集进行合并。 +- 更详细的说明请前往进阶教程-[混合数据集训练](../advanced_guides/mixed_datasets.md)。 diff --git a/docs/zh_cn/user_guides/train_and_test.md b/docs/zh_cn/user_guides/train_and_test.md index e4e65e2b14..3cddc5c715 100644 --- a/docs/zh_cn/user_guides/train_and_test.md +++ b/docs/zh_cn/user_guides/train_and_test.md @@ -1,3 +1,3 @@ # 训练与测试 -内容建设中 +中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/train_and_test.md) diff --git a/docs/zh_cn/user_guides/useful_tools.md b/docs/zh_cn/user_guides/useful_tools.md index e6ed383c1c..d7e027e609 100644 --- a/docs/zh_cn/user_guides/useful_tools.md +++ b/docs/zh_cn/user_guides/useful_tools.md @@ -1,3 +1,3 @@ # 常用工具 -内容建设中 +中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/useful_tools.md) diff --git 
a/docs/zh_cn/user_guides/visualization.md b/docs/zh_cn/user_guides/visualization.md index 9408797cac..ffd20af99a 100644 --- a/docs/zh_cn/user_guides/visualization.md +++ b/docs/zh_cn/user_guides/visualization.md @@ -1,3 +1,3 @@ # 可视化 -内容建设中 +中文内容建设中,暂时请查阅[英文版文档](../../en/user_guides/visualization.md) diff --git a/mmpose/__init__.py b/mmpose/__init__.py index dc812f996d..ad7946470d 100644 --- a/mmpose/__init__.py +++ b/mmpose/__init__.py @@ -5,11 +5,11 @@ from .version import __version__, short_version -mmcv_minimum_version = '2.0.0rc0' +mmcv_minimum_version = '2.0.0rc4' mmcv_maximum_version = '2.1.0' mmcv_version = digit_version(mmcv.__version__) -mmengine_minimum_version = '0.1.0' +mmengine_minimum_version = '0.6.0' mmengine_maximum_version = '1.0.0' mmengine_version = digit_version(mmengine.__version__) diff --git a/mmpose/apis/__init__.py b/mmpose/apis/__init__.py index 60395cf82b..ff7149e453 100644 --- a/mmpose/apis/__init__.py +++ b/mmpose/apis/__init__.py @@ -1,4 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .inference import inference_topdown, init_model +from .inference import inference_bottomup, inference_topdown, init_model +from .inferencers import MMPoseInferencer, Pose2DInferencer -__all__ = ['init_model', 'inference_topdown'] +__all__ = [ + 'init_model', 'inference_topdown', 'inference_bottomup', + 'Pose2DInferencer', 'MMPoseInferencer' +] diff --git a/mmpose/apis/inference.py b/mmpose/apis/inference.py index 15b8ebca99..f884d415ca 100644 --- a/mmpose/apis/inference.py +++ b/mmpose/apis/inference.py @@ -1,18 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings from pathlib import Path -from typing import Optional, Union +from typing import List, Optional, Union import numpy as np import torch import torch.nn as nn from mmengine.config import Config -from mmengine.dataset import Compose +from mmengine.dataset import Compose, pseudo_collate +from mmengine.registry import init_default_scope from mmengine.runner import load_checkpoint from PIL import Image from mmpose.datasets.datasets.utils import parse_pose_metainfo from mmpose.models.builder import build_pose_estimator +from mmpose.structures import PoseDataSample from mmpose.structures.bbox import bbox_xywh2xyxy @@ -92,6 +94,9 @@ def init_model(config: Union[str, Path, Config], config.model.backbone.init_cfg = None config.model.train_cfg = None + # register all modules in mmpose into the registries + init_default_scope(config.get('default_scope', 'mmpose')) + model = build_pose_estimator(config.model) # get dataset_meta in this priority: checkpoint > config > default (COCO) dataset_meta = None @@ -104,7 +109,7 @@ def init_model(config: Union[str, Path, Config], dataset_meta = ckpt['meta']['dataset_meta'] if dataset_meta is None: - dataset_meta = dataset_meta_from_config(config, dataset_mode='test') + dataset_meta = dataset_meta_from_config(config, dataset_mode='train') if dataset_meta is None: warnings.simplefilter('once') @@ -123,9 +128,9 @@ def init_model(config: Union[str, Path, Config], def inference_topdown(model: nn.Module, img: Union[np.ndarray, str], - bboxes: Optional[np.ndarray] = None, - bbox_format: str = 'xyxy'): - """Inference image with the top-down pose estimator. + bboxes: Optional[Union[List, np.ndarray]] = None, + bbox_format: str = 'xyxy') -> List[PoseDataSample]: + """Inference image with a top-down pose estimator. Args: model (nn.Module): The top-down pose estimator @@ -137,13 +142,13 @@ def inference_topdown(model: nn.Module, and ``'xyxy'``. Defaults to ``'xyxy'`` Returns: - :obj:`PoseDataSample`: The inference results. 
Specifically, the + List[:obj:`PoseDataSample`]: The inference results. Specifically, the predicted keypoints and scores are saved at - ``data_samples.pred_instances.keypoints`` and - ``data_samples.pred_instances.keypoint_scores``. + ``data_sample.pred_instances.keypoints`` and + ``data_sample.pred_instances.keypoint_scores``. """ - cfg = model.cfg - pipeline = Compose(cfg.test_dataloader.dataset.pipeline) + init_default_scope(model.cfg.get('default_scope', 'mmpose')) + pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) if bboxes is None: # get bbox from the image size @@ -154,6 +159,9 @@ def inference_topdown(model: nn.Module, bboxes = np.array([[0, 0, w, h]], dtype=np.float32) else: + if isinstance(bboxes, list): + bboxes = np.array(bboxes) + assert bbox_format in {'xyxy', 'xywh'}, \ f'Invalid bbox_format "{bbox_format}".' @@ -161,24 +169,55 @@ def inference_topdown(model: nn.Module, bboxes = bbox_xywh2xyxy(bboxes) # construct batch data samples - data = [] + data_list = [] for bbox in bboxes: - if isinstance(img, str): - _data = dict(img_path=img) + data_info = dict(img_path=img) else: - _data = dict(img=img) + data_info = dict(img=img) + data_info['bbox'] = bbox[None] # shape (1, 4) + data_info['bbox_score'] = np.ones(1, dtype=np.float32) # shape (1,) + data_info.update(model.dataset_meta) + data_list.append(pipeline(data_info)) + + if data_list: + # collate data list into a batch, which is a dict with following keys: + # batch['inputs']: a list of input images + # batch['data_samples']: a list of :obj:`PoseDataSample` + batch = pseudo_collate(data_list) + with torch.no_grad(): + results = model.test_step(batch) + else: + results = [] + + return results - _data['bbox'] = bbox[None] # shape (1, 4) - _data['bbox_score'] = np.ones(1, dtype=np.float32) # shape (1,) - _data.update(model.dataset_meta) - data.append(pipeline(_data)) - data_ = dict() - data_['inputs'] = [_data['inputs'] for _data in data] - data_['data_samples'] = [_data['data_samples'] 
for _data in data] +def inference_bottomup(model: nn.Module, img: Union[np.ndarray, str]): + """Inference image with a bottom-up pose estimator. + + Args: + model (nn.Module): The bottom-up pose estimator + img (np.ndarray | str): The loaded image or image file to inference + + Returns: + List[:obj:`PoseDataSample`]: The inference results. Specifically, the + predicted keypoints and scores are saved at + ``data_sample.pred_instances.keypoints`` and + ``data_sample.pred_instances.keypoint_scores``. + """ + pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + + # prepare data batch + if isinstance(img, str): + data_info = dict(img_path=img) + else: + data_info = dict(img=img) + data_info.update(model.dataset_meta) + data = pipeline(data_info) + batch = pseudo_collate([data]) with torch.no_grad(): - results = model.test_step(data_) + results = model.test_step(batch) return results diff --git a/mmpose/apis/inferencers/__init__.py b/mmpose/apis/inferencers/__init__.py new file mode 100644 index 0000000000..3c21a02e08 --- /dev/null +++ b/mmpose/apis/inferencers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mmpose_inferencer import MMPoseInferencer +from .pose2d_inferencer import Pose2DInferencer + +__all__ = ['Pose2DInferencer', 'MMPoseInferencer'] diff --git a/mmpose/apis/inferencers/base_mmpose_inferencer.py b/mmpose/apis/inferencers/base_mmpose_inferencer.py new file mode 100644 index 0000000000..d99dcc1b68 --- /dev/null +++ b/mmpose/apis/inferencers/base_mmpose_inferencer.py @@ -0,0 +1,444 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mimetypes +import os +import shutil +import tempfile +import warnings +from collections import defaultdict +from typing import (Any, Callable, Dict, Generator, List, Optional, Sequence, + Union) + +import cv2 +import mmcv +import mmengine +import numpy as np +import torch.nn as nn +from mmengine.config import Config, ConfigDict +from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) +from mmengine.infer.infer import BaseInferencer +from mmengine.runner.checkpoint import _load_checkpoint_to_model +from mmengine.structures import InstanceData + +from mmpose.apis.inference import dataset_meta_from_config +from mmpose.structures import PoseDataSample, split_instances + +InstanceList = List[InstanceData] +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = Union[InstanceData, InstanceList] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] +ConfigType = Union[Config, ConfigDict] +ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]] + + +class BaseMMPoseInferencer(BaseInferencer): + """The base class for MMPose inferencers.""" + + preprocess_kwargs: set = {'bbox_thr', 'nms_thr'} + forward_kwargs: set = set() + visualize_kwargs: set = { + 'return_vis', + 'show', + 'wait_time', + 'radius', + 'thickness', + 'kpt_thr', + 'vis_out_dir', + } + postprocess_kwargs: set = {'pred_out_dir'} + + def _load_weights_to_model(self, model: nn.Module, + checkpoint: Optional[dict], + cfg: Optional[ConfigType]) -> None: + """Loading model weights and meta information from cfg and checkpoint. + + Subclasses could override this method to load extra meta information + from ``checkpoint`` and ``cfg`` to model. + + Args: + model (nn.Module): Model to load weights and meta information. + checkpoint (dict, optional): The loaded checkpoint. + cfg (Config or ConfigDict, optional): The loaded config. 
+ """ + if checkpoint is not None: + _load_checkpoint_to_model(model, checkpoint) + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmpose 1.x + model.dataset_meta = checkpoint_meta['dataset_meta'] + else: + warnings.warn( + 'dataset_meta are not saved in the checkpoint\'s ' + 'meta data, load via config.') + model.dataset_meta = dataset_meta_from_config( + cfg, dataset_mode='train') + else: + warnings.warn('Checkpoint is not loaded, and the inference ' + 'result is calculated by the randomly initialized ' + 'model!') + model.dataset_meta = dataset_meta_from_config( + cfg, dataset_mode='train') + + def _inputs_to_list(self, inputs: InputsType) -> list: + """Preprocess the inputs to a list. + + Preprocess inputs to a list according to its type: + + - list or tuple: return inputs + - str: + - Directory path: return all files in the directory + - other cases: return a list containing the string. The string + could be a path to file, a url or other types of string + according to the task. + + Args: + inputs (InputsType): Inputs for the inferencer. + + Returns: + list: List of input for the :meth:`preprocess`. 
+ """ + self._video_input = False + + if isinstance(inputs, str): + backend = get_file_backend(inputs) + if hasattr(backend, 'isdir') and isdir(inputs): + # Backends like HttpsBackend do not implement `isdir`, so only + # those backends that implement `isdir` could accept the + # inputs as a directory + filepath_list = [ + join_path(inputs, fname) + for fname in list_dir_or_file(inputs, list_dir=False) + ] + inputs = [] + for filepath in filepath_list: + input_type = mimetypes.guess_type(filepath)[0].split( + '/')[0] + if input_type == 'image': + inputs.append(filepath) + inputs.sort() + else: + # if inputs is a path to a video file, it will be converted + # to a list containing separated frame filenames + input_type = mimetypes.guess_type(inputs)[0].split('/')[0] + if input_type == 'video': + self._video_input = True + # split video frames into a temporary folder + frame_folder = tempfile.TemporaryDirectory() + video = mmcv.VideoReader(inputs) + self.video_info = dict( + fps=video.fps, + name=os.path.basename(inputs), + frame_folder=frame_folder) + video.cvt2frames(frame_folder.name, show_progress=False) + frames = sorted(list_dir_or_file(frame_folder.name)) + inputs = [join_path(frame_folder.name, f) for f in frames] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) + + def _get_webcam_inputs(self, inputs: str) -> Generator: + """Sets up and returns a generator function that reads frames from a + webcam input. The generator function returns a new frame each time it + is iterated over. + + Args: + inputs (str): A string describing the webcam input, in the format + "webcam:id". + + Returns: + A generator function that yields frames from the webcam input. + + Raises: + ValueError: If the inputs string is not in the expected format. + """ + + # Ensure the inputs string is in the expected format. 
+ inputs = inputs.lower() + assert inputs.startswith('webcam'), f'Expected input to start with ' \ + f'"webcam", but got "{inputs}"' + + # Parse the camera ID from the inputs string. + inputs_ = inputs.split(':') + if len(inputs_) == 1: + camera_id = 0 + elif len(inputs_) == 2 and str.isdigit(inputs_[1]): + camera_id = int(inputs_[1]) + else: + raise ValueError( + f'Expected webcam input to have format "webcam:id", ' + f'but got "{inputs}"') + + # Attempt to open the video capture object. + vcap = cv2.VideoCapture(camera_id) + if not vcap.isOpened(): + warnings.warn(f'Cannot open camera (ID={camera_id})') + return [] + + # Set video input flag and metadata. + self._video_input = True + self.video_info = dict(fps=10, name='webcam.mp4', frame_folder=None) + + # Set up webcam reader generator function. + self._window_closing = False + + def _webcam_reader() -> Generator: + while True: + if self._window_closing: + vcap.release() + break + + ret_val, frame = vcap.read() + if not ret_val: + break + + yield frame + + return _webcam_reader() + + def _visualization_window_on_close(self, event): + self._window_closing = True + + def _init_pipeline(self, cfg: ConfigType) -> Callable: + """Initialize the test pipeline. + + Args: + cfg (ConfigType): model config path or dict + + Returns: + A pipeline to handle various input data, such as ``str``, + ``np.ndarray``. The returned pipeline will be used to process + a single data. + """ + return Compose(cfg.test_dataloader.dataset.pipeline) + + def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs): + """Process the inputs into a model-feedable format. + + Args: + inputs (InputsType): Inputs given by user. + batch_size (int): batch size. Defaults to 1. + + Yields: + Any: Data processed by the ``pipeline`` and ``collate_fn``. 
+ List[str or np.ndarray]: List of original inputs in the batch + """ + + for i, input in enumerate(inputs): + data_infos = self.preprocess_single(input, index=i, **kwargs) + # only supports inference with batch size 1 + yield self.collate_fn(data_infos), [input] + + def visualize(self, + inputs: list, + preds: List[PoseDataSample], + return_vis: bool = False, + show: bool = False, + wait_time: float = 0, + radius: int = 3, + thickness: int = 1, + kpt_thr: float = 0.3, + vis_out_dir: str = '', + window_name: str = '', + window_close_event_handler: Optional[Callable] = None + ) -> List[np.ndarray]: + """Visualize predictions. + + Args: + inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`. + preds (Any): Predictions of the model. + return_vis (bool): Whether to return images with predicted results. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (ms). Defaults to 0 + radius (int): Keypoint radius for visualization. Defaults to 3 + thickness (int): Link thickness for visualization. Defaults to 1 + kpt_thr (float): The threshold to visualize the keypoints. + Defaults to 0.3 + vis_out_dir (str, optional): Directory to save visualization + results w/o predictions. If left as empty, no file will + be saved. Defaults to ''. + window_name (str, optional): Title of display window. + window_close_event_handler (callable, optional): + + Returns: + List[np.ndarray]: Visualization results. 
+ """ + if (not return_vis) and (not show) and (not vis_out_dir): + return + + if getattr(self, 'visualizer', None) is None: + raise ValueError('Visualization needs the "visualizer" term' + 'defined in the config, but got None.') + + self.visualizer.radius = radius + self.visualizer.line_width = thickness + + results = [] + + for single_input, pred in zip(inputs, preds): + if isinstance(single_input, str): + img = mmcv.imread(single_input, channel_order='rgb') + elif isinstance(single_input, np.ndarray): + img = mmcv.bgr2rgb(single_input.copy()) + else: + raise ValueError('Unsupported input type: ' + f'{type(single_input)}') + + img_name = os.path.basename(pred.metainfo['img_path']) + + if vis_out_dir: + if self._video_input: + out_file = join_path(vis_out_dir, 'vis_frames', img_name) + else: + out_file = join_path(vis_out_dir, img_name) + else: + out_file = None + + # since visualization and inference utilize the same process, + # the wait time is reduced when a video input is utilized, + # thereby eliminating the issue of inference getting stuck. + wait_time = 1e-5 if self._video_input else wait_time + + window_name = window_name if window_name else img_name + + visualization = self.visualizer.add_datasample( + window_name, + img, + pred, + draw_gt=False, + show=show, + wait_time=wait_time, + out_file=out_file, + kpt_score_thr=kpt_thr) + results.append(visualization) + + if show and not hasattr(self, '_window_close_cid'): + if window_close_event_handler is None: + window_close_event_handler = \ + self._visualization_window_on_close + self._window_close_cid = \ + self.visualizer.manager.canvas.mpl_connect( + 'close_event', + window_close_event_handler + ) + + if return_vis: + return results + else: + return [] + + def postprocess( + self, + preds: List[PoseDataSample], + visualization: List[np.ndarray], + return_datasample=False, + pred_out_dir: str = '', + ) -> dict: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. 
+ + This method should be responsible for the following tasks: + + 1. Convert datasamples into a json-serializable dict if needed. + 2. Pack the predictions and visualization results and return them. + 3. Dump or log the predictions. + + Args: + preds (List[Dict]): Predictions of the model. + visualization (np.ndarray): Visualized predictions. + return_datasample (bool): Whether to return results as + datasamples. Defaults to False. + pred_out_dir (str): Directory to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization`` + + - ``visualization (Any)``: Returned by :meth:`visualize` + - ``predictions`` (dict or DataSample): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasample=False``, it usually should be a + json-serializable dict containing only basic data elements such + as strings and numbers. + """ + + result_dict = defaultdict(list) + + result_dict['visualization'] = visualization + for pred in preds: + if not return_datasample: + # convert datasamples to list of instance predictions + pred = split_instances(pred.pred_instances) + result_dict['predictions'].append(pred) + + if pred_out_dir != '': + if self._video_input: + pred_out_dir = join_path(pred_out_dir, 'pred_frames') + + for pred, data_sample in zip(result_dict['predictions'], preds): + fname = os.path.splitext( + os.path.basename( + data_sample.metainfo['img_path']))[0] + '.json' + mmengine.dump( + pred, join_path(pred_out_dir, fname), indent=' ') + + return result_dict + + def _merge_outputs(self, vis_out_dir: str, pred_out_dir: str, + **kwargs: Dict[str, Any]) -> None: + """Merge the visualized frames and predicted instance outputs and save + them. + + Args: + vis_out_dir (str): Path to the directory where the visualized + frames are saved. 
+ pred_out_dir (str): Path to the directory where the predicted + instance outputs are saved. + **kwargs: Other arguments that are not used in this method. + """ + assert self._video_input + + if vis_out_dir != '': + vis_frame_out_dir = join_path(vis_out_dir, 'vis_frames') + if not isdir(vis_frame_out_dir) or len( + os.listdir(vis_frame_out_dir)) == 0: + warnings.warn( + f'{vis_frame_out_dir} does not exist or is empty.') + else: + mmcv.frames2video( + vis_frame_out_dir, + join_path(vis_out_dir, self.video_info['name']), + fps=self.video_info['fps'], + fourcc='mp4v', + show_progress=False) + shutil.rmtree(vis_frame_out_dir) + + if pred_out_dir != '': + pred_frame_out_dir = join_path(pred_out_dir, 'pred_frames') + if not isdir(pred_frame_out_dir) or len( + os.listdir(pred_frame_out_dir)) == 0: + warnings.warn( + f'{pred_frame_out_dir} does not exist or is empty.') + else: + predictions = [] + pred_files = list_dir_or_file(pred_frame_out_dir) + for frame_id, pred_file in enumerate(sorted(pred_files)): + predictions.append({ + 'frame_id': + frame_id, + 'instances': + mmengine.load( + join_path(pred_frame_out_dir, pred_file)) + }) + fname = os.path.splitext( + os.path.basename(self.video_info['name']))[0] + '.json' + mmengine.dump( + predictions, join_path(pred_out_dir, fname), indent=' ') + shutil.rmtree(pred_frame_out_dir) diff --git a/mmpose/apis/inferencers/mmpose_inferencer.py b/mmpose/apis/inferencers/mmpose_inferencer.py new file mode 100644 index 0000000000..f5b23fb125 --- /dev/null +++ b/mmpose/apis/inferencers/mmpose_inferencer.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmengine.config import Config, ConfigDict +from mmengine.fileio import join_path +from mmengine.infer.infer import ModelType +from mmengine.structures import InstanceData +from rich.progress import track + +from mmpose.structures import PoseDataSample +from .base_mmpose_inferencer import BaseMMPoseInferencer +from .pose2d_inferencer import Pose2DInferencer + +InstanceList = List[InstanceData] +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = Union[InstanceData, InstanceList] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] +ConfigType = Union[Config, ConfigDict] +ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]] + + +class MMPoseInferencer(BaseMMPoseInferencer): + """MMPose Inferencer. It's a unified inferencer interface for pose + estimation task, currently including: Pose2D, and it can be used to perform + 2D keypoint detection. + + Args: + pose2d (str, optional): Pretrained 2D pose estimation algorithm. + It's the path to the config file or the model name defined in + metafile. For example, it could be: + + - model alias, e.g. ``'body'``, + - config name, e.g. ``'simcc_res50_8xb64-210e_coco-256x192'``, + - config path + + Defaults to ``None``. + pose2d_weights (str, optional): Path to the custom checkpoint file of + the selected pose2d model. If it is not specified and "pose2d" is + a model name of metafile, the weights will be loaded from + metafile. Defaults to None. + device (str, optional): Device to run inference. If None, the + available device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to "mmpose". + det_model(str, optional): Config path or alias of detection model. + Defaults to None. + det_weights(str, optional): Path to the checkpoints of detection + model. Defaults to None. 
+ det_cat_ids(int or list[int], optional): Category id for + detection model. Defaults to None. + """ + + preprocess_kwargs: set = {'bbox_thr', 'nms_thr'} + forward_kwargs: set = set() + visualize_kwargs: set = { + 'return_vis', + 'show', + 'wait_time', + 'radius', + 'thickness', + 'kpt_thr', + 'vis_out_dir', + } + postprocess_kwargs: set = {'pred_out_dir'} + + def __init__(self, + pose2d: Optional[str] = None, + pose2d_weights: Optional[str] = None, + device: Optional[str] = None, + scope: str = 'mmpose', + det_model: Optional[Union[ModelType, str]] = None, + det_weights: Optional[str] = None, + det_cat_ids: Optional[Union[int, List]] = None) -> None: + + if pose2d is None: + raise ValueError('2d pose estimation algorithm should provided.') + + self.visualizer = None + if pose2d is not None: + self.pose2d_inferencer = Pose2DInferencer(pose2d, pose2d_weights, + device, scope, det_model, + det_weights, det_cat_ids) + self.mode = 'pose2d' + + def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs): + """Process the inputs into a model-feedable format. + + Args: + inputs (InputsType): Inputs given by user. + batch_size (int): batch size. Defaults to 1. + + Yields: + Any: Data processed by the ``pipeline`` and ``collate_fn``. + List[str or np.ndarray]: List of original inputs in the batch + """ + + for i, input in enumerate(inputs): + data_batch = {} + if 'pose2d' in self.mode: + data_infos = self.pose2d_inferencer.preprocess_single( + input, index=i, **kwargs) + data_batch['pose2d'] = self.pose2d_inferencer.collate_fn( + data_infos) + # only supports inference with batch size 1 + yield data_batch, [input] + + def forward(self, inputs: InputType, **forward_kwargs) -> PredType: + """Forward the inputs to the model. + + Args: + inputs (InputsType): The inputs to be forwarded. + + Returns: + Dict: The prediction results. Possibly with keys "pose2d". 
+ """ + result = {} + if self.mode == 'pose2d': + data_samples = self.pose2d_inferencer.forward( + inputs['pose2d'], **forward_kwargs) + result['pose2d'] = data_samples + + return result + + def __call__( + self, + inputs: InputsType, + return_datasample: bool = False, + batch_size: int = 1, + out_dir: Optional[str] = None, + **kwargs, + ) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. + return_datasample (bool): Whether to return results as + :obj:`BaseDataElement`. Defaults to False. + batch_size (int): Batch size. Defaults to 1. + out_dir (str, optional): directory to save visualization + results and predictions. Will be overridden if vis_out_dir or + pred_out_dir are given. Defaults to None + **kwargs: Keyword arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, + ``visualize_kwargs`` and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. + """ + if out_dir is not None: + if 'vis_out_dir' not in kwargs: + kwargs['vis_out_dir'] = f'{out_dir}/visualizations' + if 'pred_out_dir' not in kwargs: + kwargs['pred_out_dir'] = f'{out_dir}/predictions' + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + # preprocessing + if isinstance(inputs, str) and inputs.startswith('webcam'): + inputs = self._get_webcam_inputs(inputs) + batch_size = 1 + if not visualize_kwargs.get('show', False): + warnings.warn('The display mode is closed when using webcam ' + 'input. 
It will be turned on automatically.') + visualize_kwargs['show'] = True + else: + inputs = self._inputs_to_list(inputs) + + inputs = self.preprocess( + inputs, batch_size=batch_size, **preprocess_kwargs) + + preds = [] + if 'pose2d' not in self.mode or not hasattr(self.pose2d_inferencer, + 'detector'): + inputs = track(inputs, description='Inference') + + for proc_inputs, ori_inputs in inputs: + preds = self.forward(proc_inputs, **forward_kwargs) + + visualization = self.visualize(ori_inputs, preds, + **visualize_kwargs) + results = self.postprocess(preds, visualization, return_datasample, + **postprocess_kwargs) + yield results + + # merge visualization and prediction results + if self._video_input: + self._merge_outputs(**visualize_kwargs, **postprocess_kwargs) + + def visualize(self, inputs: InputsType, preds: PredType, + **kwargs) -> List[np.ndarray]: + """Visualize predictions. + + Args: + inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`. + preds (Any): Predictions of the model. + return_vis (bool): Whether to return images with predicted results. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0 + radius (int): Keypoint radius for visualization. Defaults to 3 + thickness (int): Link thickness for visualization. Defaults to 1 + kpt_thr (float): The threshold to visualize the keypoints. + Defaults to 0.3 + vis_out_dir (str, optional): directory to save visualization + results w/o predictions. If left as empty, no file will + be saved. Defaults to ''. + + Returns: + List[np.ndarray]: Visualization results. 
+ """ + + if 'pose2d' in self.mode: + window_name = '' + if self._video_input: + window_name = self.video_info['name'] + if kwargs.get('vis_out_dir', ''): + kwargs['vis_out_dir'] = join_path(kwargs['vis_out_dir'], + 'vis_frames') + if kwargs.get('show', False): + kwargs['wait_time'] = 1e-5 + return self.pose2d_inferencer.visualize( + inputs, + preds['pose2d'], + window_name=window_name, + window_close_event_handler=self._visualization_window_on_close, + **kwargs) + + def postprocess( + self, + preds: List[PoseDataSample], + visualization: List[np.ndarray], + return_datasample=False, + pred_out_dir: str = '', + ) -> dict: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. + + This method should be responsible for the following tasks: + + 1. Convert datasamples into a json-serializable dict if needed. + 2. Pack the predictions and visualization results and return them. + 3. Dump or log the predictions. + + Args: + preds (List[Dict]): Predictions of the model. + visualization (np.ndarray): Visualized predictions. + return_datasample (bool): Whether to return results as + datasamples. Defaults to False. + pred_out_dir (str): Directory to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization`` + + - ``visualization (Any)``: Returned by :meth:`visualize` + - ``predictions`` (dict or DataSample): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasample=False``, it usually should be a + json-serializable dict containing only basic data elements such + as strings and numbers. 
+ """ + + if 'pose2d' in self.mode: + return super().postprocess(preds['pose2d'], visualization, + return_datasample, pred_out_dir) diff --git a/mmpose/apis/inferencers/pose2d_inferencer.py b/mmpose/apis/inferencers/pose2d_inferencer.py new file mode 100644 index 0000000000..07fb27af66 --- /dev/null +++ b/mmpose/apis/inferencers/pose2d_inferencer.py @@ -0,0 +1,257 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmengine.config import Config, ConfigDict +from mmengine.infer.infer import ModelType +from mmengine.registry import init_default_scope +from mmengine.structures import InstanceData +from rich.progress import track + +from mmpose.evaluation.functional import nms +from mmpose.registry import DATASETS, INFERENCERS +from mmpose.structures import merge_data_samples +from .base_mmpose_inferencer import BaseMMPoseInferencer +from .utils import default_det_models + +try: + from mmdet.apis.det_inferencer import DetInferencer + mmdet_available = True +except (ImportError, ModuleNotFoundError): + mmdet_available = False + +InstanceList = List[InstanceData] +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = Union[InstanceData, InstanceList] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] +ConfigType = Union[Config, ConfigDict] +ResType = Union[Dict, List[Dict], InstanceData, List[InstanceData]] + + +@INFERENCERS.register_module(name='pose-estimation') +@INFERENCERS.register_module() +class Pose2DInferencer(BaseMMPoseInferencer): + """The inferencer for 2D pose estimation. + + Args: + model (str, optional): Pretrained 2D pose estimation algorithm. + It's the path to the config file or the model name defined in + metafile. For example, it could be: + + - model alias, e.g. ``'body'``, + - config name, e.g. ``'simcc_res50_8xb64-210e_coco-256x192'``, + - config path + + Defaults to ``None``. 
+ weights (str, optional): Path to the checkpoint. If it is not + specified and "model" is a model name of metafile, the weights + will be loaded from metafile. Defaults to None. + device (str, optional): Device to run inference. If None, the + available device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to "mmpose". + det_model(str, optional): Config path or alias of detection model. + Defaults to None. + det_weights(str, optional): Path to the checkpoints of detection + model. Defaults to None. + det_cat_ids(int or list[int], optional): Category id for + detection model. Defaults to None. + """ + + preprocess_kwargs: set = {'bbox_thr', 'nms_thr'} + forward_kwargs: set = set() + visualize_kwargs: set = { + 'return_vis', + 'show', + 'wait_time', + 'radius', + 'thickness', + 'kpt_thr', + 'vis_out_dir', + } + postprocess_kwargs: set = {'pred_out_dir'} + + def __init__(self, + model: Union[ModelType, str], + weights: Optional[str] = None, + device: Optional[str] = None, + scope: Optional[str] = 'mmpose', + det_model: Optional[Union[ModelType, str]] = None, + det_weights: Optional[str] = None, + det_cat_ids: Optional[Union[int, Tuple]] = None) -> None: + + init_default_scope(scope) + super().__init__( + model=model, weights=weights, device=device, scope=scope) + + # assign dataset metainfo to self.visualizer + self.visualizer.set_dataset_meta(self.model.dataset_meta) + + # initialize detector for top-down models + if self.cfg.data_mode == 'topdown': + if det_model is None: + det_model = DATASETS.get( + self.cfg.dataset_type).__module__.split( + 'datasets.')[-1].split('.')[0].lower() + det_info = default_det_models[det_model] + det_model, det_weights, det_cat_ids = det_info[ + 'model'], det_info['weights'], det_info['cat_ids'] + + if mmdet_available: + self.detector = DetInferencer( + det_model, det_weights, device=device) + else: + raise RuntimeError( + 'MMDetection (v3.0.0rc6 or above) is required to ' + 
'build inferencers for top-down pose estimation models.') + + if isinstance(det_cat_ids, (tuple, list)): + self.det_cat_ids = det_cat_ids + else: + self.det_cat_ids = (det_cat_ids, ) + + self._video_input = False + + def preprocess_single(self, + input: InputType, + index: int, + bbox_thr: float = 0.3, + nms_thr: float = 0.3): + """Process a single input into a model-feedable format. + + Args: + input (InputType): Input given by user. + index (int): index of the input + bbox_thr (float): threshold for bounding box detection. + Defaults to 0.3. + nms_thr (float): IoU threshold for bounding box NMS. + Defaults to 0.3. + + Yields: + Any: Data processed by the ``pipeline`` and ``collate_fn``. + """ + + if isinstance(input, str): + data_info = dict(img_path=input) + else: + data_info = dict(img=input, img_path=f'{index}.jpg'.rjust(10, '0')) + data_info.update(self.model.dataset_meta) + + if self.cfg.data_mode == 'topdown': + det_results = self.detector( + input, return_datasample=True)['predictions'] + pred_instance = det_results[0].pred_instances.cpu().numpy() + bboxes = np.concatenate( + (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1) + + label_mask = np.zeros(len(bboxes), dtype=np.uint8) + for cat_id in self.det_cat_ids: + label_mask = np.logical_or(label_mask, + pred_instance.labels == cat_id) + + bboxes = bboxes[np.logical_and(label_mask, + pred_instance.scores > bbox_thr)] + bboxes = bboxes[nms(bboxes, nms_thr)] + + data_infos = [] + if len(bboxes) > 0: + for bbox in bboxes: + inst = data_info.copy() + inst['bbox'] = bbox[None, :4] + inst['bbox_score'] = bbox[4:5] + data_infos.append(self.pipeline(inst)) + else: + inst = data_info.copy() + + # get bbox from the image size + if isinstance(input, str): + input = mmcv.imread(input) + h, w = input.shape[:2] + + inst['bbox'] = np.array([[0, 0, w, h]], dtype=np.float32) + inst['bbox_score'] = np.ones(1, dtype=np.float32) + data_infos.append(self.pipeline(inst)) + + else: # bottom-up + data_infos = 
[self.pipeline(data_info)] + + return data_infos + + def forward(self, inputs: Union[dict, tuple]): + data_samples = super().forward(inputs) + if self.cfg.data_mode == 'topdown': + data_samples = [merge_data_samples(data_samples)] + return data_samples + + def __call__( + self, + inputs: InputsType, + return_datasample: bool = False, + batch_size: int = 1, + out_dir: Optional[str] = None, + **kwargs, + ) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. + return_datasample (bool): Whether to return results as + :obj:`BaseDataElement`. Defaults to False. + batch_size (int): Batch size. Defaults to 1. + out_dir (str, optional): directory to save visualization + results and predictions. Will be overoden if vis_out_dir or + pred_out_dir are given. Defaults to None + **kwargs: Key words arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, + ``visualize_kwargs`` and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. + """ + if out_dir is not None: + if 'vis_out_dir' not in kwargs: + kwargs['vis_out_dir'] = f'{out_dir}/visualizations' + if 'pred_out_dir' not in kwargs: + kwargs['pred_out_dir'] = f'{out_dir}/predictions' + + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + # preprocessing + if isinstance(inputs, str) and inputs.startswith('webcam'): + inputs = self._get_webcam_inputs(inputs) + batch_size = 1 + if not visualize_kwargs.get('show', False): + warnings.warn('The display mode is closed when using webcam ' + 'input. 
It will be turned on automatically.') + visualize_kwargs['show'] = True + else: + inputs = self._inputs_to_list(inputs) + + inputs = self.preprocess( + inputs, batch_size=batch_size, **preprocess_kwargs) + + preds = [] + if not hasattr(self, 'detector'): + inputs = track(inputs, description='Inference') + + for proc_inputs, ori_inputs in inputs: + preds = self.forward(proc_inputs, **forward_kwargs) + + visualization = self.visualize(ori_inputs, preds, + **visualize_kwargs) + results = self.postprocess(preds, visualization, return_datasample, + **postprocess_kwargs) + yield results + + # merge visualization and prediction results + if self._video_input: + self._merge_outputs(**visualize_kwargs, **postprocess_kwargs) diff --git a/mmpose/apis/inferencers/utils/__init__.py b/mmpose/apis/inferencers/utils/__init__.py new file mode 100644 index 0000000000..e43e7b6734 --- /dev/null +++ b/mmpose/apis/inferencers/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .default_det_models import default_det_models + +__all__ = ['default_det_models'] diff --git a/mmpose/apis/inferencers/utils/default_det_models.py b/mmpose/apis/inferencers/utils/default_det_models.py new file mode 100644 index 0000000000..96fda2cf14 --- /dev/null +++ b/mmpose/apis/inferencers/utils/default_det_models.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp + +from mmengine.config.utils import MODULE2PACKAGE +from mmengine.utils import get_installed_path + +mmpose_path = get_installed_path(MODULE2PACKAGE['mmpose']) + +default_det_models = dict( + human=dict( + model=osp.join(mmpose_path, '.mim', + 'demo/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py'), + weights='https://download.openmmlab.com/mmdetection/v2.0/' + 'faster_rcnn/faster_rcnn_r50_fpn_1x_coco/' + 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth', + cat_ids=(0, )), + face=dict( + model=osp.join(mmpose_path, '.mim', + 'demo/mmdetection_cfg/yolox-s_8xb8-300e_coco-face.py'), + weights='https://download.openmmlab.com/mmpose/mmdet_pretrained/' + 'yolo-x_8xb8-300e_coco-face_13274d7c.pth', + cat_ids=(0, )), + hand=dict( + model=osp.join( + mmpose_path, '.mim', + 'demo/mmdetection_cfg/cascade_rcnn_x101_64x4d_fpn_1class.py'), + weights='https://download.openmmlab.com/mmpose/mmdet_pretrained/' + 'cascade_rcnn_x101_64x4d_fpn_20e_onehand10k-dac19597_20201030.pth', + cat_ids=(0, )), + animal=dict( + model=osp.join(mmpose_path, '.mim', + 'demo/mmdetection_cfg/faster_rcnn_r50_fpn_coco.py'), + weights='https://download.openmmlab.com/mmdetection/v2.0/' + 'faster_rcnn/faster_rcnn_r50_fpn_1x_coco/' + 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth', + cat_ids=(15, 16, 17, 18, 19, 20, 21, 22, 23)), +) + +default_det_models['body'] = default_det_models['human'] +default_det_models['wholebody'] = default_det_models['human'] diff --git a/mmpose/apis/webcam/nodes/model_nodes/detector_node.py b/mmpose/apis/webcam/nodes/model_nodes/detector_node.py index 9e66887266..350831fe62 100644 --- a/mmpose/apis/webcam/nodes/model_nodes/detector_node.py +++ b/mmpose/apis/webcam/nodes/model_nodes/detector_node.py @@ -3,13 +3,13 @@ import numpy as np +from mmpose.utils import adapt_mmdet_pipeline from ...utils import get_config_path from ..node import Node from ..registry import NODES try: from mmdet.apis import inference_detector, init_detector - from mmdet.utils import 
register_all_modules has_mmdet = True except (ImportError, ModuleNotFoundError): has_mmdet = False @@ -90,9 +90,9 @@ def __init__(self, self.bbox_thr = bbox_thr # Init model - register_all_modules() self.model = init_detector( self.model_config, self.model_checkpoint, device=self.device) + self.model.cfg = adapt_mmdet_pipeline(self.model.cfg) # Register buffers self.register_input_buffer(input_buffer, 'input', trigger=True) @@ -110,7 +110,6 @@ def process(self, input_msgs): img = input_msg.get_image() - register_all_modules() preds = inference_detector(self.model, img) objects = self._post_process(preds) input_msg.update_objects(objects) @@ -124,7 +123,7 @@ def _post_process(self, preds) -> List[Dict]: """Post-process the predictions of MMDetection model.""" instances = preds.pred_instances.cpu().numpy() - classes = self.model.dataset_meta['CLASSES'] + classes = self.model.dataset_meta['classes'] if isinstance(classes, str): classes = (classes, ) diff --git a/mmpose/apis/webcam/nodes/model_nodes/pose_estimator_node.py b/mmpose/apis/webcam/nodes/model_nodes/pose_estimator_node.py index fb2a87124c..64691cf560 100644 --- a/mmpose/apis/webcam/nodes/model_nodes/pose_estimator_node.py +++ b/mmpose/apis/webcam/nodes/model_nodes/pose_estimator_node.py @@ -5,7 +5,6 @@ import numpy as np from mmpose.apis import inference_topdown, init_model -from mmpose.utils import register_all_modules from ...utils import get_config_path from ..node import Node from ..registry import NODES @@ -91,7 +90,6 @@ def __init__(self, self.bbox_thr = bbox_thr # Init model - register_all_modules() self.model = init_model( self.model_config, self.model_checkpoint, device=self.device) @@ -119,7 +117,6 @@ def process(self, input_msgs): if len(objects) > 0: # Inference pose bboxes = np.stack([object['bbox'] for object in objects]) - register_all_modules() pose_results = inference_topdown(self.model, img, bboxes) # Update objects @@ -128,8 +125,8 @@ def process(self, input_msgs): object['keypoints'] = 
pred_instances.keypoints[0] object['keypoint_scores'] = pred_instances.keypoint_scores[0] - dataset_meta = object.get('dataset_meta', dict()) - dataset_meta.update(self.model.dataset_meta) + dataset_meta = self.model.dataset_meta.copy() + dataset_meta.update(object.get('dataset_meta', dict())) object['dataset_meta'] = dataset_meta object['pose_model_cfg'] = self.model.cfg diff --git a/mmpose/apis/webcam/utils/misc.py b/mmpose/apis/webcam/utils/misc.py index c3c8bfe050..6c6f5417ae 100644 --- a/mmpose/apis/webcam/utils/misc.py +++ b/mmpose/apis/webcam/utils/misc.py @@ -82,7 +82,7 @@ def load_image_from_disk_or_url(filename: str, def get_cached_file_path(url: str, - save_dir: Optional[str] = None, + save_dir: str, progress: bool = True, check_hash: bool = False, file_name: Optional[str] = None) -> str: @@ -97,7 +97,7 @@ def get_cached_file_path(url: str, Args: url (str): URL of the object to download - save_dir (str, optional): directory in which to save the object + save_dir (str): directory in which to save the object progress (bool): whether or not to display a progress bar to stderr. Default: ``True`` check_hash(bool): If True, the filename part of the URL @@ -112,8 +112,6 @@ def get_cached_file_path(url: str, Returns: str: The path to the cached file. """ - if save_dir is None: - save_dir = os.path.join('webcam_resources') mkdir_or_exist(save_dir) diff --git a/mmpose/codecs/__init__.py b/mmpose/codecs/__init__.py index def4172c8b..a88ebac701 100644 --- a/mmpose/codecs/__init__.py +++ b/mmpose/codecs/__init__.py @@ -1,13 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .associative_embedding import AssociativeEmbedding +from .decoupled_heatmap import DecoupledHeatmap from .integral_regression_label import IntegralRegressionLabel from .megvii_heatmap import MegviiHeatmap from .msra_heatmap import MSRAHeatmap from .regression_label import RegressionLabel from .simcc_label import SimCCLabel +from .spr import SPR from .udp_heatmap import UDPHeatmap __all__ = [ 'MSRAHeatmap', 'MegviiHeatmap', 'UDPHeatmap', 'RegressionLabel', - 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding' + 'SimCCLabel', 'IntegralRegressionLabel', 'AssociativeEmbedding', 'SPR', + 'DecoupledHeatmap' ] diff --git a/mmpose/codecs/associative_embedding.py b/mmpose/codecs/associative_embedding.py index 416ea651c3..7e080f1657 100644 --- a/mmpose/codecs/associative_embedding.py +++ b/mmpose/codecs/associative_embedding.py @@ -21,8 +21,8 @@ def _group_keypoints_by_tags(vals: np.ndarray, locs: np.ndarray, keypoint_order: List[int], val_thr: float, - tag_dist_thr: float = 1.0, - max_groups: Optional[int] = None): + tag_thr: float = 1.0, + max_groups: Optional[int] = None) -> np.ndarray: """Group the keypoints by tags using Munkres algorithm. Note: @@ -44,18 +44,15 @@ def _group_keypoints_by_tags(vals: np.ndarray, The groupping usually starts from a keypoints around the head and torso, and gruadually moves out to the limbs val_thr (float): The threshold of the keypoint response value - tag_dist_thr (float): The maximum allowed tag distance when matching a + tag_thr (float): The maximum allowed tag distance when matching a keypoint to a group. A keypoint with larger tag distance to any of the existing groups will initializes a new group max_groups (int, optional): The maximum group number. ``None`` means no limitation. 
Defaults to ``None`` Returns: - tuple: - - grouped_keypoints (np.ndarray): The grouped keypoints in shape - (G, K, D) - - grouped_keypoint_scores (np.ndarray): The grouped keypoint scores - in shape (G, K) + np.ndarray: grouped keypoints in shape (G, K, D+1), where the last + dimenssion is the concatenated keypoint coordinates and scores. """ K, M, D = locs.shape assert vals.shape == tags.shape[:2] == (K, M) @@ -109,7 +106,7 @@ def _init_group(): num_kpts, num_groups = dists.shape[:2] # Experimental cost function for keypoint-group matching - costs = np.round(dists) * 100 - vals_i + costs = np.round(dists) * 100 - vals_i[..., None] if num_kpts > num_groups: padding = np.full((num_kpts, num_kpts - num_groups), 1e10, @@ -120,7 +117,7 @@ def _init_group(): matches = munkres.compute(costs) for kpt_idx, group_idx in matches: if group_idx < num_groups and dists[kpt_idx, - group_idx] < tag_dist_thr: + group_idx] < tag_thr: # Add the keypoint to the matched group group = groups[group_idx] else: @@ -133,10 +130,13 @@ def _init_group(): group.tag_list.append(tags_i[kpt_idx]) groups = groups[:max_groups] - grouped_keypoints = np.stack((g.kpts for g in groups)) # (G, K, D) - grouped_keypoint_scores = np.stack((g.scores for g in groups)) # (G, K) + if groups: + grouped_keypoints = np.stack( + [np.r_['1', g.kpts, g.scores[:, None]] for g in groups]) + else: + grouped_keypoints = np.empty((0, K, D + 1)) - return grouped_keypoints, grouped_keypoint_scores + return grouped_keypoints @KEYPOINT_CODECS.register_module() @@ -144,7 +144,7 @@ class AssociativeEmbedding(BaseKeypointCodec): """Encode/decode keypoints with the method introduced in "Associative Embedding". This is an asymmetric codec, where the keypoints are represented as gaussian heatmaps and position indices during encoding, and - reostred from predicted heatmaps and group tags. + restored from predicted heatmaps and group tags. 
See the paper `Associative Embedding: End-to-End Learning for Joint Detection and Grouping`_ by Newell et al (2017) for details @@ -158,6 +158,15 @@ class AssociativeEmbedding(BaseKeypointCodec): - image size: [w, h] - heatmap size: [W, H] + Encoded: + + - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) + where [W, H] is the `heatmap_size` + - keypoint_indices (np.ndarray): The keypoint position indices in shape + (N, K, 2). Each keypoint's index is [i, v], where i is the position + index in the heatmap (:math:`i=y*w+x`) and v is the visibility + - keypoint_weights (np.ndarray): The target weights in shape (N, K) + Args: input_size (tuple): Image size in [w, h] heatmap_size (tuple): Heatmap size in [W, H] @@ -167,8 +176,12 @@ class AssociativeEmbedding(BaseKeypointCodec): decode_keypoint_order (List[int]): The grouping order of the keypoint indices. The groupping usually starts from a keypoints around the head and torso, and gruadually moves out to the limbs - decode_thr (float): The threshold of keypoint response value in - heatmaps. Defaults to 0.1 + decode_keypoint_thr (float): The threshold of keypoint response value + in heatmaps. Defaults to 0.1 + decode_tag_thr (float): The maximum allowed tag distance when matching + a keypoint to a group. A keypoint with larger tag distance to any + of the existing groups will initializes a new group. Defaults to + 1.0 decode_nms_kernel (int): The kernel size of the NMS during decoding, which should be an odd integer. Defaults to 5 decode_gaussian_kernel (int): The kernel size of the Gaussian blur @@ -186,54 +199,44 @@ class AssociativeEmbedding(BaseKeypointCodec): .. 
_`UDP (CVPR 2020)`: https://arxiv.org/abs/1911.07524 """ - def __init__(self, - input_size: Tuple[int, int], - heatmap_size: Tuple[int, int], - sigma: Optional[float] = None, - use_udp: bool = False, - decode_keypoint_order: List[int] = [], - decode_nms_kernel: int = 5, - decode_gaussian_kernel: int = 3, - decode_thr: float = 0.1, - decode_topk: int = 20, - decode_max_instances: Optional[int] = None, - tag_per_keypoint: bool = True) -> None: + def __init__( + self, + input_size: Tuple[int, int], + heatmap_size: Tuple[int, int], + sigma: Optional[float] = None, + use_udp: bool = False, + decode_keypoint_order: List[int] = [], + decode_nms_kernel: int = 5, + decode_gaussian_kernel: int = 3, + decode_keypoint_thr: float = 0.1, + decode_tag_thr: float = 1.0, + decode_topk: int = 20, + decode_max_instances: Optional[int] = None, + ) -> None: super().__init__() self.input_size = input_size self.heatmap_size = heatmap_size self.use_udp = use_udp self.decode_nms_kernel = decode_nms_kernel self.decode_gaussian_kernel = decode_gaussian_kernel - self.decode_thr = decode_thr + self.decode_keypoint_thr = decode_keypoint_thr + self.decode_tag_thr = decode_tag_thr self.decode_topk = decode_topk self.decode_max_instances = decode_max_instances - self.tag_per_keypoint = tag_per_keypoint self.dedecode_keypoint_order = decode_keypoint_order.copy() + if self.use_udp: + self.scale_factor = ((np.array(input_size) - 1) / + (np.array(heatmap_size) - 1)).astype( + np.float32) + else: + self.scale_factor = (np.array(input_size) / + heatmap_size).astype(np.float32) + if sigma is None: sigma = (heatmap_size[0] * heatmap_size[1])**0.5 / 64 self.sigma = sigma - def _get_scale_factor(self, input_size: Tuple[int, int], - heatmap_size: Tuple[int, int]) -> np.ndarray: - """Calculate scale factors from the input size and the heatmap size. 
- - Args: - input_size (tuple): Image size in [w, h] - heatmap_size (tuple): Heatmap size in [W, H] - - Returns: - np.ndarray: scale factors in [fx, fy] where :math:`fx=w/W` and - :math:`fy=h/H`. - """ - if self.use_udp: - scale_factor = ((np.array(input_size) - 1) / - (np.array(heatmap_size) - 1)).astype(np.float32) - else: - scale_factor = (np.array(input_size) / - heatmap_size).astype(np.float32) - return scale_factor - def encode( self, keypoints: np.ndarray, @@ -248,7 +251,7 @@ def encode( (N, K) Returns: - tuple: + dict: - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) where [W, H] is the `heatmap_size` - keypoint_indices (np.ndarray): The keypoint position indices @@ -259,23 +262,20 @@ def encode( (N, K) """ - scale_factor = self._get_scale_factor(self.input_size, - self.heatmap_size) - if keypoints_visible is None: keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) # keypoint coordinates in heatmap - _keypoints = keypoints / scale_factor + _keypoints = keypoints / self.scale_factor if self.use_udp: - heatmaps, keypoints_weights = generate_udp_gaussian_heatmaps( + heatmaps, keypoint_weights = generate_udp_gaussian_heatmaps( heatmap_size=self.heatmap_size, keypoints=_keypoints, keypoints_visible=keypoints_visible, sigma=self.sigma) else: - heatmaps, keypoints_weights = generate_gaussian_heatmaps( + heatmaps, keypoint_weights = generate_gaussian_heatmaps( heatmap_size=self.heatmap_size, keypoints=_keypoints, keypoints_visible=keypoints_visible, @@ -286,7 +286,12 @@ def encode( keypoints=_keypoints, keypoints_visible=keypoints_visible) - return heatmaps, keypoint_indices, keypoints_weights + encoded = dict( + heatmaps=heatmaps, + keypoint_indices=keypoint_indices, + keypoint_weights=keypoint_weights) + + return encoded def _encode_keypoint_indices(self, heatmap_size: Tuple[int, int], keypoints: np.ndarray, @@ -337,7 +342,7 @@ def _get_batch_topk(self, batch_heatmaps: Tensor, batch_tags: Tensor, topk_tags_per_kpts = [ 
torch.gather(_tag, dim=2, index=topk_indices) - for _tag in torch.unbind(batch_tags.view(B, K, L, H * W), dim=2) + for _tag in torch.unbind(batch_tags.view(B, L, K, H * W), dim=1) ] topk_tags = torch.stack(topk_tags_per_kpts, dim=-1) # (B, K, TopK, L) @@ -359,9 +364,10 @@ def _group_keypoints(self, batch_vals: np.ndarray, batch_tags: np.ndarray, (B, K, Topk, 2) Returns: - List[Tuple[np.ndarray, np.ndarray]]: Grouping results of a batch, - eath element is a tuple of keypoints (in shape [N, K, D]) and - keypoint scores (in shape [N, K]) decoded from one image. + List[np.ndarray]: Grouping results of a batch, each element is a + np.ndarray (in shape [N, K, D+1]) that contains the groups + detected in an image, including both keypoint coordinates and + scores. """ def _group_func(inputs: Tuple): @@ -371,7 +377,8 @@ def _group_func(inputs: Tuple): tags, locs, keypoint_order=self.dedecode_keypoint_order, - val_thr=self.decode_thr, + val_thr=self.decode_keypoint_thr, + tag_thr=self.decode_tag_thr, max_groups=self.decode_max_instances) _results = map(_group_func, zip(batch_vals, batch_tags, batch_locs)) @@ -390,7 +397,7 @@ def _fill_missing_keypoints(self, keypoints: np.ndarray, missing in the initial prediction heatmaps (np.ndarry): Heatmaps in shape (K, H, W) tags (np.ndarray): Tagging heatmaps in shape (C, H, W) where - C=K*L + C=L*K Returns: tuple: @@ -402,7 +409,8 @@ def _fill_missing_keypoints(self, keypoints: np.ndarray, N, K = keypoints.shape[:2] H, W = heatmaps.shape[1:] - keypoint_tags = np.split(tags, K, axis=0) + L = tags.shape[0] // K + keypoint_tags = [tags[k::K] for k in range(K)] for n in range(N): # Calculate the instance tag (mean tag of detected keypoints) @@ -413,13 +421,15 @@ def _fill_missing_keypoints(self, keypoints: np.ndarray, x = np.clip(x, 0, W - 1) y = np.clip(y, 0, H - 1) _tag.append(keypoint_tags[k][:, y, x]) - tag = np.mean(_tag, axis=0) + tag = np.mean(_tag, axis=0) + tag = tag.reshape(L, 1, 1) # Search maximum response of the missing 
keypoints for k in range(K): if keypoint_scores[n, k] > 0: continue - dist_map = np.linalg.norm(keypoint_tags - tag, ord=2, axis=0) + dist_map = np.linalg.norm( + keypoint_tags[k] - tag, ord=2, axis=0) cost_map = np.round(dist_map) * 100 - heatmaps[k] # H, W y, x = np.unravel_index(np.argmin(cost_map), shape=(H, W)) keypoints[n, k] = [x, y] @@ -427,12 +437,8 @@ def _fill_missing_keypoints(self, keypoints: np.ndarray, return keypoints, keypoint_scores - def batch_decode( - self, - batch_heatmaps: Tensor, - batch_tags: Tensor, - input_sizes: Optional[Tuple[int, int]] = None - ) -> Tuple[List[np.ndarray], List[np.ndarray]]: + def batch_decode(self, batch_heatmaps: Tensor, batch_tags: Tensor + ) -> Tuple[List[np.ndarray], List[np.ndarray]]: """Decode the keypoint coordinates from a batch of heatmaps and tagging heatmaps. The decoded keypoint coordinates are in the input image space. @@ -441,13 +447,7 @@ def batch_decode( batch_heatmaps (Tensor): Keypoint detection heatmaps in shape (B, K, H, W) batch_tags (Tensor): Tagging heatmaps in shape (B, C, H, W), where - :math:`C=L` if `tag_per_keypoint==False`, or - :math:`C=L*K` otherwise - input_sizes (List[Tuple[int, int]], optional): Manually set the - input size [w, h] of each sample for decoding. This is useful - when inference a model on images with arbitrary sizes. If not - given, the value `self.input_size` set at initialization will - be used for all samples. Defaults to ``None`` + :math:`C=L*K` Returns: tuple: @@ -457,14 +457,11 @@ def batch_decode( batch, each is in shape (N, K). 
It usually represents the confidience of the keypoint prediction """ - B, K, H, W = batch_heatmaps.shape + B, _, H, W = batch_heatmaps.shape assert batch_tags.shape[0] == B and batch_tags.shape[2:4] == (H, W), ( - f'Unmatched shapes of heatmap ({batch_heatmaps.shape}) and ' + f'Mismatched shapes of heatmap ({batch_heatmaps.shape}) and ' f'tagging map ({batch_tags.shape})') - if not self.tag_per_keypoint: - batch_tags = batch_tags.repeat((1, K, 1, 1)) - # Heatmap NMS batch_heatmaps = batch_heatmap_nms(batch_heatmaps, self.decode_nms_kernel) @@ -478,44 +475,38 @@ def batch_decode( batch_groups = self._group_keypoints(batch_topk_vals, batch_topk_tags, batch_topk_locs) - batch_keypoints, batch_keypoint_scores = map(list, zip(*batch_groups)) - # Convert to numpy batch_heatmaps_np = to_numpy(batch_heatmaps) batch_tags_np = to_numpy(batch_tags) # Refine the keypoint prediction - for i, (keypoints, scores, heatmaps, tags) in enumerate( - zip(batch_keypoints, batch_keypoint_scores, batch_heatmaps_np, - batch_tags_np)): - - # identify missing keypoints - keypoints, scores = self._fill_missing_keypoints( - keypoints, scores, heatmaps, tags) - - # refine keypoint coordinates according to heatmap distribution - if self.use_udp: - keypoints = refine_keypoints_dark_udp( - keypoints, - heatmaps, - blur_kernel_size=self.decode_gaussian_kernel) - else: - keypoints = refine_keypoints(keypoints, heatmaps) - - batch_keypoints[i] = keypoints - batch_keypoint_scores[i] = scores - - # restore keypoint scale - if input_sizes is None: - input_sizes = [self.input_size] * B - else: - assert len(input_sizes) == B + batch_keypoints = [] + batch_keypoint_scores = [] + for i, (groups, heatmaps, tags) in enumerate( + zip(batch_groups, batch_heatmaps_np, batch_tags_np)): + + keypoints, scores = groups[..., :-1], groups[..., -1] + + if keypoints.size > 0: + # identify missing keypoints + keypoints, scores = self._fill_missing_keypoints( + keypoints, scores, heatmaps, tags) + + # refine keypoint 
coordinates according to heatmap distribution + if self.use_udp: + keypoints = refine_keypoints_dark_udp( + keypoints, + heatmaps, + blur_kernel_size=self.decode_gaussian_kernel) + else: + keypoints = refine_keypoints(keypoints, heatmaps) - heatmap_size = (W, H) + batch_keypoints.append(keypoints) + batch_keypoint_scores.append(scores) + # restore keypoint scale batch_keypoints = [ - kpts * self._get_scale_factor(input_size, heatmap_size) - for kpts, input_size in zip(batch_keypoints, input_sizes) + kpts * self.scale_factor for kpts in batch_keypoints ] return batch_keypoints, batch_keypoint_scores diff --git a/mmpose/codecs/base.py b/mmpose/codecs/base.py index 88db2e4baa..d8479fdf1e 100644 --- a/mmpose/codecs/base.py +++ b/mmpose/codecs/base.py @@ -14,10 +14,14 @@ class BaseKeypointCodec(metaclass=ABCMeta): the methods :meth:`encode` and :meth:`decode`. """ + # pass additional encoding arguments to the `encode` method, beyond the + # mandatory `keypoints` and `keypoints_visible` arguments. + auxiliary_encode_keys = set() + @abstractmethod def encode(self, keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None) -> Any: + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encode keypoints. Note: @@ -30,6 +34,9 @@ def encode(self, keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) keypoints_visible (np.ndarray): Keypoint visibility in shape (N, K, D) + + Returns: + dict: Encoded items. """ @abstractmethod diff --git a/mmpose/codecs/decoupled_heatmap.py b/mmpose/codecs/decoupled_heatmap.py new file mode 100644 index 0000000000..da38a4ce2c --- /dev/null +++ b/mmpose/codecs/decoupled_heatmap.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import random +from typing import Optional, Tuple + +import numpy as np + +from mmpose.registry import KEYPOINT_CODECS +from .base import BaseKeypointCodec +from .utils import (generate_gaussian_heatmaps, get_diagonal_lengths, + get_instance_bbox, get_instance_root) +from .utils.post_processing import get_heatmap_maximum +from .utils.refinement import refine_keypoints + + +@KEYPOINT_CODECS.register_module() +class DecoupledHeatmap(BaseKeypointCodec): + """Encode/decode keypoints with the method introduced in the paper CID. + + See the paper Contextual Instance Decoupling for Robust Multi-Person + Pose Estimation`_ by Wang et al (2022) for details + + Note: + + - instance number: N + - keypoint number: K + - keypoint dimension: D + - image size: [w, h] + - heatmap size: [W, H] + + Encoded: + - heatmaps (np.ndarray): The coupled heatmap in shape + (1+K, H, W) where [W, H] is the `heatmap_size`. + - instance_heatmaps (np.ndarray): The decoupled heatmap in shape + (M*K, H, W) where M is the number of instances. + - keypoint_weights (np.ndarray): The weight for heatmaps in shape + (M*K). + - instance_coords (np.ndarray): The coordinates of instance roots + in shape (M, 2) + + Args: + input_size (tuple): Image size in [w, h] + heatmap_size (tuple): Heatmap size in [W, H] + root_type (str): The method to generate the instance root. Options + are: + + - ``'kpt_center'``: Average coordinate of all visible keypoints. + - ``'bbox_center'``: Center point of bounding boxes outlined by + all visible keypoints. + + Defaults to ``'kpt_center'`` + + heatmap_min_overlap (float): Minimum overlap rate among instances. + Used when calculating sigmas for instances. Defaults to 0.7 + background_weight (float): Loss weight of background pixels. + Defaults to 0.1 + encode_max_instances (int): The maximum number of instances + to encode for each sample. Defaults to 30 + + .. 
_`CID`: https://openaccess.thecvf.com/content/CVPR2022/html/Wang_ + Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_ + CVPR_2022_paper.html + """ + + # DecoupledHeatmap requires bounding boxes to determine the size of each + # instance, so that it can assign varying sigmas based on their size + auxiliary_encode_keys = {'bbox'} + + def __init__( + self, + input_size: Tuple[int, int], + heatmap_size: Tuple[int, int], + root_type: str = 'kpt_center', + heatmap_min_overlap: float = 0.7, + encode_max_instances: int = 30, + ): + super().__init__() + + self.input_size = input_size + self.heatmap_size = heatmap_size + self.root_type = root_type + self.encode_max_instances = encode_max_instances + self.heatmap_min_overlap = heatmap_min_overlap + + self.scale_factor = (np.array(input_size) / + heatmap_size).astype(np.float32) + + def _get_instance_wise_sigmas( + self, + bbox: np.ndarray, + ) -> np.ndarray: + """Get sigma values for each instance according to their size. + + Args: + bbox (np.ndarray): Bounding box in shape (N, 4, 2) + + Returns: + np.ndarray: Array containing the sigma values for each instance. 
+ """ + sigmas = np.zeros((bbox.shape[0], ), dtype=np.float32) + + heights = np.sqrt(np.power(bbox[:, 0] - bbox[:, 1], 2).sum(axis=-1)) + widths = np.sqrt(np.power(bbox[:, 0] - bbox[:, 2], 2).sum(axis=-1)) + + for i in range(bbox.shape[0]): + h, w = heights[i], widths[i] + + # compute sigma for each instance + # condition 1 + a1, b1 = 1, h + w + c1 = w * h * (1 - self.heatmap_min_overlap) / ( + 1 + self.heatmap_min_overlap) + sq1 = np.sqrt(b1**2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + # condition 2 + a2 = 4 + b2 = 2 * (h + w) + c2 = (1 - self.heatmap_min_overlap) * w * h + sq2 = np.sqrt(b2**2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + # condition 3 + a3 = 4 * self.heatmap_min_overlap + b3 = -2 * self.heatmap_min_overlap * (h + w) + c3 = (self.heatmap_min_overlap - 1) * w * h + sq3 = np.sqrt(b3**2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + + sigmas[i] = min(r1, r2, r3) / 3 + + return sigmas + + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None, + bbox: Optional[np.ndarray] = None) -> dict: + """Encode keypoints into heatmaps. + + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) + keypoints_visible (np.ndarray): Keypoint visibilities in shape + (N, K) + bbox (np.ndarray): Bounding box in shape (N, 8) which includes + coordinates of 4 corners. + + Returns: + dict: + - heatmaps (np.ndarray): The coupled heatmap in shape + (1+K, H, W) where [W, H] is the `heatmap_size`. + - instance_heatmaps (np.ndarray): The decoupled heatmap in shape + (N*K, H, W) where M is the number of instances. + - keypoint_weights (np.ndarray): The weight for heatmaps in shape + (N*K). 
+ - instance_coords (np.ndarray): The coordinates of instance roots + in shape (N, 2) + """ + + if keypoints_visible is None: + keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) + if bbox is None: + # generate pseudo bbox via visible keypoints + bbox = get_instance_bbox(keypoints, keypoints_visible) + bbox = np.tile(bbox, 2).reshape(-1, 4, 2) + # corner order: left_top, left_bottom, right_top, right_bottom + bbox[:, 1:3, 0] = bbox[:, 0:2, 0] + + # keypoint coordinates in heatmap + _keypoints = keypoints / self.scale_factor + _bbox = bbox.reshape(-1, 4, 2) / self.scale_factor + + # compute the root and scale of each instance + roots, roots_visible = get_instance_root(_keypoints, keypoints_visible, + self.root_type) + + sigmas = self._get_instance_wise_sigmas(_bbox) + + # generate global heatmaps + heatmaps, keypoint_weights = generate_gaussian_heatmaps( + heatmap_size=self.heatmap_size, + keypoints=np.concatenate((_keypoints, roots[:, None]), axis=1), + keypoints_visible=np.concatenate( + (keypoints_visible, roots_visible[:, None]), axis=1), + sigma=sigmas) + roots_visible = keypoint_weights[:, -1] + + # select instances + inst_roots, inst_indices = [], [] + diagonal_lengths = get_diagonal_lengths(_keypoints, keypoints_visible) + for i in np.argsort(diagonal_lengths): + if roots_visible[i] < 1: + continue + # rand root point in 3x3 grid + x, y = roots[i] + np.random.randint(-1, 2, (2, )) + x = max(0, min(x, self.heatmap_size[0] - 1)) + y = max(0, min(y, self.heatmap_size[1] - 1)) + if (x, y) not in inst_roots: + inst_roots.append((x, y)) + inst_indices.append(i) + if len(inst_indices) > self.encode_max_instances: + rand_indices = random.sample( + range(len(inst_indices)), self.encode_max_instances) + inst_roots = [inst_roots[i] for i in rand_indices] + inst_indices = [inst_indices[i] for i in rand_indices] + + # generate instance-wise heatmaps + inst_heatmaps, inst_heatmap_weights = [], [] + for i in inst_indices: + inst_heatmap, inst_heatmap_weight 
= generate_gaussian_heatmaps( + heatmap_size=self.heatmap_size, + keypoints=_keypoints[i:i + 1], + keypoints_visible=keypoints_visible[i:i + 1], + sigma=sigmas[i].item()) + inst_heatmaps.append(inst_heatmap) + inst_heatmap_weights.append(inst_heatmap_weight) + + if len(inst_indices) > 0: + inst_heatmaps = np.concatenate(inst_heatmaps) + inst_heatmap_weights = np.concatenate(inst_heatmap_weights) + inst_roots = np.array(inst_roots, dtype=np.int32) + else: + inst_heatmaps = np.empty((0, *self.heatmap_size[::-1])) + inst_heatmap_weights = np.empty((0, )) + inst_roots = np.empty((0, 2), dtype=np.int32) + + encoded = dict( + heatmaps=heatmaps, + instance_heatmaps=inst_heatmaps, + keypoint_weights=inst_heatmap_weights, + instance_coords=inst_roots) + + return encoded + + def decode(self, instance_heatmaps: np.ndarray, + instance_scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Decode keypoint coordinates from decoupled heatmaps. The decoded + keypoint coordinates are in the input image space. + + Args: + instance_heatmaps (np.ndarray): Heatmaps in shape (N, K, H, W) + instance_scores (np.ndarray): Confidence of instance roots + prediction in shape (N, 1) + + Returns: + tuple: + - keypoints (np.ndarray): Decoded keypoint coordinates in shape + (N, K, D) + - scores (np.ndarray): The keypoint scores in shape (N, K). 
It + usually represents the confidence of the keypoint prediction + """ + keypoints, keypoint_scores = [], [] + + for i in range(instance_heatmaps.shape[0]): + heatmaps = instance_heatmaps[i].copy() + kpts, scores = get_heatmap_maximum(heatmaps) + keypoints.append(refine_keypoints(kpts[None], heatmaps)) + keypoint_scores.append(scores[None]) + + keypoints = np.concatenate(keypoints) + # Restore the keypoint scale + keypoints = keypoints * self.scale_factor + + keypoint_scores = np.concatenate(keypoint_scores) + keypoint_scores *= instance_scores + + return keypoints, keypoint_scores diff --git a/mmpose/codecs/integral_regression_label.py b/mmpose/codecs/integral_regression_label.py index c1bc75c0f6..ed8e72cb10 100644 --- a/mmpose/codecs/integral_regression_label.py +++ b/mmpose/codecs/integral_regression_label.py @@ -17,7 +17,18 @@ class IntegralRegressionLabel(BaseKeypointCodec): Note: - - input image size: [w, h] + - instance number: N + - keypoint number: K + - keypoint dimension: D + - image size: [w, h] + + Encoded: + + - keypoint_labels (np.ndarray): The normalized regression labels in + shape (N, K, D) where D is 2 for 2d coordinates + - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) where + [W, H] is the `heatmap_size` + - keypoint_weights (np.ndarray): The target weights in shape (N, K) Args: input_size (tuple): Input image size in [w, h] @@ -48,11 +59,9 @@ def __init__(self, self.keypoint_codec = RegressionLabel(input_size) self.normalize = normalize - def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints to regression labels and heatmaps. 
Args: @@ -61,24 +70,31 @@ def encode( (N, K) Returns: - tuple: - - reg_labels (np.ndarray): The normalized regression labels in + dict: + - keypoint_labels (np.ndarray): The normalized regression labels in shape (N, K, D) where D is 2 for 2d coordinates - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) where [W, H] is the `heatmap_size` - keypoint_weights (np.ndarray): The target weights in shape (N, K) """ - heatmaps, keypoint_weights = self.heatmap_codec.encode( - keypoints, keypoints_visible) - reg_labels, keypoint_weights = self.keypoint_codec.encode( - keypoints, keypoint_weights) + encoded_hm = self.heatmap_codec.encode(keypoints, keypoints_visible) + encoded_kp = self.keypoint_codec.encode(keypoints, keypoints_visible) + + heatmaps = encoded_hm['heatmaps'] + keypoint_labels = encoded_kp['keypoint_labels'] + keypoint_weights = encoded_kp['keypoint_weights'] if self.normalize: val_sum = heatmaps.sum(axis=(-1, -2)).reshape(-1, 1, 1) + 1e-24 heatmaps = heatmaps / val_sum - return heatmaps, reg_labels, keypoint_weights + encoded = dict( + keypoint_labels=keypoint_labels, + heatmaps=heatmaps, + keypoint_weights=keypoint_weights) + + return encoded def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from normalized space to input image diff --git a/mmpose/codecs/megvii_heatmap.py b/mmpose/codecs/megvii_heatmap.py index 7fc0eba3bf..e898004637 100644 --- a/mmpose/codecs/megvii_heatmap.py +++ b/mmpose/codecs/megvii_heatmap.py @@ -23,6 +23,12 @@ class MegviiHeatmap(BaseKeypointCodec): - image size: [w, h] - heatmap size: [W, H] + Encoded: + + - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) + where [W, H] is the `heatmap_size` + - keypoint_weights (np.ndarray): The target weights in shape (N, K) + Args: input_size (tuple): Image size in [w, h] heatmap_size (tuple): Heatmap size in [W, H] @@ -47,11 +53,9 @@ def __init__( self.scale_factor = (np.array(input_size) / 
heatmap_size).astype(np.float32) - def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encode keypoints into heatmaps. Note that the original keypoint coordinates should be in the input image space. @@ -61,7 +65,7 @@ def encode( (N, K) Returns: - tuple: + dict: - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) where [W, H] is the `heatmap_size` - keypoint_weights (np.ndarray): The target weights in shape @@ -96,7 +100,9 @@ def encode( # normalize the heatmap heatmaps[k] = heatmaps[k] / heatmaps[k, ky, kx] * 255. - return heatmaps, keypoint_weights + encoded = dict(heatmaps=heatmaps, keypoint_weights=keypoint_weights) + + return encoded def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from heatmaps. The decoded keypoint diff --git a/mmpose/codecs/msra_heatmap.py b/mmpose/codecs/msra_heatmap.py index 1149ba2511..63ba292e4d 100644 --- a/mmpose/codecs/msra_heatmap.py +++ b/mmpose/codecs/msra_heatmap.py @@ -25,6 +25,12 @@ class MSRAHeatmap(BaseKeypointCodec): - image size: [w, h] - heatmap size: [W, H] + Encoded: + + - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) + where [W, H] is the `heatmap_size` + - keypoint_weights (np.ndarray): The target weights in shape (N, K) + Args: input_size (tuple): Image size in [w, h] heatmap_size (tuple): Heatmap size in [W, H] @@ -65,11 +71,9 @@ def __init__(self, self.scale_factor = (np.array(input_size) / heatmap_size).astype(np.float32) - def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encode keypoints into heatmaps. 
Note that the original keypoint coordinates should be in the input image space. @@ -79,7 +83,7 @@ def encode( (N, K) Returns: - tuple: + dict: - heatmaps (np.ndarray): The generated heatmap in shape (K, H, W) where [W, H] is the `heatmap_size` - keypoint_weights (np.ndarray): The target weights in shape @@ -106,7 +110,9 @@ def encode( keypoints_visible=keypoints_visible, sigma=self.sigma) - return heatmaps, keypoint_weights + encoded = dict(heatmaps=heatmaps, keypoint_weights=keypoint_weights) + + return encoded def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from heatmaps. The decoded keypoint diff --git a/mmpose/codecs/regression_label.py b/mmpose/codecs/regression_label.py index 4edc97445a..9ae385d2d9 100644 --- a/mmpose/codecs/regression_label.py +++ b/mmpose/codecs/regression_label.py @@ -14,7 +14,16 @@ class RegressionLabel(BaseKeypointCodec): Note: - - input image size: [w, h] + - instance number: N + - keypoint number: K + - keypoint dimension: D + - image size: [w, h] + + Encoded: + + - keypoint_labels (np.ndarray): The normalized regression labels in + shape (N, K, D) where D is 2 for 2d coordinates + - keypoint_weights (np.ndarray): The target weights in shape (N, K) Args: input_size (tuple): Input image size in [w, h] @@ -26,11 +35,9 @@ def __init__(self, input_size: Tuple[int, int]) -> None: self.input_size = input_size - def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints from input image space to normalized space. 
Args: @@ -39,8 +46,8 @@ def encode( (N, K) Returns: - tuple: - - reg_labels (np.ndarray): The normalized regression labels in + dict: + - keypoint_labels (np.ndarray): The normalized regression labels in shape (N, K, D) where D is 2 for 2d coordinates - keypoint_weights (np.ndarray): The target weights in shape (N, K) @@ -53,10 +60,13 @@ def encode( (keypoints <= [w - 1, h - 1])).all(axis=-1) & ( keypoints_visible > 0.5) - reg_labels = (keypoints / np.array([w, h])).astype(np.float32) + keypoint_labels = (keypoints / np.array([w, h])).astype(np.float32) keypoint_weights = np.where(valid, 1., 0.).astype(np.float32) - return reg_labels, keypoint_weights + encoded = dict( + keypoint_labels=keypoint_labels, keypoint_weights=keypoint_weights) + + return encoded def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from normalized space to input image diff --git a/mmpose/codecs/simcc_label.py b/mmpose/codecs/simcc_label.py index a02ce99ac3..a22498c352 100644 --- a/mmpose/codecs/simcc_label.py +++ b/mmpose/codecs/simcc_label.py @@ -5,6 +5,7 @@ import numpy as np from mmpose.codecs.utils import get_simcc_maximum +from mmpose.codecs.utils.refinement import refine_simcc_dark from mmpose.registry import KEYPOINT_CODECS from .base import BaseKeypointCodec @@ -18,14 +19,29 @@ class SimCCLabel(BaseKeypointCodec): Note: - - input image size: [w, h] + - instance number: N + - keypoint number: K + - keypoint dimension: D + - image size: [w, h] + + Encoded: + + - keypoint_x_labels (np.ndarray): The generated SimCC label for x-axis. + The label shape is (N, K, Wx) if ``smoothing_type=='gaussian'`` + and (N, K) if `smoothing_type=='standard'``, where + :math:`Wx=w*simcc_split_ratio` + - keypoint_y_labels (np.ndarray): The generated SimCC label for y-axis. 
+ The label shape is (N, K, Wy) if ``smoothing_type=='gaussian'`` + and (N, K) if `smoothing_type=='standard'``, where + :math:`Wy=h*simcc_split_ratio` + - keypoint_weights (np.ndarray): The target weights in shape (N, K) Args: input_size (tuple): Input image size in [w, h] smoothing_type (str): The SimCC label smoothing strategy. Options are - ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'`` - sigma (str): The sigma value in the Gaussian SimCC label. Defaults to - 6.0 + ``'gaussian'`` and ``'standard'``. Defaults to ``'gaussian'`` + sigma (float | int | tuple): The sigma value in the Gaussian SimCC + label. Defaults to 6.0 simcc_split_ratio (float): The ratio of the label size to the input size. For example, if the input width is ``w``, the x label size will be :math:`w*simcc_split_ratio`. Defaults to 2.0 @@ -39,18 +55,24 @@ class SimCCLabel(BaseKeypointCodec): def __init__(self, input_size: Tuple[int, int], smoothing_type: str = 'gaussian', - sigma: float = 6.0, + sigma: Union[float, int, Tuple[float]] = 6.0, simcc_split_ratio: float = 2.0, label_smooth_weight: float = 0.0, - normalize: bool = True) -> None: + normalize: bool = True, + use_dark: bool = False) -> None: super().__init__() self.input_size = input_size self.smoothing_type = smoothing_type - self.sigma = sigma self.simcc_split_ratio = simcc_split_ratio self.label_smooth_weight = label_smooth_weight self.normalize = normalize + self.use_dark = use_dark + + if isinstance(sigma, (float, int)): + self.sigma = np.array([sigma, sigma]) + else: + self.sigma = np.array(sigma) if self.smoothing_type not in {'gaussian', 'standard'}: raise ValueError( @@ -65,12 +87,9 @@ def __init__(self, if self.label_smooth_weight < 0.0 or self.label_smooth_weight > 1.0: raise ValueError('`label_smooth_weight` should be in range [0, 1]') - def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, - 
np.ndarray]]: + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encoding keypoints into SimCC labels. Note that the original keypoint coordinates should be in the input image space. @@ -80,12 +99,14 @@ def encode( (N, K) Returns: - tuple: - - simcc_x (np.ndarray): The generated SimCC label for x-axis. + dict: + - keypoint_x_labels (np.ndarray): The generated SimCC label for + x-axis. The label shape is (N, K, Wx) if ``smoothing_type=='gaussian'`` and (N, K) if `smoothing_type=='standard'``, where :math:`Wx=w*simcc_split_ratio` - - simcc_y (np.ndarray): The generated SimCC label for y-axis. + - keypoint_y_labels (np.ndarray): The generated SimCC label for + y-axis. The label shape is (N, K, Wy) if ``smoothing_type=='gaussian'`` and (N, K) if `smoothing_type=='standard'``, where :math:`Wy=h*simcc_split_ratio` @@ -96,24 +117,34 @@ def encode( keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) if self.smoothing_type == 'gaussian': - return self._generate_gaussian(keypoints, keypoints_visible) + x_labels, y_labels, keypoint_weights = self._generate_gaussian( + keypoints, keypoints_visible) elif self.smoothing_type == 'standard': - return self._generate_standard(keypoints, keypoints_visible) + x_labels, y_labels, keypoint_weights = self._generate_standard( + keypoints, keypoints_visible) else: raise ValueError( f'{self.__class__.__name__} got invalid `smoothing_type` value' f'{self.smoothing_type}. Should be one of ' '{"gaussian", "standard"}') - def decode(self, - encoded: Tuple[np.ndarray, - np.ndarray]) -> Tuple[np.ndarray, np.ndarray]: + encoded = dict( + keypoint_x_labels=x_labels, + keypoint_y_labels=y_labels, + keypoint_weights=keypoint_weights) + + return encoded + + def decode(self, simcc_x: np.ndarray, + simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from SimCC representations. The decoded coordinates are in the input image space. 
Args: encoded (Tuple[np.ndarray, np.ndarray]): SimCC labels for x-axis and y-axis + simcc_x (np.ndarray): SimCC label for x-axis + simcc_y (np.ndarray): SimCC label for y-axis Returns: tuple: @@ -122,16 +153,25 @@ def decode(self, It usually represents the confidence of the keypoint prediction """ - simcc_x, simcc_y = encoded keypoints, scores = get_simcc_maximum(simcc_x, simcc_y) - keypoints /= self.simcc_split_ratio - # Unsqueeze the instance dimension for single-instance results - if len(keypoints) == 2: + if keypoints.ndim == 2: keypoints = keypoints[None, :] scores = scores[None, :] + if self.use_dark: + x_blur = int((self.sigma[0] * 20 - 7) // 3) + y_blur = int((self.sigma[1] * 20 - 7) // 3) + x_blur -= int((x_blur % 2) == 0) + y_blur -= int((y_blur % 2) == 0) + keypoints[:, :, 0] = refine_simcc_dark(keypoints[:, :, 0], simcc_x, + x_blur) + keypoints[:, :, 1] = refine_simcc_dark(keypoints[:, :, 1], simcc_y, + y_blur) + + keypoints /= self.simcc_split_ratio + return keypoints, scores def _map_coordinates( @@ -235,12 +275,12 @@ def _generate_gaussian( mu_x, mu_y = mu - target_x[n, k] = np.exp(-((x - mu_x)**2) / (2 * self.sigma**2)) - target_y[n, k] = np.exp(-((y - mu_y)**2) / (2 * self.sigma**2)) + target_x[n, k] = np.exp(-((x - mu_x)**2) / (2 * self.sigma[0]**2)) + target_y[n, k] = np.exp(-((y - mu_y)**2) / (2 * self.sigma[1]**2)) if self.normalize: norm_value = self.sigma * np.sqrt(np.pi * 2) - target_x /= norm_value - target_y /= norm_value + target_x /= norm_value[0] + target_y /= norm_value[1] return target_x, target_y, keypoint_weights diff --git a/mmpose/codecs/spr.py b/mmpose/codecs/spr.py new file mode 100644 index 0000000000..add6f5715b --- /dev/null +++ b/mmpose/codecs/spr.py @@ -0,0 +1,299 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, Union + +import numpy as np +import torch +from torch import Tensor + +from mmpose.registry import KEYPOINT_CODECS +from .base import BaseKeypointCodec +from .utils import (batch_heatmap_nms, generate_displacement_heatmap, + generate_gaussian_heatmaps, get_diagonal_lengths, + get_instance_root) + + +@KEYPOINT_CODECS.register_module() +class SPR(BaseKeypointCodec): + """Encode/decode keypoints with Structured Pose Representation (SPR). + + See the paper `Single-stage multi-person pose machines`_ + by Nie et al (2019) for details + + Note: + + - instance number: N + - keypoint number: K + - keypoint dimension: D + - image size: [w, h] + - heatmap size: [W, H] + + Encoded: + + - heatmaps (np.ndarray): The generated heatmap in shape (1, H, W) + where [W, H] is the `heatmap_size`. If the keypoint heatmap is + generated together, the output heatmap shape is (K+1, H, W) + - heatmap_weights (np.ndarray): The target weights for heatmaps, + in the same shape as heatmaps. + - displacements (np.ndarray): The dense keypoint displacement in + shape (K*2, H, W). + - displacement_weights (np.ndarray): The target weights for + displacements, in the same shape as displacements. + + Args: + input_size (tuple): Image size in [w, h] + heatmap_size (tuple): Heatmap size in [W, H] + sigma (float or tuple, optional): The sigma values of the Gaussian + heatmaps. If sigma is a tuple, it includes both sigmas for root + and keypoint heatmaps. ``None`` means the sigmas are computed + automatically from the heatmap size. Defaults to ``None`` + generate_keypoint_heatmaps (bool): Whether to generate Gaussian + heatmaps for each keypoint. Defaults to ``False`` + root_type (str): The method to generate the instance root. Options + are: + + - ``'kpt_center'``: Average coordinate of all visible keypoints. + - ``'bbox_center'``: Center point of bounding boxes outlined by + all visible keypoints.
+ + Defaults to ``'kpt_center'`` + + minimal_diagonal_length (int or float): The threshold of diagonal + length of instance bounding box. Small instances will not be + used in training. Defaults to 5 + background_weight (float): Loss weight of background pixels. + Defaults to 0.1 + decode_thr (float): The threshold of keypoint response value in + heatmaps. Defaults to 0.01 + decode_nms_kernel (int): The kernel size of the NMS during decoding, + which should be an odd integer. Defaults to 5 + decode_max_instances (int): The maximum number of instances + to decode. Defaults to 30 + + .. _`Single-stage multi-person pose machines`: + https://arxiv.org/abs/1908.09220 + """ + + def __init__( + self, + input_size: Tuple[int, int], + heatmap_size: Tuple[int, int], + sigma: Optional[Union[float, Tuple[float]]] = None, + generate_keypoint_heatmaps: bool = False, + root_type: str = 'kpt_center', + minimal_diagonal_length: Union[int, float] = 5, + background_weight: float = 0.1, + decode_nms_kernel: int = 5, + decode_max_instances: int = 30, + decode_thr: float = 0.01, + ): + super().__init__() + + self.input_size = input_size + self.heatmap_size = heatmap_size + self.generate_keypoint_heatmaps = generate_keypoint_heatmaps + self.root_type = root_type + self.minimal_diagonal_length = minimal_diagonal_length + self.background_weight = background_weight + self.decode_nms_kernel = decode_nms_kernel + self.decode_max_instances = decode_max_instances + self.decode_thr = decode_thr + + self.scale_factor = (np.array(input_size) / + heatmap_size).astype(np.float32) + + if sigma is None: + sigma = (heatmap_size[0] * heatmap_size[1])**0.5 / 32 + if generate_keypoint_heatmaps: + # sigma for root heatmap and keypoint heatmaps + self.sigma = (sigma, sigma // 2) + else: + self.sigma = (sigma, ) + else: + if not isinstance(sigma, (tuple, list)): + sigma = (sigma, ) + if generate_keypoint_heatmaps: + assert len(sigma) == 2, 'sigma for keypoints must be given ' \ + 'if
`generate_keypoint_heatmaps` ' \ + 'is True. e.g. sigma=(4, 2)' + self.sigma = sigma + + def _get_heatmap_weights(self, + heatmaps, + fg_weight: float = 1, + bg_weight: float = 0): + """Generate weight array for heatmaps. + + Args: + heatmaps (np.ndarray): Root and keypoint (optional) heatmaps + fg_weight (float): Weight for foreground pixels. Defaults to 1.0 + bg_weight (float): Weight for background pixels. Defaults to 0.0 + + Returns: + np.ndarray: Heatmap weight array in the same shape with heatmaps + """ + heatmap_weights = np.ones(heatmaps.shape) * bg_weight + heatmap_weights[heatmaps > 0] = fg_weight + return heatmap_weights + + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: + """Encode keypoints into root heatmaps and keypoint displacement + fields. Note that the original keypoint coordinates should be in the + input image space. + + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) + keypoints_visible (np.ndarray): Keypoint visibilities in shape + (N, K) + + Returns: + dict: + - heatmaps (np.ndarray): The generated heatmap in shape + (1, H, W) where [W, H] is the `heatmap_size`. If keypoint + heatmaps are generated together, the shape is (K+1, H, W) + - heatmap_weights (np.ndarray): The pixel-wise weight for heatmaps + which has same shape with `heatmaps` + - displacements (np.ndarray): The generated displacement fields in + shape (K*D, H, W). The vector on each pixels represents the + displacement of keypoints belong to the associated instance + from this pixel. 
+ - displacement_weights (np.ndarray): The pixel-wise weight for + displacements which has same shape with `displacements` + """ + + if keypoints_visible is None: + keypoints_visible = np.ones(keypoints.shape[:2], dtype=np.float32) + + # keypoint coordinates in heatmap + _keypoints = keypoints / self.scale_factor + + # compute the root and scale of each instance + roots, roots_visible = get_instance_root(_keypoints, keypoints_visible, + self.root_type) + diagonal_lengths = get_diagonal_lengths(_keypoints, keypoints_visible) + + # discard the small instances + roots_visible[diagonal_lengths < self.minimal_diagonal_length] = 0 + + # generate heatmaps + heatmaps, _ = generate_gaussian_heatmaps( + heatmap_size=self.heatmap_size, + keypoints=roots[:, None], + keypoints_visible=roots_visible[:, None], + sigma=self.sigma[0]) + heatmap_weights = self._get_heatmap_weights( + heatmaps, bg_weight=self.background_weight) + + if self.generate_keypoint_heatmaps: + keypoint_heatmaps, _ = generate_gaussian_heatmaps( + heatmap_size=self.heatmap_size, + keypoints=_keypoints, + keypoints_visible=keypoints_visible, + sigma=self.sigma[1]) + + keypoint_heatmaps_weights = self._get_heatmap_weights( + keypoint_heatmaps, bg_weight=self.background_weight) + + heatmaps = np.concatenate((keypoint_heatmaps, heatmaps), axis=0) + heatmap_weights = np.concatenate( + (keypoint_heatmaps_weights, heatmap_weights), axis=0) + + # generate displacements + displacements, displacement_weights = \ + generate_displacement_heatmap( + self.heatmap_size, + _keypoints, + keypoints_visible, + roots, + roots_visible, + diagonal_lengths, + self.sigma[0], + ) + + encoded = dict( + heatmaps=heatmaps, + heatmap_weights=heatmap_weights, + displacements=displacements, + displacement_weights=displacement_weights) + + return encoded + + def decode(self, heatmaps: Tensor, + displacements: Tensor) -> Tuple[np.ndarray, np.ndarray]: + """Decode the keypoint coordinates from heatmaps and displacements. 
The + decoded keypoint coordinates are in the input image space. + + Args: + heatmaps (Tensor): Encoded root and keypoints (optional) heatmaps + in shape (1, H, W) or (K+1, H, W) + displacements (Tensor): Encoded keypoints displacement fields + in shape (K*D, H, W) + + Returns: + tuple: + - keypoints (Tensor): Decoded keypoint coordinates in shape + (N, K, D) + - scores (tuple): + - root_scores (Tensor): The root scores in shape (N, ) + - keypoint_scores (Tensor): The keypoint scores in + shape (N, K). If keypoint heatmaps are not generated, + `keypoint_scores` will be `None` + """ + # heatmaps, displacements = encoded + _k, h, w = displacements.shape + k = _k // 2 + displacements = displacements.view(k, 2, h, w) + + # convert displacements to a dense keypoint prediction + y, x = torch.meshgrid(torch.arange(h), torch.arange(w)) + regular_grid = torch.stack([x, y], dim=0).to(displacements) + posemaps = (regular_grid[None] + displacements).flatten(2) + + # find local maximum on root heatmap + root_heatmap_peaks = batch_heatmap_nms(heatmaps[None, -1:], + self.decode_nms_kernel) + root_scores, pos_idx = root_heatmap_peaks.flatten().topk( + self.decode_max_instances) + mask = root_scores > self.decode_thr + root_scores, pos_idx = root_scores[mask], pos_idx[mask] + + keypoints = posemaps[:, :, pos_idx].permute(2, 0, 1).contiguous() + + if self.generate_keypoint_heatmaps and heatmaps.shape[0] == 1 + k: + # compute scores for each keypoint + keypoint_scores = self.get_keypoint_scores(heatmaps[:k], keypoints) + else: + keypoint_scores = None + + keypoints = torch.cat([ + kpt * self.scale_factor[i] + for i, kpt in enumerate(keypoints.split(1, -1)) + ], + dim=-1) + return keypoints, (root_scores, keypoint_scores) + + def get_keypoint_scores(self, heatmaps: Tensor, keypoints: Tensor): + """Calculate the keypoint scores with keypoints heatmaps and + coordinates. 
+ + Args: + heatmaps (Tensor): Keypoint heatmaps in shape (K, H, W) + keypoints (Tensor): Keypoint coordinates in shape (N, K, D) + + Returns: + Tensor: Keypoint scores in [N, K] + """ + k, h, w = heatmaps.shape + keypoints = torch.stack(( + keypoints[..., 0] / (w - 1) * 2 - 1, + keypoints[..., 1] / (h - 1) * 2 - 1, + ), + dim=-1) + keypoints = keypoints.transpose(0, 1).unsqueeze(1).contiguous() + + keypoint_scores = torch.nn.functional.grid_sample( + heatmaps.unsqueeze(1), keypoints, + padding_mode='border').view(k, -1).transpose(0, 1).contiguous() + + return keypoint_scores diff --git a/mmpose/codecs/udp_heatmap.py b/mmpose/codecs/udp_heatmap.py index 9d48b39f9f..c38ea17be4 100644 --- a/mmpose/codecs/udp_heatmap.py +++ b/mmpose/codecs/udp_heatmap.py @@ -24,6 +24,16 @@ class UDPHeatmap(BaseKeypointCodec): - image size: [w, h] - heatmap size: [W, H] + Encoded: + + - heatmaps (np.ndarray): The generated heatmap in shape (C_out, H, W) + where [W, H] is the `heatmap_size`, and the C_out is the output + channel number which depends on the `heatmap_type`. If + `heatmap_type=='gaussian'`, C_out equals keypoint number K; + if `heatmap_type=='combined'`, C_out equals K*3 + (x_offset, y_offset and class label) + - keypoint_weights (np.ndarray): The target weights in shape (K,) + Args: input_size (tuple): Image size in [w, h] heatmap_size (tuple): Heatmap size in [W, H] @@ -70,11 +80,9 @@ def __init__(self, f'{self.heatmap_type}. Should be one of ' '{"gaussian", "combined"}') - def encode( - self, - keypoints: np.ndarray, - keypoints_visible: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, np.ndarray]: + def encode(self, + keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None) -> dict: """Encode keypoints into heatmaps. Note that the original keypoint coordinates should be in the input image space.
@@ -84,7 +92,7 @@ def encode( (N, K) Returns: - tuple: + dict: - heatmap (np.ndarray): The generated heatmap in shape (C_out, H, W) where [W, H] is the `heatmap_size`, and the C_out is the output channel number which depends on the @@ -119,7 +127,9 @@ def encode( f'{self.heatmap_type}. Should be one of ' '{"gaussian", "combined"}') - return heatmaps, keypoint_weights + encoded = dict(heatmaps=heatmaps, keypoint_weights=keypoint_weights) + + return encoded def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Decode keypoint coordinates from heatmaps. The decoded keypoint @@ -149,14 +159,11 @@ def decode(self, encoded: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: elif self.heatmap_type == 'combined': _K, H, W = heatmaps.shape K = _K // 3 - for k in range(_K): - if k % 3 == 0: - # for classification map - ks = 2 * self.blur_kernel_size + 1 - else: - # for offset map - ks = self.blur_kernel_size - cv2.GaussianBlur(heatmaps[k], (ks, ks), 0, heatmaps[k]) + + for cls_heatmap in heatmaps[::3]: + # Apply Gaussian blur on classification maps + ks = 2 * self.blur_kernel_size + 1 + cv2.GaussianBlur(cls_heatmap, (ks, ks), 0, cls_heatmap) # valid radius radius = self.radius_factor * max(W, H) diff --git a/mmpose/codecs/utils/__init__.py b/mmpose/codecs/utils/__init__.py index b6acf34cc9..2c5ec8cce5 100644 --- a/mmpose/codecs/utils/__init__.py +++ b/mmpose/codecs/utils/__init__.py @@ -2,16 +2,22 @@ from .gaussian_heatmap import (generate_gaussian_heatmaps, generate_udp_gaussian_heatmaps, generate_unbiased_gaussian_heatmaps) -from .offset_heatmap import generate_offset_heatmap +from .instance_property import (get_diagonal_lengths, get_instance_bbox, + get_instance_root) +from .offset_heatmap import (generate_displacement_heatmap, + generate_offset_heatmap) from .post_processing import (batch_heatmap_nms, gaussian_blur, - get_heatmap_maximum, get_simcc_maximum) + gaussian_blur1d, get_heatmap_maximum, + get_simcc_maximum) from .refinement import 
(refine_keypoints, refine_keypoints_dark, - refine_keypoints_dark_udp) + refine_keypoints_dark_udp, refine_simcc_dark) __all__ = [ 'generate_gaussian_heatmaps', 'generate_udp_gaussian_heatmaps', 'generate_unbiased_gaussian_heatmaps', 'gaussian_blur', 'get_heatmap_maximum', 'get_simcc_maximum', 'generate_offset_heatmap', 'batch_heatmap_nms', 'refine_keypoints', 'refine_keypoints_dark', - 'refine_keypoints_dark_udp' + 'refine_keypoints_dark_udp', 'generate_displacement_heatmap', + 'refine_simcc_dark', 'gaussian_blur1d', 'get_diagonal_lengths', + 'get_instance_root', 'get_instance_bbox' ] diff --git a/mmpose/codecs/utils/gaussian_heatmap.py b/mmpose/codecs/utils/gaussian_heatmap.py index 91b1db36c1..91e08c2cdd 100644 --- a/mmpose/codecs/utils/gaussian_heatmap.py +++ b/mmpose/codecs/utils/gaussian_heatmap.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from itertools import product -from typing import Tuple +from typing import Tuple, Union import numpy as np @@ -9,7 +9,7 @@ def generate_gaussian_heatmaps( heatmap_size: Tuple[int, int], keypoints: np.ndarray, keypoints_visible: np.ndarray, - sigma: float, + sigma: Union[float, Tuple[float], np.ndarray], ) -> Tuple[np.ndarray, np.ndarray]: """Generate gaussian heatmaps of keypoints. @@ -18,7 +18,9 @@ def generate_gaussian_heatmaps( keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) keypoints_visible (np.ndarray): Keypoint visibilities in shape (N, K) - sigma (float): The sigma value of the Gaussian heatmap + sigma (float or List[float]): A list of sigma values of the Gaussian + heatmap for each instance. 
If sigma is given as a single float + value, it will be expanded into a tuple Returns: tuple: @@ -34,51 +36,56 @@ def generate_gaussian_heatmaps( heatmaps = np.zeros((K, H, W), dtype=np.float32) keypoint_weights = keypoints_visible.copy() - # 3-sigma rule - radius = sigma * 3 + if isinstance(sigma, (int, float)): + sigma = (sigma, ) * N - # xy grid - gaussian_size = 2 * radius + 1 - x = np.arange(0, gaussian_size, 1, dtype=np.float32) - y = x[:, None] - x0 = y0 = gaussian_size // 2 + for n in range(N): + # 3-sigma rule + radius = sigma[n] * 3 - for n, k in product(range(N), range(K)): - # skip unlabled keypoints - if keypoints_visible[n, k] < 0.5: - continue + # xy grid + gaussian_size = 2 * radius + 1 + x = np.arange(0, gaussian_size, 1, dtype=np.float32) + y = x[:, None] + x0 = y0 = gaussian_size // 2 - # get gaussian center coordinates - mu = (keypoints[n, k] + 0.5).astype(np.int64) + for k in range(K): + # skip unlabled keypoints + if keypoints_visible[n, k] < 0.5: + continue - # check that the gaussian has in-bounds part - left, top = (mu - radius).astype(np.int64) - right, bottom = (mu + radius + 1).astype(np.int64) + # get gaussian center coordinates + mu = (keypoints[n, k] + 0.5).astype(np.int64) - if left >= W or top >= H or right < 0 or bottom < 0: - keypoint_weights[n, k] = 0 - continue + # check that the gaussian has in-bounds part + left, top = (mu - radius).astype(np.int64) + right, bottom = (mu + radius + 1).astype(np.int64) - # The gaussian is not normalized, - # we want the center value to equal 1 - gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + if left >= W or top >= H or right < 0 or bottom < 0: + keypoint_weights[n, k] = 0 + continue - # valid range in gaussian - g_x1 = max(0, -left) - g_x2 = min(W, right) - left - g_y1 = max(0, -top) - g_y2 = min(H, bottom) - top + # The gaussian is not normalized, + # we want the center value to equal 1 + gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma[n]**2)) - # valid range in 
heatmap - h_x1 = max(0, left) - h_x2 = min(W, right) - h_y1 = max(0, top) - h_y2 = min(H, bottom) + # valid range in gaussian + g_x1 = max(0, -left) + g_x2 = min(W, right) - left + g_y1 = max(0, -top) + g_y2 = min(H, bottom) - top - heatmap_region = heatmaps[k, h_y1:h_y2, h_x1:h_x2] - gaussian_regsion = gaussian[g_y1:g_y2, g_x1:g_x2] + # valid range in heatmap + h_x1 = max(0, left) + h_x2 = min(W, right) + h_y1 = max(0, top) + h_y2 = min(H, bottom) - _ = np.maximum(heatmap_region, gaussian_regsion, out=heatmap_region) + heatmap_region = heatmaps[k, h_y1:h_y2, h_x1:h_x2] + gaussian_regsion = gaussian[g_y1:g_y2, g_x1:g_x2] + + _ = np.maximum( + heatmap_region, gaussian_regsion, out=heatmap_region) return heatmaps, keypoint_weights diff --git a/mmpose/codecs/utils/instance_property.py b/mmpose/codecs/utils/instance_property.py new file mode 100644 index 0000000000..15ae30aef0 --- /dev/null +++ b/mmpose/codecs/utils/instance_property.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import numpy as np + + +def get_instance_root(keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None, + root_type: str = 'kpt_center') -> np.ndarray: + """Calculate the coordinates and visibility of instance roots. 
+ + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) + keypoints_visible (np.ndarray): Keypoint visibilities in shape + (N, K) + root_type (str): Calculation of instance roots which should + be one of the following options: + + - ``'kpt_center'``: The roots' coordinates are the mean + coordinates of visible keypoints + - ``'bbox_center'``: The roots' are the center of bounding + boxes outlined by visible keypoints + + Defaults to ``'kpt_center'`` + + Returns: + tuple + - roots_coordinate(np.ndarray): Coordinates of instance roots in + shape [N, D] + - roots_visible(np.ndarray): Visibility of instance roots in + shape [N] + """ + + roots_coordinate = np.zeros((keypoints.shape[0], 2), dtype=np.float32) + roots_visible = np.ones((keypoints.shape[0]), dtype=np.float32) * 2 + + for i in range(keypoints.shape[0]): + + # collect visible keypoints + if keypoints_visible is not None: + visible_keypoints = keypoints[i][keypoints_visible[i] > 0] + else: + visible_keypoints = keypoints[i] + if visible_keypoints.size == 0: + roots_visible[i] = 0 + continue + + # compute the instance root with visible keypoints + if root_type == 'kpt_center': + roots_coordinate[i] = visible_keypoints.mean(axis=0) + roots_visible[i] = 1 + elif root_type == 'bbox_center': + roots_coordinate[i] = (visible_keypoints.max(axis=0) + + visible_keypoints.min(axis=0)) / 2.0 + roots_visible[i] = 1 + else: + raise ValueError( + f'the value of `root_type` must be \'kpt_center\' or ' + f'\'bbox_center\', but got \'{root_type}\'') + + return roots_coordinate, roots_visible + + +def get_instance_bbox(keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> np.ndarray: + """Calculate the pseudo instance bounding box from visible keypoints. The + bounding boxes are in the xyxy format. 
+ + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) + keypoints_visible (np.ndarray): Keypoint visibilities in shape + (N, K) + + Returns: + np.ndarray: bounding boxes in [N, 4] + """ + bbox = np.zeros((keypoints.shape[0], 4), dtype=np.float32) + for i in range(keypoints.shape[0]): + if keypoints_visible is not None: + visible_keypoints = keypoints[i][keypoints_visible[i] > 0] + else: + visible_keypoints = keypoints[i] + if visible_keypoints.size == 0: + continue + + bbox[i, :2] = visible_keypoints.min(axis=0) + bbox[i, 2:] = visible_keypoints.max(axis=0) + return bbox + + +def get_diagonal_lengths(keypoints: np.ndarray, + keypoints_visible: Optional[np.ndarray] = None + ) -> np.ndarray: + """Calculate the diagonal length of instance bounding box from visible + keypoints. + + Args: + keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) + keypoints_visible (np.ndarray): Keypoint visibilities in shape + (N, K) + + Returns: + np.ndarray: bounding box diagonal length in [N] + """ + pseudo_bbox = get_instance_bbox(keypoints, keypoints_visible) + pseudo_bbox = pseudo_bbox.reshape(-1, 2, 2) + h_w_diff = pseudo_bbox[:, 1] - pseudo_bbox[:, 0] + diagonal_length = np.sqrt(np.power(h_w_diff, 2).sum(axis=1)) + + return diagonal_length diff --git a/mmpose/codecs/utils/offset_heatmap.py b/mmpose/codecs/utils/offset_heatmap.py index 5b017ffd5b..c3c1c32ed3 100644 --- a/mmpose/codecs/utils/offset_heatmap.py +++ b/mmpose/codecs/utils/offset_heatmap.py @@ -28,11 +28,7 @@ def generate_offset_heatmap( Returns: tuple: - heatmap (np.ndarray): The generated heatmap in shape - (C_out, H, W) where [W, H] is the `heatmap_size`, and the - C_out is the output channel number which depends on the - `heatmap_type`. 
If `heatmap_type=='gaussian'`, C_out equals to - keypoint number K; if `heatmap_type=='combined'`, C_out - equals to K*3 (x_offset, y_offset and class label) + (K*3, H, W) where [W, H] is the `heatmap_size` - keypoint_weights (np.ndarray): The target weights in shape (K,) """ @@ -63,8 +59,85 @@ def generate_offset_heatmap( heatmaps[k, 1] = x_offset heatmaps[k, 2] = y_offset - # keep only valid region in offset maps - heatmaps[:, 1:] *= heatmaps[:, :1] heatmaps = heatmaps.reshape(K * 3, H, W) return heatmaps, keypoint_weights + + +def generate_displacement_heatmap( + heatmap_size: Tuple[int, int], + keypoints: np.ndarray, + keypoints_visible: np.ndarray, + roots: np.ndarray, + roots_visible: np.ndarray, + diagonal_lengths: np.ndarray, + radius: float, +): + """Generate displacement heatmaps of keypoints, where each keypoint is + represented by 3 maps: one pixel-level class label map (1 for keypoint and + 0 for non-keypoint) and 2 pixel-level offset maps for x and y directions + respectively. + + Args: + heatmap_size (Tuple[int, int]): Heatmap size in [W, H] + keypoints (np.ndarray): Keypoint coordinates in shape (N, K, D) + keypoints_visible (np.ndarray): Keypoint visibilities in shape + (N, K) + roots (np.ndarray): Coordinates of instance centers in shape (N, D). + The displacement fields of each instance will locate around its + center. + roots_visible (np.ndarray): Roots visibilities in shape (N,) + diagonal_lengths (np.ndarray): Diaginal length of the bounding boxes + of each instance in shape (N,) + radius (float): The radius factor of the binary label + map. 
The positive region is defined as the neighbor of the + keypoint with the radius :math:`r=radius_factor*max(W, H)` + + Returns: + tuple: + - displacements (np.ndarray): The generated displacement map in + shape (K*2, H, W) where [W, H] is the `heatmap_size` + - displacement_weights (np.ndarray): The target weights in shape + (K*2, H, W) + """ + N, K, _ = keypoints.shape + W, H = heatmap_size + + displacements = np.zeros((K * 2, H, W), dtype=np.float32) + displacement_weights = np.zeros((K * 2, H, W), dtype=np.float32) + instance_size_map = np.zeros((H, W), dtype=np.float32) + + for n in range(N): + if (roots_visible[n] < 1 or (roots[n, 0] < 0 or roots[n, 1] < 0) + or (roots[n, 0] >= W or roots[n, 1] >= H)): + continue + + diagonal_length = diagonal_lengths[n] + + for k in range(K): + if keypoints_visible[n, k] < 1 or keypoints[n, k, 0] < 0 \ + or keypoints[n, k, 1] < 0 or keypoints[n, k, 0] >= W \ + or keypoints[n, k, 1] >= H: + continue + + start_x = max(int(roots[n, 0] - radius), 0) + start_y = max(int(roots[n, 1] - radius), 0) + end_x = min(int(roots[n, 0] + radius), W) + end_y = min(int(roots[n, 1] + radius), H) + + for x in range(start_x, end_x): + for y in range(start_y, end_y): + if displacements[2 * k, y, + x] != 0 or displacements[2 * k + 1, y, + x] != 0: + if diagonal_length > instance_size_map[y, x]: + # keep the gt displacement of smaller instance + continue + + displacement_weights[2 * k:2 * k + 2, y, + x] = 1 / diagonal_length + displacements[2 * k:2 * k + 2, y, + x] = keypoints[n, k] - [x, y] + instance_size_map[y, x] = diagonal_length + + return displacements, displacement_weights diff --git a/mmpose/codecs/utils/post_processing.py b/mmpose/codecs/utils/post_processing.py index e6c9b71de3..e8e90204ab 100644 --- a/mmpose/codecs/utils/post_processing.py +++ b/mmpose/codecs/utils/post_processing.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from itertools import product from typing import Tuple import cv2 @@ -141,6 +142,37 @@ def gaussian_blur(heatmaps: np.ndarray, kernel: int = 11) -> np.ndarray: return heatmaps +def gaussian_blur1d(simcc: np.ndarray, kernel: int = 11) -> np.ndarray: + """Modulate simcc distribution with Gaussian. + + Note: + - num_keypoints: K + - simcc length: Wx + + Args: + simcc (np.ndarray[K, Wx]): model predicted simcc. + kernel (int): Gaussian kernel size (K) for modulation, which should + match the simcc gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + + Returns: + np.ndarray ([K, Wx]): Modulated simcc distribution. + """ + assert kernel % 2 == 1 + + border = (kernel - 1) // 2 + N, K, Wx = simcc.shape + + for n, k in product(range(N), range(K)): + origin_max = np.max(simcc[n, k]) + dr = np.zeros((1, Wx + 2 * border), dtype=np.float32) + dr[0, border:-border] = simcc[n, k].copy() + dr = cv2.GaussianBlur(dr, (kernel, 1), 0) + simcc[n, k] = dr[0, border:-border].copy() + simcc[n, k] *= origin_max / np.max(simcc[n, k]) + return simcc + + def batch_heatmap_nms(batch_heatmaps: Tensor, kernel_size: int = 5): """Apply NMS on a batch of heatmaps. diff --git a/mmpose/codecs/utils/refinement.py b/mmpose/codecs/utils/refinement.py index e0dab8dfbf..3495f37d0a 100644 --- a/mmpose/codecs/utils/refinement.py +++ b/mmpose/codecs/utils/refinement.py @@ -3,7 +3,7 @@ import numpy as np -from .post_processing import gaussian_blur +from .post_processing import gaussian_blur, gaussian_blur1d def refine_keypoints(keypoints: np.ndarray, @@ -163,3 +163,53 @@ def refine_keypoints_dark_udp(keypoints: np.ndarray, heatmaps: np.ndarray, derivative).squeeze() return keypoints + + +def refine_simcc_dark(keypoints: np.ndarray, simcc: np.ndarray, + blur_kernel_size: int) -> np.ndarray: + """SimCC version. Refine keypoint predictions using distribution aware + coordinate decoding for UDP. See `UDP`_ for details. The operation is in- + place. 
+ + Note: + + - instance number: N + - keypoint number: K + - keypoint dimension: D + + Args: + keypoints (np.ndarray): The keypoint coordinates in shape (N, K, D) + simcc (np.ndarray): The heatmaps in shape (N, K, Wx) + blur_kernel_size (int): The Gaussian blur kernel size of the heatmap + modulation + + Returns: + np.ndarray: Refine keypoint coordinates in shape (N, K, D) + + .. _`UDP`: https://arxiv.org/abs/1911.07524 + """ + N = simcc.shape[0] + + # modulate simcc + simcc = gaussian_blur1d(simcc, blur_kernel_size) + np.clip(simcc, 1e-3, 50., simcc) + np.log(simcc, simcc) + + simcc = np.pad(simcc, ((0, 0), (0, 0), (2, 2)), 'edge') + + for n in range(N): + px = (keypoints[n] + 2.5).astype(np.int64).reshape(-1, 1) # K, 1 + + dx0 = np.take_along_axis(simcc[n], px, axis=1) # K, 1 + dx1 = np.take_along_axis(simcc[n], px + 1, axis=1) + dx_1 = np.take_along_axis(simcc[n], px - 1, axis=1) + dx2 = np.take_along_axis(simcc[n], px + 2, axis=1) + dx_2 = np.take_along_axis(simcc[n], px - 2, axis=1) + + dx = 0.5 * (dx1 - dx_1) + dxx = 1e-9 + 0.25 * (dx2 - 2 * dx0 + dx_2) + + offset = dx / dxx + keypoints[n] -= offset.reshape(-1) + + return keypoints diff --git a/mmpose/datasets/__init__.py b/mmpose/datasets/__init__.py index d9b9bee32a..b90a12db49 100644 --- a/mmpose/datasets/__init__.py +++ b/mmpose/datasets/__init__.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .builder import build_dataset +from .dataset_wrappers import CombinedDataset from .datasets import * # noqa +from .samplers import MultiSourceSampler from .transforms import * # noqa -__all__ = ['build_dataset'] +__all__ = ['build_dataset', 'CombinedDataset', 'MultiSourceSampler'] diff --git a/mmpose/datasets/builder.py b/mmpose/datasets/builder.py index b3993a7ebb..2e5a236ff4 100644 --- a/mmpose/datasets/builder.py +++ b/mmpose/datasets/builder.py @@ -6,7 +6,7 @@ import numpy as np import torch from mmengine import build_from_cfg, is_seq_of -from torch.utils.data.dataset import ConcatDataset +from mmengine.dataset import ConcatDataset, RepeatDataset from mmpose.registry import DATASETS @@ -64,7 +64,6 @@ def build_dataset(cfg, default_args=None): Returns: Dataset: The constructed dataset. """ - from .dataset_wrappers import RepeatDataset if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) diff --git a/mmpose/datasets/dataset_wrappers.py b/mmpose/datasets/dataset_wrappers.py index aaaa173b91..28eeac9945 100644 --- a/mmpose/datasets/dataset_wrappers.py +++ b/mmpose/datasets/dataset_wrappers.py @@ -1,31 +1,122 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .builder import DATASETS +from copy import deepcopy +from typing import Any, Callable, List, Tuple, Union -@DATASETS.register_module() -class RepeatDataset: - """A wrapper of repeated dataset. +from mmengine.dataset import BaseDataset +from mmengine.registry import build_from_cfg + +from mmpose.registry import DATASETS +from .datasets.utils import parse_pose_metainfo - The length of repeated dataset will be `times` larger than the original - dataset. This is useful when the data loading time is long but the dataset - is small. Using RepeatDataset can reduce the data loading time between - epochs. + +@DATASETS.register_module() +class CombinedDataset(BaseDataset): + """A wrapper of combined dataset. 
Args: - dataset (:obj:`Dataset`): The dataset to be repeated. - times (int): Repeat times. + metainfo (dict): The meta information of combined dataset. + datasets (list): The configs of datasets to be combined. + pipeline (list, optional): Processing pipeline. Defaults to []. """ - def __init__(self, dataset, times): - self.dataset = dataset - self.times = times + def __init__(self, + metainfo: dict, + datasets: list, + pipeline: List[Union[dict, Callable]] = [], + **kwargs): + + self.datasets = [] + + for cfg in datasets: + dataset = build_from_cfg(cfg, DATASETS) + self.datasets.append(dataset) + + self._lens = [len(dataset) for dataset in self.datasets] + self._len = sum(self._lens) - self._ori_len = len(self.dataset) + super(CombinedDataset, self).__init__(pipeline=pipeline, **kwargs) + self._metainfo = parse_pose_metainfo(metainfo) - def __getitem__(self, idx): - """Get data.""" - return self.dataset[idx % self._ori_len] + @property + def metainfo(self): + return deepcopy(self._metainfo) def __len__(self): - """Length after repetition.""" - return self.times * self._ori_len + return self._len + + def _get_subset_index(self, index: int) -> Tuple[int, int]: + """Given a data sample's global index, return the index of the sub- + dataset the data sample belongs to, and the local index within that + sub-dataset. 
+ + Args: + index (int): The global data sample index + + Returns: + tuple[int, int]: + - subset_index (int): The index of the sub-dataset + - local_index (int): The index of the data sample within + the sub-dataset + """ + if index >= len(self) or index < -len(self): + raise ValueError( + f'index({index}) is out of bounds for dataset with ' + f'length({len(self)}).') + + if index < 0: + index = index + len(self) + + subset_index = 0 + while index >= self._lens[subset_index]: + index -= self._lens[subset_index] + subset_index += 1 + return subset_index, index + + def prepare_data(self, idx: int) -> Any: + """Get data processed by ``self.pipeline``.The source dataset is + depending on the index. + + Args: + idx (int): The index of ``data_info``. + + Returns: + Any: Depends on ``self.pipeline``. + """ + + data_info = self.get_data_info(idx) + + return self.pipeline(data_info) + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index. + + Args: + idx (int): Global index of ``CombinedDataset``. + Returns: + dict: The idx-th annotation of the datasets. 
+ """ + subset_idx, sample_idx = self._get_subset_index(idx) + # Get data sample processed by ``subset.pipeline`` + data_info = self.datasets[subset_idx][sample_idx] + + # Add metainfo items that are required in the pipeline and the model + metainfo_keys = [ + 'upper_body_ids', 'lower_body_ids', 'flip_pairs', + 'dataset_keypoint_weights', 'flip_indices' + ] + + for key in metainfo_keys: + data_info[key] = deepcopy(self._metainfo[key]) + + return data_info + + def full_init(self): + """Fully initialize all sub datasets.""" + + if self._fully_initialized: + return + + for dataset in self.datasets: + dataset.full_init() + self._fully_initialized = True diff --git a/mmpose/datasets/datasets/animal/ap10k_dataset.py b/mmpose/datasets/datasets/animal/ap10k_dataset.py index a1c7db75a0..de1efbc67f 100644 --- a/mmpose/datasets/datasets/animal/ap10k_dataset.py +++ b/mmpose/datasets/datasets/animal/ap10k_dataset.py @@ -1,9 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -from typing import Optional - -import numpy as np - from mmpose.registry import DATASETS from ..base import BaseCocoStyleDataset @@ -76,65 +71,3 @@ class AP10KDataset(BaseCocoStyleDataset): """ METAINFO: dict = dict(from_file='configs/_base_/datasets/ap10k.py') - - def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: - """Parse raw AP-10K annotation of an instance. - - Args: - raw_data_info (dict): Raw data information loaded from - ``ann_file``. 
It should have following contents: - - - ``'raw_ann_info'``: Raw annotation of an instance - - ``'raw_img_info'``: Raw information of the image that - contains the instance - - Returns: - dict | None: Parsed instance annotation - """ - - ann = raw_data_info['raw_ann_info'] - img = raw_data_info['raw_img_info'] - - # filter invalid instance - if 'bbox' not in ann or 'keypoints' not in ann or max( - ann['keypoints']) == 0: - return None - - img_path = osp.join(self.data_prefix['img'], img['file_name']) - img_w, img_h = img['width'], img['height'] - - # get bbox in shape [1, 4], formatted as xywh - x, y, w, h = ann['bbox'] - x1 = np.clip(x, 0, img_w - 1) - y1 = np.clip(y, 0, img_h - 1) - x2 = np.clip(x + w, 0, img_w - 1) - y2 = np.clip(y + h, 0, img_h - 1) - - bbox = np.array([x1, y1, x2, y2], dtype=np.float32).reshape(1, 4) - - # keypoints in shape [1, K, 2] and keypoints_visible in [1, K] - _keypoints = np.array( - ann['keypoints'], dtype=np.float32).reshape(1, -1, 3) - keypoints = _keypoints[..., :2] - keypoints_visible = np.minimum(1, _keypoints[..., 2]) - - if 'num_keypoints' in ann: - num_keypoints = ann['num_keypoints'] - else: - num_keypoints = np.count_nonzero(keypoints.max(axis=2)) - - data_info = { - 'img_id': ann['image_id'], - 'img_path': img_path, - 'bbox': bbox, - 'bbox_score': np.ones(1, dtype=np.float32), - 'num_keypoints': num_keypoints, - 'keypoints': keypoints, - 'keypoints_visible': keypoints_visible, - 'iscrowd': ann.get('iscrowd', 0), - 'segmentation': ann.get('segmentation', None), - 'id': ann['id'], - 'category': ann['category_id'], - } - - return data_info diff --git a/mmpose/datasets/datasets/base/base_coco_style_dataset.py b/mmpose/datasets/datasets/base/base_coco_style_dataset.py index d222a2ecab..a1c3bc2f5c 100644 --- a/mmpose/datasets/datasets/base/base_coco_style_dataset.py +++ b/mmpose/datasets/datasets/base/base_coco_style_dataset.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import copy import os.path as osp from copy import deepcopy from itertools import filterfalse, groupby -from typing import (Any, Callable, Dict, Iterable, List, Optional, Sequence, - Union) +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import numpy as np from mmengine.dataset import BaseDataset, force_full_init @@ -147,10 +147,23 @@ def prepare_data(self, idx) -> Any: """ data_info = self.get_data_info(idx) + return self.pipeline(data_info) + + def get_data_info(self, idx: int) -> dict: + """Get data info by index. + + Args: + idx (int): Index of data info. + + Returns: + dict: Data info. + """ + data_info = super().get_data_info(idx) + # Add metainfo items that are required in the pipeline and the model metainfo_keys = [ 'upper_body_ids', 'lower_body_ids', 'flip_pairs', - 'dataset_keypoint_weights', 'flip_indices' + 'dataset_keypoint_weights', 'flip_indices', 'skeleton_links' ] for key in metainfo_keys: @@ -160,7 +173,7 @@ def prepare_data(self, idx) -> Any: data_info[key] = deepcopy(self._metainfo[key]) - return self.pipeline(data_info) + return data_info def load_data_list(self) -> List[dict]: """Load data list from COCO annotation file or person detection result @@ -169,38 +182,51 @@ def load_data_list(self) -> List[dict]: if self.bbox_file: data_list = self._load_detection_results() else: - data_list = self._load_annotations() + instance_list, image_list = self._load_annotations() if self.data_mode == 'topdown': - data_list = self._get_topdown_data_infos(data_list) + data_list = self._get_topdown_data_infos(instance_list) else: - data_list = self._get_bottomup_data_infos(data_list) + data_list = self._get_bottomup_data_infos( + instance_list, image_list) return data_list - def _load_annotations(self): + def _load_annotations(self) -> Tuple[List[dict], List[dict]]: """Load data from annotations in COCO format.""" check_file_exist(self.ann_file) coco = COCO(self.ann_file) - data_list = [] + # set the metainfo about categories, 
which is a list of dict + # and each dict contains the 'id', 'name', etc. about this category + self._metainfo['CLASSES'] = coco.loadCats(coco.getCatIds()) + + instance_list = [] + image_list = [] for img_id in coco.getImgIds(): img = coco.loadImgs(img_id)[0] - ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) + img.update({ + 'img_id': + img_id, + 'img_path': + osp.join(self.data_prefix['img'], img['file_name']), + }) + image_list.append(img) + + ann_ids = coco.getAnnIds(imgIds=img_id) for ann in coco.loadAnns(ann_ids): - data_info = self.parse_data_info( + instance_info = self.parse_data_info( dict(raw_ann_info=ann, raw_img_info=img)) # skip invalid instance annotation. - if not data_info: + if not instance_info: continue - data_list.append(data_info) - - return data_list + instance_list.append(instance_info) + return instance_list, image_list def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: """Parse raw COCO annotation of an instance. @@ -221,11 +247,9 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: img = raw_data_info['raw_img_info'] # filter invalid instance - if 'bbox' not in ann or 'keypoints' not in ann or max( - ann['keypoints']) == 0: + if 'bbox' not in ann or 'keypoints' not in ann: return None - img_path = osp.join(self.data_prefix['img'], img['file_name']) img_w, img_h = img['width'], img['height'] # get bbox in shape [1, 4], formatted as xywh @@ -250,7 +274,7 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: data_info = { 'img_id': ann['image_id'], - 'img_path': img_path, + 'img_path': img['img_path'], 'bbox': bbox, 'bbox_score': np.ones(1, dtype=np.float32), 'num_keypoints': num_keypoints, @@ -259,8 +283,15 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'iscrowd': ann.get('iscrowd', 0), 'segmentation': ann.get('segmentation', None), 'id': ann['id'], + 'category_id': ann['category_id'], + # store the raw annotation of the instance + # it is useful for evaluation 
without providing ann_file + 'raw_ann_info': copy.deepcopy(ann), } + if 'crowdIndex' in img: + data_info['crowd_index'] = img['crowdIndex'] + return data_info @staticmethod @@ -279,60 +310,71 @@ def _is_valid_instance(data_info: Dict) -> bool: w, h = bbox[2:4] - bbox[:2] if w <= 0 or h <= 0: return False + # invalid keypoints + if 'keypoints' in data_info: + if np.max(data_info['keypoints']) <= 0: + return False return True - def _get_topdown_data_infos(self, data_list: List[Dict]) -> List[Dict]: + def _get_topdown_data_infos(self, instance_list: List[Dict]) -> List[Dict]: """Organize the data list in top-down mode.""" # sanitize data samples - data_list_tp = list(filter(self._is_valid_instance, data_list)) + data_list_tp = list(filter(self._is_valid_instance, instance_list)) return data_list_tp - def _get_bottomup_data_infos(self, data_list): + def _get_bottomup_data_infos(self, instance_list: List[Dict], + image_list: List[Dict]) -> List[Dict]: """Organize the data list in bottom-up mode.""" - def _concat(seq: Iterable, key: Any, axis=0): - seq = [x[key] for x in seq] - if isinstance(seq[0], np.ndarray): - seq = np.concatenate(seq, axis=axis) - return seq - # bottom-up data list data_list_bu = [] + used_img_ids = set() + # group instances by img_id - for img_id, data_infos in groupby(data_list, lambda x: x['img_id']): + for img_id, data_infos in groupby(instance_list, + lambda x: x['img_id']): + used_img_ids.add(img_id) data_infos = list(data_infos) - # get valid instances for keypoint annotations - data_infos_valid = list( - filter(self._is_valid_instance, data_infos)) - if not data_infos_valid: - continue - - img_path = data_infos_valid[0]['img_path'] - # image data + img_path = data_infos[0]['img_path'] data_info_bu = { 'img_id': img_id, 'img_path': img_path, } - # instance data - for key in data_infos_valid[0].keys(): + + for key in data_infos[0].keys(): if key not in data_info_bu: - data_info_bu[key] = _concat(data_infos_valid, key) + seq = [d[key] for d in 
data_infos] + if isinstance(seq[0], np.ndarray): + seq = np.concatenate(seq, axis=0) + data_info_bu[key] = seq # The segmentation annotation of invalid objects will be used # to generate valid region mask in the pipeline. invalid_segs = [] for data_info_invalid in filterfalse(self._is_valid_instance, data_infos): - if 'segementation' in data_info_invalid: + if 'segmentation' in data_info_invalid: invalid_segs.append(data_info_invalid['segmentation']) data_info_bu['invalid_segs'] = invalid_segs data_list_bu.append(data_info_bu) + # add images without instance for evaluation + if self.test_mode: + for img_info in image_list: + if img_info['img_id'] not in used_img_ids: + data_info_bu = { + 'img_id': img_info['img_id'], + 'img_path': img_info['img_path'], + 'id': list(), + 'raw_ann_info': None, + } + data_list_bu.append(data_info_bu) + return data_list_bu def _load_detection_results(self) -> List[dict]: @@ -347,6 +389,9 @@ def _load_detection_results(self) -> List[dict]: # load coco annotations to build image id-to-name index coco = COCO(self.ann_file) + # set the metainfo about categories, which is a list of dict + # and each dict contains the 'id', 'name', etc. about this category + self._metainfo['CLASSES'] = coco.loadCats(coco.getCatIds()) num_keypoints = self.metainfo['num_keypoints'] data_list = [] diff --git a/mmpose/datasets/datasets/body/mpii_dataset.py b/mmpose/datasets/datasets/body/mpii_dataset.py index c2e15d6692..a1f777a10b 100644 --- a/mmpose/datasets/datasets/body/mpii_dataset.py +++ b/mmpose/datasets/datasets/body/mpii_dataset.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import json import os.path as osp -from typing import Callable, List, Optional, Sequence, Union +from typing import Callable, List, Optional, Sequence, Tuple, Union import numpy as np from mmengine.utils import check_file_exist @@ -134,7 +134,7 @@ def __init__(self, lazy_init=lazy_init, max_refetch=max_refetch) - def _load_annotations(self) -> List[dict]: + def _load_annotations(self) -> Tuple[List[dict], List[dict]]: """Load data from annotations in MPII format.""" check_file_exist(self.ann_file) @@ -148,7 +148,9 @@ def _load_annotations(self) -> List[dict]: [2, 0, 1]) SC_BIAS = 0.6 - data_list = [] + instance_list = [] + image_list = [] + used_img_ids = set() ann_id = 0 # mpii bbox scales are normalized with factor 200. @@ -176,7 +178,7 @@ def _load_annotations(self) -> List[dict]: keypoints = np.array(ann['joints']).reshape(1, -1, 2) keypoints_visible = np.array(ann['joints_vis']).reshape(1, -1) - data_info = { + instance_info = { 'id': ann_id, 'img_id': int(ann['image'].split('.')[0]), 'img_path': osp.join(self.data_prefix['img'], ann['image']), @@ -193,9 +195,16 @@ def _load_annotations(self) -> List[dict]: headbox = headboxes_src[idx] head_size = np.linalg.norm(headbox[1] - headbox[0], axis=0) head_size *= SC_BIAS - data_info['head_size'] = head_size.reshape(1, -1) + instance_info['head_size'] = head_size.reshape(1, -1) - data_list.append(data_info) + if instance_info['img_id'] not in used_img_ids: + used_img_ids.add(instance_info['img_id']) + image_list.append({ + 'img_id': instance_info['img_id'], + 'img_path': instance_info['img_path'], + }) + + instance_list.append(instance_info) ann_id = ann_id + 1 - return data_list + return instance_list, image_list diff --git a/mmpose/datasets/datasets/body/mpii_trb_dataset.py b/mmpose/datasets/datasets/body/mpii_trb_dataset.py index 0fc138779a..be9154ad70 100644 --- a/mmpose/datasets/datasets/body/mpii_trb_dataset.py +++ b/mmpose/datasets/datasets/body/mpii_trb_dataset.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. 
All rights reserved. import json import os.path as osp -from typing import List +from typing import List, Tuple import numpy as np from mmengine.utils import check_file_exist @@ -103,7 +103,7 @@ class MpiiTrbDataset(BaseCocoStyleDataset): METAINFO: dict = dict(from_file='configs/_base_/datasets/mpii_trb.py') - def _load_annotations(self) -> List[dict]: + def _load_annotations(self) -> Tuple[List[dict], List[dict]]: """Load data from annotations in MPII-TRB format.""" check_file_exist(self.ann_file) @@ -112,7 +112,9 @@ def _load_annotations(self) -> List[dict]: imgid2info = {img['id']: img for img in data['images']} - data_list = [] + instance_list = [] + image_list = [] + used_img_ids = set() # mpii-trb bbox scales are normalized with factor 200. pixel_std = 200. @@ -135,7 +137,7 @@ def _load_annotations(self) -> List[dict]: img_path = osp.join(self.data_prefix['img'], imgid2info[img_id]['file_name']) - data_info = { + instance_info = { 'id': ann['id'], 'img_id': img_id, 'img_path': img_path, @@ -151,10 +153,16 @@ def _load_annotations(self) -> List[dict]: # val set if 'headbox' in ann: - data_info['headbox'] = np.array( + instance_info['headbox'] = np.array( ann['headbox'], dtype=np.float32) - data_list.append(data_info) + instance_list.append(instance_info) + if instance_info['img_id'] not in used_img_ids: + used_img_ids.add(instance_info['img_id']) + image_list.append({ + 'img_id': instance_info['img_id'], + 'img_path': instance_info['img_path'], + }) - data_list = sorted(data_list, key=lambda x: x['id']) - return data_list + instance_list = sorted(instance_list, key=lambda x: x['id']) + return instance_list, image_list diff --git a/mmpose/datasets/datasets/hand/coco_wholebody_hand_dataset.py b/mmpose/datasets/datasets/hand/coco_wholebody_hand_dataset.py index 61d06ac797..1831c9c89d 100644 --- a/mmpose/datasets/datasets/hand/coco_wholebody_hand_dataset.py +++ b/mmpose/datasets/datasets/hand/coco_wholebody_hand_dataset.py @@ -1,6 +1,6 @@ # Copyright (c) 
OpenMMLab. All rights reserved. import os.path as osp -from typing import List +from typing import List, Tuple import numpy as np from mmengine.utils import check_file_exist @@ -84,17 +84,27 @@ class CocoWholeBodyHandDataset(BaseCocoStyleDataset): METAINFO: dict = dict( from_file='configs/_base_/datasets/coco_wholebody_hand.py') - def _load_annotations(self) -> List[dict]: + def _load_annotations(self) -> Tuple[List[dict], List[dict]]: """Load data from annotations in COCO format.""" check_file_exist(self.ann_file) coco = COCO(self.ann_file) - data_list = [] + instance_list = [] + image_list = [] id = 0 for img_id in coco.getImgIds(): img = coco.loadImgs(img_id)[0] + + img.update({ + 'img_id': + img_id, + 'img_path': + osp.join(self.data_prefix['img'], img['file_name']), + }) + image_list.append(img) + ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) anns = coco.loadAnns(ann_ids) for ann in anns: @@ -103,8 +113,6 @@ def _load_annotations(self) -> List[dict]: # valid instances (left and right hand) in one image if ann[f'{type}hand_valid'] and max( ann[f'{type}hand_kpts']) > 0: - img_path = osp.join(self.data_prefix['img'], - img['file_name']) bbox_xywh = np.array( ann[f'{type}hand_box'], @@ -120,9 +128,9 @@ def _load_annotations(self) -> List[dict]: num_keypoints = np.count_nonzero(keypoints.max(axis=2)) - data_info = { + instance_info = { 'img_id': ann['image_id'], - 'img_path': img_path, + 'img_path': img['img_path'], 'bbox': bbox, 'bbox_score': np.ones(1, dtype=np.float32), 'num_keypoints': num_keypoints, @@ -132,8 +140,8 @@ def _load_annotations(self) -> List[dict]: 'segmentation': ann['segmentation'], 'id': id, } - data_list.append(data_info) + instance_list.append(instance_info) id = id + 1 - data_list = sorted(data_list, key=lambda x: x['id']) - return data_list + instance_list = sorted(instance_list, key=lambda x: x['id']) + return instance_list, image_list diff --git a/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py 
b/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py index 29e7c8dfbc..00a2ea418f 100644 --- a/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py +++ b/mmpose/datasets/datasets/wholebody/coco_wholebody_dataset.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import copy import os.path as osp from typing import Optional @@ -117,6 +118,10 @@ def parse_data_info(self, raw_data_info: dict) -> Optional[dict]: 'iscrowd': ann['iscrowd'], 'segmentation': ann['segmentation'], 'id': ann['id'], + 'category_id': ann['category_id'], + # store the raw annotation of the instance + # it is useful for evaluation without providing ann_file + 'raw_ann_info': copy.deepcopy(ann), } return data_info diff --git a/mmpose/datasets/samplers.py b/mmpose/datasets/samplers.py new file mode 100644 index 0000000000..d6bb34287a --- /dev/null +++ b/mmpose/datasets/samplers.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +import math +from typing import Iterator, List, Optional, Sized, Union + +import torch +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmpose.datasets import CombinedDataset +from mmpose.registry import DATA_SAMPLERS + + +@DATA_SAMPLERS.register_module() +class MultiSourceSampler(Sampler): + """Multi-Source Sampler. According to the sampling ratio, sample data from + different datasets to form batches. + + Args: + dataset (Sized): The dataset + batch_size (int): Size of mini-batch + source_ratio (list[int | float]): The sampling ratio of different + source datasets in a mini-batch + shuffle (bool): Whether shuffle the dataset or not. Defaults to + ``True`` + round_up (bool): Whether to add extra samples to make the number of + samples evenly divisible by the world size. Defaults to True. + seed (int, optional): Random seed. If ``None``, set a random seed. 
+ Defaults to ``None`` + """ + + def __init__(self, + dataset: Sized, + batch_size: int, + source_ratio: List[Union[int, float]], + shuffle: bool = True, + round_up: bool = True, + seed: Optional[int] = None) -> None: + + assert isinstance(dataset, CombinedDataset),\ + f'The dataset must be CombinedDataset, but get {dataset}' + assert isinstance(batch_size, int) and batch_size > 0, \ + 'batch_size must be a positive integer value, ' \ + f'but got batch_size={batch_size}' + assert isinstance(source_ratio, list), \ + f'source_ratio must be a list, but got source_ratio={source_ratio}' + assert len(source_ratio) == len(dataset._lens), \ + 'The length of source_ratio must be equal to ' \ + f'the number of datasets, but got source_ratio={source_ratio}' + + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.cumulative_sizes = [0] + list(itertools.accumulate(dataset._lens)) + self.batch_size = batch_size + self.source_ratio = source_ratio + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / world_size)) + self.num_per_source = [ + int(batch_size * sr / sum(source_ratio)) for sr in source_ratio + ] + self.num_per_source[0] = batch_size - sum(self.num_per_source[1:]) + + assert sum(self.num_per_source) == batch_size, \ + 'The sum of num_per_source must be equal to ' \ + f'batch_size, but get {self.num_per_source}' + + self.seed = sync_random_seed() if seed is None else seed + self.shuffle = shuffle + self.round_up = round_up + self.source2inds = { + source: self._indices_of_rank(len(ds)) + for source, ds in enumerate(dataset.datasets) + } + + def _infinite_indices(self, sample_size: int) -> Iterator[int]: + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(sample_size, generator=g).tolist() + else: + yield from torch.arange(sample_size).tolist() + + def _indices_of_rank(self, sample_size: int) 
-> Iterator[int]: + """Slice the infinite indices by rank.""" + yield from itertools.islice( + self._infinite_indices(sample_size), self.rank, None, + self.world_size) + + def __iter__(self) -> Iterator[int]: + batch_buffer = [] + num_iters = self.num_samples // self.batch_size + if self.round_up and self.num_samples > num_iters * self.batch_size: + num_iters += 1 + for i in range(num_iters): + for source, num in enumerate(self.num_per_source): + batch_buffer_per_source = [] + for idx in self.source2inds[source]: + idx += self.cumulative_sizes[source] + batch_buffer_per_source.append(idx) + if len(batch_buffer_per_source) == num: + batch_buffer += batch_buffer_per_source + break + return iter(batch_buffer) + + def __len__(self) -> int: + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Compatible in `epoch-based runner.""" + pass diff --git a/mmpose/datasets/transforms/__init__.py b/mmpose/datasets/transforms/__init__.py index 236b6beeb2..61dae74b8c 100644 --- a/mmpose/datasets/transforms/__init__.py +++ b/mmpose/datasets/transforms/__init__.py @@ -5,6 +5,7 @@ GetBBoxCenterScale, PhotometricDistortion, RandomBBoxTransform, RandomFlip, RandomHalfBody) +from .converting import KeypointConverter from .formatting import PackPoseInputs from .loading import LoadImage from .topdown_transforms import TopdownAffine @@ -14,5 +15,5 @@ 'RandomHalfBody', 'TopdownAffine', 'Albumentation', 'PhotometricDistortion', 'PackPoseInputs', 'LoadImage', 'BottomupGetHeatmapMask', 'BottomupRandomAffine', 'BottomupResize', - 'GenerateTarget' + 'GenerateTarget', 'KeypointConverter' ] diff --git a/mmpose/datasets/transforms/bottomup_transforms.py b/mmpose/datasets/transforms/bottomup_transforms.py index f3a25d02b6..c31e0ae17d 100644 --- a/mmpose/datasets/transforms/bottomup_transforms.py +++ b/mmpose/datasets/transforms/bottomup_transforms.py @@ -164,6 +164,8 @@ class BottomupRandomAffine(BaseTransform): (0.75, 1.5) scale_prob (float): Probability of applying random 
resizing. Defaults to 1.0 + scale_type (str): wrt ``long`` or ``short`` length of the image. + Defaults to ``short`` rotate_factor (float): Randomly rotate the bbox in :math:`[-rotate_factor, rotate_factor]` in degrees. Defaults to 40.0 @@ -179,6 +181,7 @@ def __init__(self, shift_prob: float = 1., scale_factor: Tuple[float, float] = (0.75, 1.5), scale_prob: float = 1., + scale_type: str = 'short', rotate_factor: float = 30., rotate_prob: float = 1, use_udp: bool = False) -> None: @@ -189,6 +192,7 @@ def __init__(self, self.shift_prob = shift_prob self.scale_factor = scale_factor self.scale_prob = scale_prob + self.scale_type = scale_type self.rotate_factor = rotate_factor self.rotate_prob = rotate_prob self.use_udp = use_udp @@ -200,8 +204,7 @@ def _truncnorm(low: float = -1., """Sample from a truncated normal distribution.""" return truncnorm.rvs(low, high, size=size).astype(np.float32) - @staticmethod - def _fix_aspect_ratio(scale: np.ndarray, aspect_ratio: float): + def _fix_aspect_ratio(self, scale: np.ndarray, aspect_ratio: float): """Extend the scale to match the given aspect ratio. 
Args: @@ -213,10 +216,19 @@ def _fix_aspect_ratio(scale: np.ndarray, aspect_ratio: float): """ w, h = scale if w > h * aspect_ratio: - _w, _h = w, w / aspect_ratio + if self.scale_type == 'long': + _w, _h = w, w / aspect_ratio + elif self.scale_type == 'short': + _w, _h = h * aspect_ratio, h + else: + raise ValueError(f'Unknown scale type: {self.scale_type}') else: - _w, _h = h * aspect_ratio, h - + if self.scale_type == 'short': + _w, _h = w, w / aspect_ratio + elif self.scale_type == 'long': + _w, _h = h * aspect_ratio, h + else: + raise ValueError(f'Unknown scale type: {self.scale_type}') return np.array([_w, _h], dtype=scale.dtype) @cache_randomness @@ -238,8 +250,8 @@ def _get_transform_params(self) -> Tuple: # get scale if np.random.rand() < self.scale_prob: scale_min, scale_max = self.scale_factor - scale = scale_min + (scale_max - - scale_min) * self._truncnorm(size=(1, )) + scale = scale_min + (scale_max - scale_min) * ( + self._truncnorm(size=(1, )) + 1) / 2 else: scale = np.ones(1, dtype=np.float32) @@ -299,6 +311,12 @@ def transform(self, results: Dict) -> Optional[dict]: results['keypoints'][..., :2] = cv2.transform( results['keypoints'][..., :2], warp_mat) + if 'bbox' in results: + bbox = np.tile(results['bbox'], 2).reshape(-1, 4, 2) + # corner order: left_top, left_bottom, right_top, right_bottom + bbox[:, 1:3, 0] = bbox[:, 0:2, 0] + results['bbox'] = cv2.transform(bbox, warp_mat).reshape(-1, 8) + results['input_size'] = self.input_size results['warp_mat'] = warp_mat @@ -314,30 +332,32 @@ class BottomupResize(BaseTransform): Required Keys: - img - - img_shape + - ori_shape Modified Keys: - img + - img_shape Added Keys: - input_size - + - warp_mat + - aug_scale Args: input_size (Tuple[int, int]): The input size of the model in [w, h]. 
Note that the actually size of the resized image will be affected by ``resize_mode`` and ``size_factor``, thus may not exactly equals to the ``input_size`` - aux_scales (List[float], optional): The auxiliary input scales for + aug_scales (List[float], optional): The extra input scales for multi-scale testing. If given, the input image will be resized to different scales to build a image pyramid. And heatmaps from all scales will be aggregated to make final prediction. Defaults to ``None`` size_factor (int): The actual input size will be ceiled to a multiple of the `size_factor` value at both sides. - Defaults to 8 + Defaults to 16 resize_mode (str): The method to resize the image to the input size. Options are: @@ -357,14 +377,14 @@ class BottomupResize(BaseTransform): def __init__(self, input_size: Tuple[int, int], - aux_scales: Optional[List[float]] = None, - size_factor: int = 8, + aug_scales: Optional[List[float]] = None, + size_factor: int = 32, resize_mode: str = 'fit', use_udp: bool = False): super().__init__() self.input_size = input_size - self.aux_scales = aux_scales + self.aug_scales = aug_scales self.resize_mode = resize_mode self.size_factor = size_factor self.use_udp = use_udp @@ -374,9 +394,11 @@ def _ceil_to_multiple(size: Tuple[int, int], base: int): """Ceil the given size (tuple of [w, h]) to a multiple of the base.""" return tuple(int(np.ceil(s / base) * base) for s in size) - def _get_actual_size(self, img_size: Tuple[int, int], - input_size: Tuple[int, int]) -> Tuple: - """Calculate the actual input size and the size of the resized image. + def _get_input_size(self, img_size: Tuple[int, int], + input_size: Tuple[int, int]) -> Tuple: + """Calculate the actual input size (which the original image will be + resized to) and the padded input size (which the resized image will be + padded to, or which is the size of the model input). 
Args: img_size (Tuple[int, int]): The original image size in [w, h] @@ -384,44 +406,44 @@ def _get_actual_size(self, img_size: Tuple[int, int], Returns: tuple: - - actual_input_size (Tuple[int, int]): The target size to generate + - actual_input_size (Tuple[int, int]): The target size to resize + the image + - padded_input_size (Tuple[int, int]): The target size to generate the model input which will contain the resized image - - actual_img_size (Tuple[int, int]): The target size to resize the - image """ img_w, img_h = img_size ratio = img_w / img_h if self.resize_mode == 'fit': - actual_input_size = self._ceil_to_multiple(input_size, + padded_input_size = self._ceil_to_multiple(input_size, self.size_factor) - if actual_input_size != input_size: + if padded_input_size != input_size: raise ValueError( 'When ``resize_mode==\'fit\', the input size (height and' ' width) should be mulitples of the size_factor(' f'{self.size_factor}) at all scales. Got invalid input ' f'size {input_size}.') - tgt_w, tgt_h = actual_input_size - rsz_w = min(tgt_w, tgt_h * ratio) - rsz_h = min(tgt_h, tgt_w / ratio) - actual_img_size = (rsz_w, rsz_h) + pad_w, pad_h = padded_input_size + rsz_w = min(pad_w, pad_h * ratio) + rsz_h = min(pad_h, pad_w / ratio) + actual_input_size = (rsz_w, rsz_h) elif self.resize_mode == 'expand': - _actual_input_size = self._ceil_to_multiple( + _padded_input_size = self._ceil_to_multiple( input_size, self.size_factor) - tgt_w, tgt_h = _actual_input_size - rsz_w = max(tgt_w, tgt_h * ratio) - rsz_h = max(tgt_h, tgt_w / ratio) + pad_w, pad_h = _padded_input_size + rsz_w = max(pad_w, pad_h * ratio) + rsz_h = max(pad_h, pad_w / ratio) - actual_img_size = (rsz_w, rsz_h) - actual_input_size = self._ceil_to_multiple(actual_img_size, + actual_input_size = (rsz_w, rsz_h) + padded_input_size = self._ceil_to_multiple(actual_input_size, self.size_factor) else: raise ValueError(f'Invalid resize mode {self.resize_mode}') - return actual_input_size, actual_img_size + return 
actual_input_size, padded_input_size def transform(self, results: Dict) -> Optional[dict]: """The transform function of :class:`BottomupResize` to perform @@ -438,21 +460,17 @@ def transform(self, results: Dict) -> Optional[dict]: """ img = results['img'] - img_h, img_w = results['img_shape'] + img_h, img_w = results['ori_shape'] w, h = self.input_size input_sizes = [(w, h)] - if self.aux_scales: - input_sizes += [(int(w * s), int(h * s)) for s in self.aux_scales] + if self.aug_scales: + input_sizes += [(int(w * s), int(h * s)) for s in self.aug_scales] imgs = [] - warp_mats = [] - actual_input_sizes = [] - actual_img_sizes = [] - - for _w, _h in input_sizes: + for i, (_w, _h) in enumerate(input_sizes): - actual_input_size, actual_img_size = self._get_actual_size( + actual_input_size, padded_input_size = self._get_input_size( img_size=(img_w, img_h), input_size=(_w, _h)) if self.use_udp: @@ -463,32 +481,37 @@ def transform(self, results: Dict) -> Optional[dict]: center=center, scale=scale, rot=0, - output_size=actual_img_size) + output_size=actual_input_size) else: center = np.array([img_w / 2, img_h / 2], dtype=np.float32) - scale = np.array([img_w, img_h], dtype=np.float32) + scale = np.array([ + img_w * padded_input_size[0] / actual_input_size[0], + img_h * padded_input_size[1] / actual_input_size[1] + ], + dtype=np.float32) warp_mat = get_warp_matrix( center=center, scale=scale, rot=0, - output_size=actual_img_size) + output_size=padded_input_size) _img = cv2.warpAffine( - img, warp_mat, actual_input_size, flags=cv2.INTER_LINEAR) + img, warp_mat, padded_input_size, flags=cv2.INTER_LINEAR) imgs.append(_img) - warp_mats.append(warp_mat) - actual_input_sizes.append(actual_input_size) - actual_img_sizes.append(actual_img_size) - if self.aux_scales: + # Store the transform information w.r.t. 
the main input size + if i == 0: + results['img_shape'] = padded_input_size[::-1] + results['input_center'] = center + results['input_scale'] = scale + results['input_size'] = padded_input_size + + if self.aug_scales: results['img'] = imgs + results['aug_scales'] = self.aug_scales else: results['img'] = imgs[0] - - # Store the transform information w.r.t. the main input size - results['warp_mat'] = warp_mats[0] - results['input_size'] = actual_input_sizes[0] - results['img_size'] = actual_img_sizes[0] + results['aug_scale'] = None return results diff --git a/mmpose/datasets/transforms/common_transforms.py b/mmpose/datasets/transforms/common_transforms.py index a29e163df4..13ec3eb965 100644 --- a/mmpose/datasets/transforms/common_transforms.py +++ b/mmpose/datasets/transforms/common_transforms.py @@ -10,6 +10,7 @@ from mmcv.transforms import BaseTransform from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness from mmengine import is_list_of +from mmengine.dist import get_dist_info from scipy.stats import truncnorm from mmpose.codecs import * # noqa: F401, F403 @@ -96,6 +97,7 @@ class RandomFlip(BaseTransform): - img - img_shape - flip_indices + - input_size (optional) - bbox (optional) - bbox_center (optional) - keypoints (optional) @@ -201,7 +203,7 @@ def transform(self, results: dict) -> dict: results['flip'] = True results['flip_direction'] = flip_dir - h, w = results['img_shape'] + h, w = results.get('input_size', results['img_shape']) # flip image and mask if isinstance(results['img'], list): results['img'] = [ @@ -618,7 +620,6 @@ def __init__(self, } else: self.keymap_to_albu = keymap - self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} def albu_builder(self, cfg: dict) -> albumentations: """Import a module from albumentations. 
@@ -639,9 +640,12 @@ def albu_builder(self, cfg: dict) -> albumentations: if mmengine.is_str(obj_type): if albumentations is None: raise RuntimeError('albumentations is not installed') - if not hasattr(albumentations.augmentations.transforms, obj_type): - warnings.warn('{obj_type} is not pixel-level transformations. ' - 'Please use with caution.') + rank, _ = get_dist_info() + if rank == 0 and not hasattr( + albumentations.augmentations.transforms, obj_type): + warnings.warn( + f'{obj_type} is not pixel-level transformations. ' + 'Please use with caution.') obj_cls = getattr(albumentations, obj_type) else: raise TypeError(f'type must be a str, but got {type(obj_type)}') @@ -654,23 +658,6 @@ def albu_builder(self, cfg: dict) -> albumentations: return obj_cls(**args) - @staticmethod - def mapper(d: dict, keymap: dict) -> dict: - """Dictionary mapper. - - Renames keys according to keymap provided. - - Args: - d (dict): old dict - keymap (dict): key mapping like {'old_key': 'new_key'}. - - Returns: - dict: new dict. - """ - - updated_dict = {keymap.get(k, k): v for k, v in d.items()} - return updated_dict - def transform(self, results: dict) -> dict: """The transform function of :class:`Albumentation` to apply albumentations transforms. @@ -684,11 +671,18 @@ def transform(self, results: dict) -> dict: dict: updated result dict. 
""" # map result dict to albumentations format - results = self.mapper(results, self.keymap_to_albu) + results_albu = {} + for k, v in self.keymap_to_albu.items(): + assert k in results, \ + f'The `{k}` is required to perform albumentations transforms' + results_albu[v] = results[k] + # Apply albumentations transforms - results = self.aug(**results) - # map result dict back to the original format - results = self.mapper(results, self.keymap_back) + results_albu = self.aug(**results_albu) + + # map the albu results back to the original format + for k, v in self.keymap_to_albu.items(): + results[k] = results_albu[v] return results @@ -884,58 +878,60 @@ class GenerateTarget(BaseTransform): - keypoints_visible - dataset_keypoint_weights - Added Keys (depends on the args): - - heatmaps - - keypoint_labels - - keypoint_x_labels - - keypoint_y_labels - - keypoint_weights + Added Keys: + + - The keys of the encoded items from the codec will be updated into + the results, e.g. ``'heatmaps'`` or ``'keypoint_weights'``. See + the specific codec for more details. Args: - encoder (dict | list[dict]): The codec config for keypoint encoding - target_type (str): The type of the encoded form of the keypoints. - Should be one of the following options: - - - ``'heatmap'``: The encoded should be instance-irrelevant - heatmaps and will be stored in ``results['heatmaps']`` - - ``'multilevel_heatmap'`` The encoded should be a list of - heatmaps and will be stored in ``results['heatmaps']``. - Note that in this case, ``self.encoder`` should also be - a list, and each encoder encodes a single-level heatmaps. - - ``'keypoint_label'``: The encoded should be instance-level - labels and will be stored in ``results['keypoint_label']`` - - ``'keypoint_xy_label'``: The encoed should be instance-level - labels in x-axis and y-axis respectively. 
They will be stored - in ``results['keypoint_x_label']`` and - ``results['keypoint_y_label']`` - - ``'heatmap+keypoint_label'``: The encoded should be heatmaps and - keypoint_labels, will be stored in ``results['heatmaps']`` - and ``results['keypoint_label']`` + encoder (dict | list[dict]): The codec config for keypoint encoding. + Both single encoder and multiple encoders (given as a list) are + supported + multilevel (bool): Determine the method to handle multiple encoders. + If ``multilevel==True``, generate multilevel targets from a group + of encoders of the same type (e.g. multiple :class:`MSRAHeatmap` + encoders with different sigma values); If ``multilevel==False``, + generate combined targets from a group of different encoders. This + argument will have no effect in case of single encoder. Defaults + to ``False`` use_dataset_keypoint_weights (bool): Whether use the keypoint weights from the dataset meta information. Defaults to ``False`` + target_type (str, deprecated): This argument is deprecated and has no + effect. Defaults to ``None`` """ def __init__(self, encoder: MultiConfig, - target_type: str, + target_type: Optional[str] = None, + multilevel: bool = False, use_dataset_keypoint_weights: bool = False) -> None: super().__init__() + + if target_type is not None: + warnings.warn( + 'The argument `target_type` is deprecated in GenerateTarget. 
' + 'The target type and encoded keys will be determined by ' + 'encoder(s).', DeprecationWarning) + self.encoder_cfg = deepcopy(encoder) - self.target_type = target_type + self.multilevel = multilevel self.use_dataset_keypoint_weights = use_dataset_keypoint_weights - if self.target_type == 'multilevel_heatmap': - if not isinstance(self.encoder_cfg, list): - raise ValueError( - 'The encoder should be a list if target type is ' - '"multilevel_heatmap"') + if isinstance(self.encoder_cfg, list): self.encoder = [ KEYPOINT_CODECS.build(cfg) for cfg in self.encoder_cfg ] else: + assert not self.multilevel, ( + 'Need multiple encoder configs if ``multilevel==True``') self.encoder = KEYPOINT_CODECS.build(self.encoder_cfg) def transform(self, results: Dict) -> Optional[dict]: + """The transform function of :class:`GenerateTarget`. + + See ``transform()`` method of :class:`BaseTransform` for details. + """ if results.get('transformed_keypoints', None) is not None: # use keypoints transformed by TopdownAffine @@ -950,56 +946,83 @@ def transform(self, results: Dict) -> Optional[dict]: keypoints_visible = results['keypoints_visible'] - if self.target_type == 'heatmap': - heatmaps, keypoint_weights = self.encoder.encode( - keypoints=keypoints, keypoints_visible=keypoints_visible) - - results['heatmaps'] = heatmaps - results['keypoint_weights'] = keypoint_weights - - elif self.target_type == 'keypoint_label': - keypoint_labels, keypoint_weights = self.encoder.encode( - keypoints=keypoints, keypoints_visible=keypoints_visible) - - results['keypoint_labels'] = keypoint_labels - results['keypoint_weights'] = keypoint_weights - - elif self.target_type == 'keypoint_xy_label': - x_labels, y_labels, keypoint_weights = self.encoder.encode( - keypoints=keypoints, keypoints_visible=keypoints_visible) - - results['keypoint_x_labels'] = x_labels - results['keypoint_y_labels'] = y_labels - results['keypoint_weights'] = keypoint_weights - - elif self.target_type == 'heatmap+keypoint_label': - 
heatmaps, keypoint_labels, keypoint_weights = self.encoder.encode( - keypoints=keypoints, keypoints_visible=keypoints_visible) - - results['heatmaps'] = heatmaps - results['keypoint_labels'] = keypoint_labels - results['keypoint_weights'] = keypoint_weights - - elif self.target_type == 'multilevel_heatmap': - heatmaps = [] - keypoint_weights = [] - - for encoder in self.encoder: - _heatmaps, _keypoint_weights = encoder.encode( - keypoints=keypoints, keypoints_visible=keypoints_visible) - heatmaps.append(_heatmaps) - keypoint_weights.append(_keypoint_weights) - - results['heatmaps'] = heatmaps - # keypoint_weights.shape: [N, K] -> [N, n, K] - results['keypoint_weights'] = np.stack(keypoint_weights, axis=1) + # Encoded items from the encoder(s) will be updated into the results. + # Please refer to the document of the specific codec for details about + # encoded items. + if not isinstance(self.encoder, list): + # For single encoding, the encoded items will be directly added + # into results. + auxiliary_encode_kwargs = { + key: results[key] + for key in self.encoder.auxiliary_encode_keys + } + encoded = self.encoder.encode( + keypoints=keypoints, + keypoints_visible=keypoints_visible, + **auxiliary_encode_kwargs) else: - raise ValueError(f'Invalid target type {self.target_type}') + encoded_list = [] + for _encoder in self.encoder: + auxiliary_encode_kwargs = { + key: results[key] + for key in _encoder.auxiliary_encode_keys + } + encoded_list.append( + _encoder.encode( + keypoints=keypoints, + keypoints_visible=keypoints_visible, + **auxiliary_encode_kwargs)) + + if self.multilevel: + # For multilevel encoding, the encoded items from each encoder + # should have the same keys. 
+ + keys = encoded_list[0].keys() + if not all(_encoded.keys() == keys + for _encoded in encoded_list): + raise ValueError( + 'Encoded items from all encoders must have the same ' + 'keys if ``multilevel==True``.') + + encoded = { + k: [_encoded[k] for _encoded in encoded_list] + for k in keys + } + + else: + # For combined encoding, the encoded items from different + # encoders should have no overlapping items, except for + # `keypoint_weights`. If multiple `keypoint_weights` are given, + # they will be multiplied as the final `keypoint_weights`. + + encoded = dict() + keypoint_weights = [] + + for _encoded in encoded_list: + for key, value in _encoded.items(): + if key == 'keypoint_weights': + keypoint_weights.append(value) + elif key not in encoded: + encoded[key] = value + else: + raise ValueError( + f'Overlapping item "{key}" from multiple ' + 'encoders, which is not supported when ' + '``multilevel==False``') + + if keypoint_weights: + encoded['keypoint_weights'] = keypoint_weights + + if self.use_dataset_keypoint_weights and 'keypoint_weights' in encoded: + if isinstance(encoded['keypoint_weights'], list): + for w in encoded['keypoint_weights']: + w *= results['dataset_keypoint_weights'] + else: + encoded['keypoint_weights'] *= results[ + 'dataset_keypoint_weights'] - # multiply meta keypoint weight - if self.use_dataset_keypoint_weights: - results['keypoint_weights'] *= results['dataset_keypoint_weights'] + results.update(encoded) return results @@ -1011,7 +1034,6 @@ def __repr__(self) -> str: """ repr_str = self.__class__.__name__ repr_str += (f'(encoder={str(self.encoder_cfg)}, ') - repr_str += (f'(target_type={str(self.target_type)}, ') repr_str += ('use_dataset_keypoint_weights=' f'{self.use_dataset_keypoint_weights})') return repr_str diff --git a/mmpose/datasets/transforms/converting.py b/mmpose/datasets/transforms/converting.py new file mode 100644 index 0000000000..0730808967 --- /dev/null +++ b/mmpose/datasets/transforms/converting.py @@ -0,0 +1,58 
@@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +from mmcv.transforms import BaseTransform + +from mmpose.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class KeypointConverter(BaseTransform): + """Change the order of keypoints according to the given mapping. + + Required Keys: + + - keypoints + - keypoints_visible + + Modified Keys: + + - keypoints + - keypoints_visible + + Args: + num_keypoints (int): The number of keypoints in target dataset. + mapping (list): A list containing mapping indexes. Each element has + format (source_index, target_index) + """ + + def __init__(self, num_keypoints: int, mapping: List[Tuple[int, int]]): + self.num_keypoints = num_keypoints + self.mapping = mapping + + def transform(self, results: dict) -> dict: + num_instances = results['keypoints'].shape[0] + + keypoints = np.zeros((num_instances, self.num_keypoints, 2)) + keypoints_visible = np.zeros((num_instances, self.num_keypoints)) + + source_index, target_index = zip(*self.mapping) + keypoints[:, target_index] = results['keypoints'][:, source_index] + keypoints_visible[:, target_index] = results[ + 'keypoints_visible'][:, source_index] + + results['keypoints'] = keypoints + results['keypoints_visible'] = keypoints_visible + return results + + def __repr__(self) -> str: + """print the basic information of the transform. + + Returns: + str: Formatted string. 
+ """ + repr_str = self.__class__.__name__ + repr_str += f'(num_keypoints={self.num_keypoints}, '\ + f'mapping={self.mapping})' + return repr_str diff --git a/mmpose/datasets/transforms/formatting.py b/mmpose/datasets/transforms/formatting.py index b2b0ed316d..eba9e554aa 100644 --- a/mmpose/datasets/transforms/formatting.py +++ b/mmpose/datasets/transforms/formatting.py @@ -3,7 +3,7 @@ import numpy as np import torch -from mmcv.transforms import BaseTransform, to_tensor +from mmcv.transforms import BaseTransform from mmengine.structures import InstanceData, PixelData from mmengine.utils import is_seq_of @@ -27,8 +27,8 @@ def image_to_tensor(img: Union[np.ndarray, if isinstance(img, np.ndarray): if len(img.shape) < 3: img = np.expand_dims(img, -1) - img = np.ascontiguousarray(img.transpose(2, 0, 1)) - tensor = to_tensor(img) + + tensor = torch.from_numpy(img).permute(2, 0, 1).contiguous() else: assert is_seq_of(img, np.ndarray) tensor = torch.stack([image_to_tensor(_img) for _img in img]) @@ -47,8 +47,13 @@ class PackPoseInputs(BaseTransform): - ``img_id``: id of the image + - ``'category_id'``: the id of the instance category + - ``img_path``: path to the image file + - ``crowd_index`` (optional): measure the crowding level of an image, + defined in CrowdPose dataset + - ``ori_shape``: original shape of the image as a tuple (h, w, c) - ``img_shape``: shape of the image input to the network as a tuple \ @@ -63,15 +68,18 @@ class PackPoseInputs(BaseTransform): - ``flip_indices``: the indices of each keypoint's symmetric keypoint + - ``raw_ann_info`` (optional): raw annotation of the instance(s) + Args: meta_keys (Sequence[str], optional): Meta keys which will be stored in :obj: `PoseDataSample` as meta info. 
Defaults to ``('id', - 'img_id', 'img_path', 'ori_shape', 'img_shape', 'input_size', - 'flip', 'flip_direction', 'flip_indices)`` + 'img_id', 'img_path', 'category_id', 'crowd_index', 'ori_shape', + 'img_shape', 'input_size', 'input_center', 'input_scale', 'flip', + 'flip_direction', 'flip_indices', 'raw_ann_info')`` """ # items in `instance_mapping_table` will be directly packed into - # PoseDataSample without converting to Tensor + # PoseDataSample.gt_instances without converting to Tensor instance_mapping_table = { 'bbox': 'bboxes', 'head_size': 'head_size', @@ -79,24 +87,38 @@ class PackPoseInputs(BaseTransform): 'bbox_scale': 'bbox_scales', 'bbox_score': 'bbox_scores', 'keypoints': 'keypoints', - 'keypoints_visible': 'keypoints_visible' + 'keypoints_visible': 'keypoints_visible', } + # items in `label_mapping_table` will be packed into + # PoseDataSample.gt_instance_labels and converted to Tensor. These items + # will be used for computing losses label_mapping_table = { 'keypoint_labels': 'keypoint_labels', 'keypoint_x_labels': 'keypoint_x_labels', 'keypoint_y_labels': 'keypoint_y_labels', - 'keypoint_weights': 'keypoint_weights' + 'keypoint_weights': 'keypoint_weights', + 'instance_coords': 'instance_coords' } + # items in `field_mapping_table` will be packed into + # PoseDataSample.gt_fields and converted to Tensor. 
These items will be + # used for computing losses field_mapping_table = { 'heatmaps': 'heatmaps', + 'instance_heatmaps': 'instance_heatmaps', + 'heatmap_mask': 'heatmap_mask', + 'heatmap_weights': 'heatmap_weights', + 'displacements': 'displacements', + 'displacement_weights': 'displacement_weights', } def __init__(self, - meta_keys=('id', 'img_id', 'img_path', 'ori_shape', - 'img_shape', 'input_size', 'flip', - 'flip_direction', 'flip_indices'), + meta_keys=('id', 'img_id', 'img_path', 'category_id', + 'crowd_index', 'ori_shape', 'img_shape', + 'input_size', 'input_center', 'input_scale', + 'flip', 'flip_direction', 'flip_indices', + 'raw_ann_info'), pack_transformed=False): self.meta_keys = meta_keys self.pack_transformed = pack_transformed @@ -139,7 +161,16 @@ def transform(self, results: dict) -> dict: gt_instance_labels = InstanceData() for key, packed_key in self.label_mapping_table.items(): if key in results: - gt_instance_labels.set_field(results[key], packed_key) + if isinstance(results[key], list): + # A list of labels is usually generated by combined + # multiple encoders (See ``GenerateTarget`` in + # mmpose/datasets/transforms/common_transforms.py) + # In this case, labels in list should have the same + # shape and will be stacked. + _labels = np.stack(results[key]) + gt_instance_labels.set_field(_labels, packed_key) + else: + gt_instance_labels.set_field(results[key], packed_key) data_sample.gt_instance_labels = gt_instance_labels.to_tensor() # pack fields diff --git a/mmpose/engine/hooks/__init__.py b/mmpose/engine/hooks/__init__.py index 4fa8431d3e..dadb9c5f91 100644 --- a/mmpose/engine/hooks/__init__.py +++ b/mmpose/engine/hooks/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .ema_hook import ExpMomentumEMA from .visualization_hook import PoseVisualizationHook -__all__ = ['PoseVisualizationHook'] +__all__ = ['PoseVisualizationHook', 'ExpMomentumEMA'] diff --git a/mmpose/engine/hooks/ema_hook.py b/mmpose/engine/hooks/ema_hook.py new file mode 100644 index 0000000000..fd1a689f96 --- /dev/null +++ b/mmpose/engine/hooks/ema_hook.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.model import ExponentialMovingAverage +from torch import Tensor + +from mmpose.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(ExponentialMovingAverage): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLOX. + + Ported from ` the implementation of MMDetection + `_. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameter are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. 
+ """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False) -> None: + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int) -> None: + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.mul_(1 - momentum).add_(source_param, alpha=momentum) diff --git a/mmpose/engine/hooks/visualization_hook.py b/mmpose/engine/hooks/visualization_hook.py index 32d4f91cbd..c1fc9f52a0 100644 --- a/mmpose/engine/hooks/visualization_hook.py +++ b/mmpose/engine/hooks/visualization_hook.py @@ -96,11 +96,14 @@ def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, total_curr_iter = runner.iter + batch_idx # Visualize only the first data - img_path = data_batch[0]['data_sample'].get('img_path') + img_path = data_batch['data_samples'][0].get('img_path') img_bytes = self.file_client.get(img_path) img = mmcv.imfrombytes(img_bytes, channel_order='rgb') - data_sample = outputs[0] + + # revert the heatmap on the original image + data_sample = merge_data_samples([data_sample]) + if total_curr_iter % self.interval == 0: self._visualizer.add_datasample( os.path.basename(img_path) if self.show else 'val_img', @@ -147,7 +150,8 @@ def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, out_file = None if self.out_dir is not None: - out_file_name, postfix = 
os.path.basename(img_path).split('.') + out_file_name, postfix = os.path.basename(img_path).rsplit( + '.', 1) index = len([ fname for fname in os.listdir(self.out_dir) if fname.startswith(out_file_name) diff --git a/mmpose/engine/optim_wrappers/__init__.py b/mmpose/engine/optim_wrappers/__init__.py new file mode 100644 index 0000000000..7c0b1f533a --- /dev/null +++ b/mmpose/engine/optim_wrappers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .layer_decay_optim_wrapper import LayerDecayOptimWrapperConstructor + +__all__ = ['LayerDecayOptimWrapperConstructor'] diff --git a/mmpose/engine/optim_wrappers/layer_decay_optim_wrapper.py b/mmpose/engine/optim_wrappers/layer_decay_optim_wrapper.py new file mode 100644 index 0000000000..6513e5593d --- /dev/null +++ b/mmpose/engine/optim_wrappers/layer_decay_optim_wrapper.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dist.utils import get_dist_info +from mmengine.optim import DefaultOptimWrapperConstructor +from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +def get_num_layer_for_vit(var_name, num_max_layer): + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.layers'): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return num_max_layer - 1 + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module(force=True) +class LayerDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor): + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + super().__init__(optim_wrapper_cfg, paramwise_cfg=None) + self.layer_decay_rate = paramwise_cfg.get('layer_decay_rate', 0.5) + + super().__init__(optim_wrapper_cfg, paramwise_cfg) + + def add_params(self, params, module, prefix='', lr=None): + parameter_groups = {} + print(self.paramwise_cfg) + num_layers = self.paramwise_cfg.get('num_layers') 
+ 2 + layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate') + weight_decay = self.base_wd + + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if (len(param.shape) == 1 or name.endswith('.bias') + or 'pos_embed' in name): + group_name = 'no_decay' + this_weight_decay = 0. + else: + group_name = 'decay' + this_weight_decay = weight_decay + layer_id = get_num_layer_for_vit(name, num_layers) + group_name = 'layer_%d_%s' % (layer_id, group_name) + + if group_name not in parameter_groups: + scale = layer_decay_rate**(num_layers - layer_id - 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + params.extend(parameter_groups.values()) diff --git a/mmpose/evaluation/functional/keypoint_eval.py b/mmpose/evaluation/functional/keypoint_eval.py index 3a430c3ff1..060243357b 100644 --- a/mmpose/evaluation/functional/keypoint_eval.py +++ b/mmpose/evaluation/functional/keypoint_eval.py @@ -274,7 +274,7 @@ def simcc_pck_accuracy(output: Tuple[np.ndarray, np.ndarray], N, _, Wx = pred_x.shape _, _, Wy = pred_y.shape - H, W = int(Wx / simcc_split_ratio), int(Wy / simcc_split_ratio) + W, H = int(Wx / simcc_split_ratio), int(Wy / simcc_split_ratio) if normalize is None: normalize = np.tile(np.array([[H, W]]), (N, 1)) diff --git a/mmpose/evaluation/functional/nms.py b/mmpose/evaluation/functional/nms.py index 86a0ab35e0..50bbe1550b 100644 --- 
a/mmpose/evaluation/functional/nms.py +++ b/mmpose/evaluation/functional/nms.py @@ -1,20 +1,23 @@ # ------------------------------------------------------------------------------ # Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# and https://github.com/HRNet/DEKR # Original licence: Copyright (c) Microsoft, under the MIT License. # ------------------------------------------------------------------------------ +from typing import List, Optional + import numpy as np -def nms(dets, thr): +def nms(dets: np.ndarray, thr: float) -> List[int]: """Greedily select boxes with high confidence and overlap <= thr. Args: - dets: [[x1, y1, x2, y2, score]]. - thr: Retain overlap < thr. + dets (np.ndarray): [[x1, y1, x2, y2, score]]. + thr (float): Retain overlap < thr. Returns: - list: Indexes to keep. + list: Indexes to keep. """ if len(dets) == 0: return [] @@ -48,19 +51,38 @@ def nms(dets, thr): return keep -def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): +def oks_iou(g: np.ndarray, + d: np.ndarray, + a_g: float, + a_d: np.ndarray, + sigmas: Optional[np.ndarray] = None, + vis_thr: Optional[float] = None) -> np.ndarray: """Calculate oks ious. + Note: + + - number of keypoints: K + - number of instances: N + Args: - g: Ground truth keypoints. - d: Detected keypoints. - a_g: Area of the ground truth object. - a_d: Area of the detected object. - sigmas: standard deviation of keypoint labelling. - vis_thr: threshold of the keypoint visibility. + g (np.ndarray): The instance to calculate OKS IOU with other + instances. Containing the keypoints coordinates. Shape: (K*3, ) + d (np.ndarray): The rest instances. Containing the keypoints + coordinates. Shape: (N, K*3) + a_g (float): Area of the ground truth object. + a_d (np.ndarray): Area of the detected object. Shape: (N, ) + sigmas (np.ndarray, optional): Keypoint labelling uncertainty. + Please refer to `COCO keypoint evaluation + `__ for more details. 
+ If not given, use the sigmas on COCO dataset. + If specified, shape: (K, ). Defaults to ``None`` + vis_thr(float, optional): Threshold of the keypoint visibility. + If specified, will calculate OKS based on those keypoints whose + visibility higher than vis_thr. If not given, calculate the OKS + based on all keypoints. Defaults to ``None`` Returns: - list: The oks ious. + np.ndarray: The oks ious. """ if sigmas is None: sigmas = np.array([ @@ -86,15 +108,26 @@ def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): return ious -def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): +def oks_nms(kpts_db: List[dict], + thr: float, + sigmas: Optional[np.ndarray] = None, + vis_thr: Optional[float] = None, + score_per_joint: bool = False): """OKS NMS implementations. Args: - kpts_db: keypoints. - thr: Retain overlap < thr. - sigmas: standard deviation of keypoint labelling. - vis_thr: threshold of the keypoint visibility. - score_per_joint: the input scores (in kpts_db) are per joint scores + kpts_db (List[dict]): The keypoints results of the same image. + thr (float): The threshold of NMS. Will retain oks overlap < thr. + sigmas (np.ndarray, optional): Keypoint labelling uncertainty. + Please refer to `COCO keypoint evaluation + `__ for more details. + If not given, use the sigmas on COCO dataset. Defaults to ``None`` + vis_thr(float, optional): Threshold of the keypoint visibility. + If specified, will calculate OKS based on those keypoints whose + visibility higher than vis_thr. If not given, calculate the OKS + based on all keypoints. Defaults to ``None`` + score_per_joint(bool): Whether the input scores (in kpts_db) are + per-joint scores. Defaults to ``False`` Returns: np.ndarray: indexes to keep. 
@@ -128,14 +161,18 @@ def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): return keep -def _rescore(overlap, scores, thr, type='gaussian'): +def _rescore(overlap: np.ndarray, + scores: np.ndarray, + thr: float, + type: str = 'gaussian'): """Rescoring mechanism gaussian or linear. Args: - overlap: calculated ious - scores: target scores. - thr: retain oks overlap < thr. - type: 'gaussian' or 'linear' + overlap (np.ndarray): The calculated oks ious. + scores (np.ndarray): target scores. + thr (float): retain oks overlap < thr. + type (str): The rescoring type. Could be 'gaussian' or 'linear'. + Defaults to ``'gaussian'`` Returns: np.ndarray: indexes to keep @@ -152,20 +189,28 @@ def _rescore(overlap, scores, thr, type='gaussian'): return scores -def soft_oks_nms(kpts_db, - thr, - max_dets=20, - sigmas=None, - vis_thr=None, - score_per_joint=False): +def soft_oks_nms(kpts_db: List[dict], + thr: float, + max_dets: int = 20, + sigmas: Optional[np.ndarray] = None, + vis_thr: Optional[float] = None, + score_per_joint: bool = False): """Soft OKS NMS implementations. Args: - kpts_db - thr: retain oks overlap < thr. - max_dets: max number of detections to keep. - sigmas: Keypoint labelling uncertainty. - score_per_joint: the input scores (in kpts_db) are per joint scores + kpts_db (List[dict]): The keypoints results of the same image. + thr (float): The threshold of NMS. Will retain oks overlap < thr. + max_dets (int): Maximum number of detections to keep. Defaults to 20 + sigmas (np.ndarray, optional): Keypoint labelling uncertainty. + Please refer to `COCO keypoint evaluation + `__ for more details. + If not given, use the sigmas on COCO dataset. Defaults to ``None`` + vis_thr(float, optional): Threshold of the keypoint visibility. + If specified, will calculate OKS based on those keypoints whose + visibility higher than vis_thr. If not given, calculate the OKS + based on all keypoints. 
Defaults to ``None`` + score_per_joint(bool): Whether the input scores (in kpts_db) are + per-joint scores. Defaults to ``False`` Returns: np.ndarray: indexes to keep. @@ -205,3 +250,78 @@ def soft_oks_nms(kpts_db, keep = keep[:keep_cnt] return keep + + +def nearby_joints_nms( + kpts_db: List[dict], + dist_thr: float, + num_nearby_joints_thr: Optional[int] = None, + score_per_joint: bool = False, + max_dets: int = 30, +): + """Nearby joints NMS implementations. Instances with non-maximum scores + will be suppressed if they have too many close joints with other + instances. This function is modified from project + `DEKR`. + + Args: + kpts_db (list[dict]): keypoints and scores. + dist_thr (float): threshold for judging whether two joints are close. + num_nearby_joints_thr (int): threshold for judging whether two + instances are close. + max_dets (int): max number of detections to keep. + score_per_joint (bool): the input scores (in kpts_db) are per joint + scores. + + Returns: + list: indexes to keep. + """ + + assert dist_thr > 0, '`dist_thr` must be greater than 0.' + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'] for k in kpts_db]) + + num_people, num_joints, _ = kpts.shape + if num_nearby_joints_thr is None: + num_nearby_joints_thr = num_joints // 2 + assert num_nearby_joints_thr < num_joints, '`num_nearby_joints_thr` must '\ + 'be less than the number of joints.' 
+ + # compute distance threshold + pose_area = kpts.max(axis=1) - kpts.min(axis=1) + pose_area = np.sqrt(np.power(pose_area, 2).sum(axis=1)) + pose_area = pose_area.reshape(num_people, 1, 1) + pose_area = np.tile(pose_area, (num_people, num_joints)) + close_dist_thr = pose_area * dist_thr + + # count nearby joints between instances + instance_dist = kpts[:, None] - kpts + instance_dist = np.sqrt(np.power(instance_dist, 2).sum(axis=3)) + close_instance_num = (instance_dist < close_dist_thr).sum(2) + close_instance = close_instance_num > num_nearby_joints_thr + + # apply nms + ignored_pose_inds, keep_pose_inds = set(), list() + indexes = np.argsort(scores)[::-1] + for i in indexes: + if i in ignored_pose_inds: + continue + keep_inds = close_instance[i].nonzero()[0] + keep_ind = keep_inds[np.argmax(scores[keep_inds])] + if keep_ind not in ignored_pose_inds: + keep_pose_inds.append(keep_ind) + ignored_pose_inds = ignored_pose_inds.union(set(keep_inds)) + + # limit the number of output instances + if max_dets > 0 and len(keep_pose_inds) > max_dets: + sub_inds = np.argsort(scores[keep_pose_inds])[-1:-max_dets - 1:-1] + keep_pose_inds = [keep_pose_inds[i] for i in sub_inds] + + return keep_pose_inds diff --git a/mmpose/evaluation/metrics/__init__.py b/mmpose/evaluation/metrics/__init__.py index ea9b08a76c..f02c353ef7 100644 --- a/mmpose/evaluation/metrics/__init__.py +++ b/mmpose/evaluation/metrics/__init__.py @@ -3,9 +3,11 @@ from .coco_wholebody_metric import CocoWholeBodyMetric from .keypoint_2d_metrics import (AUC, EPE, NME, JhmdbPCKAccuracy, MpiiPCKAccuracy, PCKAccuracy) +from .keypoint_partition_metric import KeypointPartitionMetric from .posetrack18_metric import PoseTrack18Metric __all__ = [ 'CocoMetric', 'PCKAccuracy', 'MpiiPCKAccuracy', 'JhmdbPCKAccuracy', 'AUC', - 'EPE', 'NME', 'PoseTrack18Metric', 'CocoWholeBodyMetric' + 'EPE', 'NME', 'PoseTrack18Metric', 'CocoWholeBodyMetric', + 'KeypointPartitionMetric' ] diff --git a/mmpose/evaluation/metrics/coco_metric.py 
b/mmpose/evaluation/metrics/coco_metric.py index b18a3cdb07..9b8bb706f2 100644 --- a/mmpose/evaluation/metrics/coco_metric.py +++ b/mmpose/evaluation/metrics/coco_metric.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import datetime import os.path as osp import tempfile from collections import OrderedDict, defaultdict @@ -17,7 +18,7 @@ @METRICS.register_module() class CocoMetric(BaseMetric): - """COCO evaluation metric. + """COCO pose estimation task evaluation metric. Evaluate AR, AP, and mAP for keypoint detection tasks. Support COCO dataset and other datasets in COCO format. Please refer to @@ -25,15 +26,17 @@ class CocoMetric(BaseMetric): for more details. Args: - ann_file (str): Path to the coco format annotation file. + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None use_area (bool): Whether to use ``'area'`` message in the annotations. If the ground truth annotations (e.g. CrowdPose, AIC) do not have the field ``'area'``, please set ``use_area=False``. - Default: ``True``. + Defaults to ``True`` iou_type (str): The same parameter as `iouType` in :class:`xtcocotools.COCOeval`, which can be ``'keypoints'``, or ``'keypoints_crowd'`` (used in CrowdPose dataset). - Defaults to ``'keypoints'``. + Defaults to ``'keypoints'`` score_mode (str): The mode to score the prediction results which should be one of the following options: @@ -68,22 +71,22 @@ class CocoMetric(BaseMetric): doing quantitative evaluation. This is designed for the need of test submission when the ground truth annotations are absent. If set to ``True``, ``outfile_prefix`` should specify the path to - store the output results. Default: ``False``. + store the output results. Defaults to ``False`` outfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., ``'a/b/prefix'``. 
- If not specified, a temp file will be created. Default: ``None``. + If not specified, a temp file will be created. Defaults to ``None`` collect_device (str): Device name used for collecting results from different ranks during distributed training. Must be ``'cpu'`` or - ``'gpu'``. Default: ``'cpu'``. + ``'gpu'``. Defaults to ``'cpu'`` prefix (str, optional): The prefix that will be added in the metric names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, ``self.default_prefix`` - will be used instead. Default: ``None``. + will be used instead. Defaults to ``None`` """ default_prefix: Optional[str] = 'coco' def __init__(self, - ann_file: str, + ann_file: Optional[str] = None, use_area: bool = True, iou_type: str = 'keypoints', score_mode: str = 'bbox_keypoint', @@ -95,14 +98,18 @@ def __init__(self, collect_device: str = 'cpu', prefix: Optional[str] = None) -> None: super().__init__(collect_device=collect_device, prefix=prefix) - # initialize coco helper with the annotation json file self.ann_file = ann_file - self.coco = COCO(ann_file) + # initialize coco helper with the annotation json file + # if ann_file is not specified, initialize with the converted dataset + if ann_file is not None: + self.coco = COCO(ann_file) + else: + self.coco = None self.use_area = use_area self.iou_type = iou_type - allowed_score_modes = ['bbox', 'bbox_keypoint', 'bbox_rle'] + allowed_score_modes = ['bbox', 'bbox_keypoint', 'bbox_rle', 'keypoint'] if score_mode not in allowed_score_modes: raise ValueError( "`score_mode` should be one of 'bbox', 'bbox_keypoint', " @@ -123,13 +130,13 @@ def __init__(self, 'None when `format_only` is True, otherwise the result file '\ 'will be saved to a temp directory which will be cleaned up '\ 'in the end.' 
- else: + elif ann_file is not None: # do evaluation only if the ground truth annotations exist assert 'annotations' in load(ann_file), \ 'Ground truth annotations are required for evaluation '\ 'when `format_only` is False.' - self.format_only = format_only + self.format_only = format_only self.outfile_prefix = outfile_prefix def process(self, data_batch: Sequence[dict], @@ -163,19 +170,149 @@ def process(self, data_batch: Sequence[dict], keypoint_scores = data_sample['pred_instances']['keypoint_scores'] assert keypoint_scores.shape == keypoints.shape[:2] - result = dict() - result['id'] = data_sample['id'] - result['img_id'] = data_sample['img_id'] - result['keypoints'] = keypoints - result['keypoint_scores'] = keypoint_scores - result['bbox_scores'] = data_sample['gt_instances']['bbox_scores'] + # parse prediction results + pred = dict() + pred['id'] = data_sample['id'] + pred['img_id'] = data_sample['img_id'] + pred['keypoints'] = keypoints + pred['keypoint_scores'] = keypoint_scores + pred['category_id'] = data_sample.get('category_id', 1) + + if ('bbox_scores' not in data_sample['gt_instances'] + or len(data_sample['gt_instances']['bbox_scores']) != + len(keypoints)): + # bottom-up models might output different number of + # instances from annotation + bbox_scores = np.ones(len(keypoints)) + else: + bbox_scores = data_sample['gt_instances']['bbox_scores'] + pred['bbox_scores'] = bbox_scores # get area information if 'bbox_scales' in data_sample['gt_instances']: - result['areas'] = np.prod( + pred['areas'] = np.prod( data_sample['gt_instances']['bbox_scales'], axis=1) + + # parse gt + gt = dict() + if self.coco is None: + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self.iou_type == 'keypoints_crowd': + assert 'crowd_index' in data_sample, \ + '`crowd_index` is required when `self.iou_type` is ' \ + '`keypoints_crowd`' + gt['crowd_index'] = data_sample['crowd_index'] + assert 
'raw_ann_info' in data_sample, \ + 'The row ground truth annotations are required for ' \ + 'evaluation when `ann_file` is not provided' + anns = data_sample['raw_ann_info'] + gt['raw_ann_info'] = anns if isinstance(anns, list) else [anns] + # add converted result to the results list - self.results.append(result) + self.results.append((pred, gt)) + + def gt_to_coco_json(self, gt_dicts: Sequence[dict], + outfile_prefix: str) -> str: + """Convert ground truth to coco format json file. + + Args: + gt_dicts (Sequence[dict]): Ground truth of the dataset. Each dict + contains the ground truth information about the data sample. + Required keys of each `gt_dict` in `gt_dicts`: + - `img_id`: image id of the data sample + - `width`: original image width + - `height`: original image height + - `raw_ann_info`: the raw annotation information + Optional keys: + - `crowd_index`: measure the crowding level of an image, + defined in CrowdPose dataset + It is worth mentioning that, in order to compute `CocoMetric`, + there are some required keys in the `raw_ann_info`: + - `id`: the id to distinguish different annotations + - `image_id`: the image id of this annotation + - `category_id`: the category of the instance. + - `bbox`: the object bounding box + - `keypoints`: the keypoint coordinates along with their + visibilities. Note that it needs to be aligned + with the official COCO format, e.g., a list with length + N * 3, in which N is the number of keypoints. And each + triplet represents the [x, y, visible] of the keypoint. + - `iscrowd`: indicating whether the annotation is a crowd. + It is useful when matching the detection results to + the ground truth. + There are some optional keys as well: + - `area`: it is necessary when `self.use_area` is `True` + - `num_keypoints`: it is necessary when `self.iou_type` + is set as `keypoints_crowd`. + outfile_prefix (str): The filename prefix of the json files. 
If the + prefix is "somepath/xxx", the json file will be named + "somepath/xxx.gt.json". + Returns: + str: The filename of the json file. + """ + image_infos = [] + annotations = [] + img_ids = [] + ann_ids = [] + + for gt_dict in gt_dicts: + # filter duplicate image_info + if gt_dict['img_id'] not in img_ids: + image_info = dict( + id=gt_dict['img_id'], + width=gt_dict['width'], + height=gt_dict['height'], + ) + if self.iou_type == 'keypoints_crowd': + image_info['crowdIndex'] = gt_dict['crowd_index'] + + image_infos.append(image_info) + img_ids.append(gt_dict['img_id']) + + # filter duplicate annotations + for ann in gt_dict['raw_ann_info']: + if ann is None: + # during evaluation on bottom-up datasets, some images + # do not have instance annotation + continue + + annotation = dict( + id=ann['id'], + image_id=ann['image_id'], + category_id=ann['category_id'], + bbox=ann['bbox'], + keypoints=ann['keypoints'], + iscrowd=ann['iscrowd'], + ) + if self.use_area: + assert 'area' in ann, \ + '`area` is required when `self.use_area` is `True`' + annotation['area'] = ann['area'] + + if self.iou_type == 'keypoints_crowd': + assert 'num_keypoints' in ann, \ + '`num_keypoints` is required when `self.iou_type` ' \ + 'is `keypoints_crowd`' + annotation['num_keypoints'] = ann['num_keypoints'] + + annotations.append(annotation) + ann_ids.append(ann['id']) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmpose CocoMetric.') + coco_json = dict( + info=info, + images=image_infos, + categories=self.dataset_meta['CLASSES'], + licenses=None, + annotations=annotations, + ) + converted_json_path = f'{outfile_prefix}.gt.json' + dump(coco_json, converted_json_path, sort_keys=True, indent=4) + return converted_json_path def compute_metrics(self, results: list) -> Dict[str, float]: """Compute the metrics from processed results. 
@@ -189,6 +326,9 @@ def compute_metrics(self, results: list) -> Dict[str, float]: """ logger: MMLogger = MMLogger.get_current_instance() + # split prediction and gt list + preds, gts = zip(*results) + tmp_dir = None if self.outfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() @@ -196,25 +336,33 @@ def compute_metrics(self, results: list) -> Dict[str, float]: else: outfile_prefix = self.outfile_prefix + if self.coco is None: + # use converted gt json file to initialize coco helper + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self.coco = COCO(coco_json_path) + kpts = defaultdict(list) - # group the results by img_id - for result in results: - img_id = result['img_id'] - for idx in range(len(result['bbox_scores'])): + # group the preds by img_id + for pred in preds: + img_id = pred['img_id'] + for idx in range(len(pred['keypoints'])): instance = { - 'id': result['id'], - 'img_id': result['img_id'], - 'keypoints': result['keypoints'][idx], - 'keypoint_scores': result['keypoint_scores'][idx], - 'bbox_score': result['bbox_scores'][idx], + 'id': pred['id'], + 'img_id': pred['img_id'], + 'category_id': pred['category_id'], + 'keypoints': pred['keypoints'][idx], + 'keypoint_scores': pred['keypoint_scores'][idx], + 'bbox_score': pred['bbox_scores'][idx], } - if 'areas' in result: - instance['area'] = result['areas'][idx] + if 'areas' in pred: + instance['area'] = pred['areas'][idx] else: # use keypoint to calculate bbox and get area - keypoints = result['keypoints'][idx] + keypoints = pred['keypoints'][idx] area = ( np.max(keypoints[:, 0]) - np.min(keypoints[:, 0])) * ( np.max(keypoints[:, 1]) - np.min(keypoints[:, 1])) @@ -238,6 +386,8 @@ def compute_metrics(self, results: list) -> Dict[str, float]: axis=-1) if self.score_mode == 'bbox': instance['score'] = instance['bbox_score'] + elif self.score_mode == 'keypoint': + instance['score'] = 
np.mean(instance['keypoint_scores']) else: bbox_score = instance['bbox_score'] if self.score_mode == 'bbox_rle': @@ -303,7 +453,6 @@ def results2json(self, keypoints: Dict[int, list], str: The json file name of keypoint results. """ # the results with category_id - cat_id = 1 cat_results = [] for _, img_kpts in keypoints.items(): @@ -315,7 +464,7 @@ def results2json(self, keypoints: Dict[int, list], result = [{ 'image_id': img_kpt['img_id'], - 'category_id': cat_id, + 'category_id': img_kpt['category_id'], 'keypoints': keypoint.tolist(), 'score': float(img_kpt['score']), } for img_kpt, keypoint in zip(img_kpts, _keypoints)] @@ -347,10 +496,16 @@ def _do_python_keypoint_eval(self, outfile_prefix: str) -> list: coco_eval.accumulate() coco_eval.summarize() - stats_names = [ - 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', - 'AR .75', 'AR (M)', 'AR (L)' - ] + if self.iou_type == 'keypoints_crowd': + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AR', 'AR .5', 'AR .75', 'AP(E)', + 'AP(M)', 'AP(H)' + ] + else: + stats_names = [ + 'AP', 'AP .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', + 'AR .75', 'AR (M)', 'AR (L)' + ] info_str = list(zip(stats_names, coco_eval.stats)) @@ -386,264 +541,3 @@ def _sort_and_unique_bboxes(self, del kpts[img_id][i] return kpts - - -@METRICS.register_module() -class AP10KCocoMetric(CocoMetric): - """AP-10K evaluation metric. - - Evaluate AR, AP, and mAP for keypoint detection tasks. Support AP-10K - dataset with annotations in COCO format. Please refer to - `COCO keypoint evaluation `__ - for more details. - - Args: - ann_file (str): Path to the coco format annotation file. - use_area (bool): Whether to use ``'area'`` message in the annotations. - If the ground truth annotations (e.g. CrowdPose, AIC) do not have - the field ``'area'``, please set ``use_area=False``. - Default: ``True``. 
- iou_type (str): The same parameter as `iouType` in - :class:`xtcocotools.COCOeval`, which can be ``'keypoints'``, or - ``'keypoints_crowd'`` (used in CrowdPose dataset). - Defaults to ``'keypoints'``. - score_mode (str): The mode to score the prediction results which - should be one of the following options: - - - ``'bbox'``: Take the score of bbox as the score of the - prediction results. - - ``'bbox_keypoint'``: Use keypoint score to rescore the - prediction results. - - ``'bbox_rle'``: Use rle_score to rescore the - prediction results. - - Defaults to ``'bbox_keypoint'` - keypoint_score_thr (float): The threshold of keypoint score. The - keypoints with score lower than it will not be included to - rescore the prediction results. Valid only when ``score_mode`` is - ``bbox_keypoint``. Defaults to ``0.2`` - nms_mode (str): The mode to perform Non-Maximum Suppression (NMS), - which should be one of the following options: - - - ``'oks_nms'``: Use Object Keypoint Similarity (OKS) to - perform NMS. - - ``'soft_oks_nms'``: Use Object Keypoint Similarity (OKS) - to perform soft NMS. - - ``'none'``: Do not perform NMS. Typically for bottomup mode - output. - - Defaults to ``'oks_nms'` - nms_thr (float): The Object Keypoint Similarity (OKS) threshold - used in NMS when ``nms_mode`` is ``'oks_nms'`` or - ``'soft_oks_nms'``. Will retain the prediction results with OKS - lower than ``nms_thr``. Defaults to ``0.9`` - format_only (bool): Whether only format the output results without - doing quantitative evaluation. This is designed for the need of - test submission when the ground truth annotations are absent. If - set to ``True``, ``outfile_prefix`` should specify the path to - store the output results. Default: ``False``. - outfile_prefix (str | None): The prefix of json files. It includes - the file path and the prefix of filename, e.g., ``'a/b/prefix'``. - If not specified, a temp file will be created. Default: ``None``. 
- collect_device (str): Device name used for collecting results from - different ranks during distributed training. Must be ``'cpu'`` or - ``'gpu'``. Default: ``'cpu'``. - prefix (str, optional): The prefix that will be added in the metric - names to disambiguate homonymous metrics of different evaluators. - If prefix is not provided in the argument, ``self.default_prefix`` - will be used instead. Default: ``None``. - """ - - def process(self, data_batch: Sequence[dict], - predictions: Sequence[dict]) -> None: - """Process one batch of data samples and predictions. The processed - results should be stored in ``self.results``, which will be used to - compute the metrics when all batches have been processed. - - Args: - data_batch (Sequence[dict]): A batch of data - from the dataloader. - predictions (Sequence[dict]): A batch of outputs from - the model, each of which has the following keys: - - - 'id': The id of the sample - - 'img_id': The image_id of the sample - - 'category': The category of instance(s) - - 'pred_instances': The prediction results of instance(s) - """ - for _, pred in zip(data_batch, predictions): - if 'pred_instances' not in pred: - raise ValueError( - '`pred_instances` are required to process the ' - f'predictions results in {self.__class__.__name__}. 
') - - # keypoints.shape: [N, K, 2], - # N: number of instances, K: number of keypoints - # for topdown-style output, N is usually 1, while for - # bottomup-style output, N is the number of instances in the image - keypoints = pred['pred_instances']['keypoints'] - # [N, K], the scores for all keypoints of all instances - keypoint_scores = pred['pred_instances']['keypoint_scores'] - assert keypoint_scores.shape == keypoints.shape[:2] - - result = dict() - result['id'] = pred['id'] - result['img_id'] = pred['img_id'] - result['keypoints'] = keypoints - result['keypoint_scores'] = keypoint_scores - result['bbox_scores'] = pred['pred_instances']['bbox_scores'] - result['category'] = pred['category'] - - # get area information - if 'bbox_scales' in pred['gt_instances']: - result['areas'] = np.prod( - pred['gt_instances']['bbox_scales'], axis=1) - # add converted result to the results list - self.results.append(result) - - def compute_metrics(self, results: list) -> Dict[str, float]: - """Compute the metrics from processed results. - - Args: - results (list): The processed results of each batch. - - Returns: - Dict[str, float]: The computed metrics. The keys are the names of - the metrics, and the values are corresponding results. 
- """ - logger: MMLogger = MMLogger.get_current_instance() - - tmp_dir = None - if self.outfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - outfile_prefix = osp.join(tmp_dir.name, 'results') - else: - outfile_prefix = self.outfile_prefix - - kpts = defaultdict(list) - - # group the results by img_id - for result in results: - img_id = result['img_id'] - for idx in range(len(result['bbox_scores'])): - instance = { - 'id': result['id'], - 'img_id': result['img_id'], - 'keypoints': result['keypoints'][idx], - 'keypoint_scores': result['keypoint_scores'][idx], - 'bbox_score': result['bbox_scores'][idx], - 'category': result['category'], - } - - if 'areas' in result: - instance['area'] = result['areas'][idx] - else: - # use keypoint to calculate bbox and get area - keypoints = result['keypoints'][idx] - area = ( - np.max(keypoints[:, 0]) - np.min(keypoints[:, 0])) * ( - np.max(keypoints[:, 1]) - np.min(keypoints[:, 1])) - instance['area'] = area - - kpts[img_id].append(instance) - - # sort keypoint results according to id and remove duplicate ones - kpts = self._sort_and_unique_bboxes(kpts, key='id') - - # score the prediction results according to `score_mode` - # and perform NMS according to `nms_mode` - valid_kpts = defaultdict(list) - num_keypoints = self.dataset_meta['num_keypoints'] - for img_id, instances in kpts.items(): - for instance in instances: - # concatenate the keypoint coordinates and scores - instance['keypoints'] = np.concatenate([ - instance['keypoints'], instance['keypoint_scores'][:, None] - ], - axis=-1) - if self.score_mode == 'bbox': - instance['score'] = instance['bbox_score'] - else: - bbox_score = instance['bbox_score'] - if self.score_mode == 'bbox_rle': - keypoint_scores = instance['keypoint_scores'] - instance['score'] = float(bbox_score + - np.mean(keypoint_scores) + - np.max(keypoint_scores)) - - else: # self.score_mode == 'bbox_keypoint': - mean_kpt_score = 0 - valid_num = 0 - for kpt_idx in range(num_keypoints): - 
kpt_score = instance['keypoint_scores'][kpt_idx] - if kpt_score > self.keypoint_score_thr: - mean_kpt_score += kpt_score - valid_num += 1 - if valid_num != 0: - mean_kpt_score /= valid_num - instance['score'] = bbox_score * mean_kpt_score - # perform nms - if self.nms_mode == 'none': - valid_kpts[img_id] = instances - else: - nms = oks_nms if self.nms_mode == 'oks_nms' else soft_oks_nms - keep = nms( - instances, - self.nms_thr, - sigmas=self.dataset_meta['sigmas']) - valid_kpts[img_id] = [instances[_keep] for _keep in keep] - - # convert results to coco style and dump into a json file - self.results2json(valid_kpts, outfile_prefix=outfile_prefix) - - # only format the results without doing quantitative evaluation - if self.format_only: - logger.info('results are saved in ' - f'{osp.dirname(outfile_prefix)}') - return {} - - # evaluation results - eval_results = OrderedDict() - logger.info(f'Evaluating {self.__class__.__name__}...') - info_str = self._do_python_keypoint_eval(outfile_prefix) - name_value = OrderedDict(info_str) - eval_results.update(name_value) - - if tmp_dir is not None: - tmp_dir.cleanup() - return eval_results - - def results2json(self, keypoints: Dict[int, list], - outfile_prefix: str) -> str: - """Dump the keypoint detection results to a COCO style json file. - - Args: - keypoints (Dict[int, list]): Keypoint detection results - of the dataset. - outfile_prefix (str): The filename prefix of the json files. If the - prefix is "somepath/xxx", the json files will be named - "somepath/xxx.keypoints.json", - - Returns: - str: The json file name of keypoint results. 
- """ - cat_results = [] - - for _, img_kpts in keypoints.items(): - _keypoints = np.array( - [img_kpt['keypoints'] for img_kpt in img_kpts]) - num_keypoints = self.dataset_meta['num_keypoints'] - # collect all the person keypoints in current image - _keypoints = _keypoints.reshape(-1, num_keypoints * 3) - - result = [{ - 'image_id': img_kpt['img_id'], - 'category_id': img_kpt['category'], - 'keypoints': keypoint.tolist(), - 'score': float(img_kpt['score']), - } for img_kpt, keypoint in zip(img_kpts, _keypoints)] - - cat_results.extend(result) - - res_file = f'{outfile_prefix}.keypoints.json' - dump(cat_results, res_file, sort_keys=True, indent=4) diff --git a/mmpose/evaluation/metrics/coco_wholebody_metric.py b/mmpose/evaluation/metrics/coco_wholebody_metric.py index 34d81aed20..c5675f54c8 100644 --- a/mmpose/evaluation/metrics/coco_wholebody_metric.py +++ b/mmpose/evaluation/metrics/coco_wholebody_metric.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Optional +import datetime +from typing import Dict, Optional, Sequence import numpy as np from mmengine.fileio import dump @@ -19,15 +20,17 @@ class CocoWholeBodyMetric(CocoMetric): for more details. Args: - ann_file (str): Path to the coco format annotation file. + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None use_area (bool): Whether to use ``'area'`` message in the annotations. If the ground truth annotations (e.g. CrowdPose, AIC) do not have the field ``'area'``, please set ``use_area=False``. - Default: ``True``. + Defaults to ``True`` iou_type (str): The same parameter as `iouType` in :class:`xtcocotools.COCOeval`, which can be ``'keypoints'``, or ``'keypoints_crowd'`` (used in CrowdPose dataset). - Defaults to ``'keypoints'``. 
+ Defaults to ``'keypoints'`` score_mode (str): The mode to score the prediction results which should be one of the following options: @@ -62,17 +65,11 @@ class CocoWholeBodyMetric(CocoMetric): doing quantitative evaluation. This is designed for the need of test submission when the ground truth annotations are absent. If set to ``True``, ``outfile_prefix`` should specify the path to - store the output results. Default: ``False``. + store the output results. Defaults to ``False`` outfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., ``'a/b/prefix'``. - If not specified, a temp file will be created. Default: ``None``. - collect_device (str): Device name used for collecting results from - different ranks during distributed training. Must be ``'cpu'`` or - ``'gpu'``. Default: ``'cpu'``. - prefix (str, optional): The prefix that will be added in the metric - names to disambiguate homonymous metrics of different evaluators. - If prefix is not provided in the argument, ``self.default_prefix`` - will be used instead. Default: ``None``. + If not specified, a temp file will be created. Defaults to ``None`` + **kwargs: Keyword parameters passed to :class:`mmeval.BaseMetric` """ default_prefix: Optional[str] = 'coco-wholebody' body_num = 17 @@ -81,6 +78,101 @@ class CocoWholeBodyMetric(CocoMetric): left_hand_num = 21 right_hand_num = 21 + def gt_to_coco_json(self, gt_dicts: Sequence[dict], + outfile_prefix: str) -> str: + """Convert ground truth to coco format json file. + + Args: + gt_dicts (Sequence[dict]): Ground truth of the dataset. Each dict + contains the ground truth information about the data sample. 
+ Required keys of each `gt_dict` in `gt_dicts`: + - `img_id`: image id of the data sample + - `width`: original image width + - `height`: original image height + - `raw_ann_info`: the raw annotation information + Optional keys: + - `crowd_index`: measure the crowding level of an image, + defined in CrowdPose dataset + It is worth mentioning that, in order to compute `CocoMetric`, + there are some required keys in the `raw_ann_info`: + - `id`: the id to distinguish different annotations + - `image_id`: the image id of this annotation + - `category_id`: the category of the instance. + - `bbox`: the object bounding box + - `keypoints`: the keypoints coordinates along with their + visibilities. Note that it needs to be aligned + with the official COCO format, e.g., a list with length + N * 3, in which N is the number of keypoints. And each + triplet represents the [x, y, visible] of the keypoint. + - `foot_kpts`, `face_kpts`, `lefthand_kpts`, `righthand_kpts` + - `iscrowd`: indicating whether the annotation is a crowd. + It is useful when matching the detection results to + the ground truth. + There are some optional keys as well: + - `area`: it is necessary when `self.use_area` is `True` + - `num_keypoints`: it is necessary when `self.iou_type` + is set as `keypoints_crowd`. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json file will be named + "somepath/xxx.gt.json". + Returns: + str: The filename of the json file. 
+ """ + image_infos = [] + annotations = [] + img_ids = [] + ann_ids = [] + + for gt_dict in gt_dicts: + # filter duplicate image_info + if gt_dict['img_id'] not in img_ids: + image_info = dict( + id=gt_dict['img_id'], + width=gt_dict['width'], + height=gt_dict['height'], + ) + if self.iou_type == 'keypoints_crowd': + image_info['crowdIndex'] = gt_dict['crowd_index'] + + image_infos.append(image_info) + img_ids.append(gt_dict['img_id']) + + # filter duplicate annotations + for ann in gt_dict['raw_ann_info']: + annotation = dict( + id=ann['id'], + image_id=ann['image_id'], + category_id=ann['category_id'], + bbox=ann['bbox'], + keypoints=ann['keypoints'], + foot_kpts=ann['foot_kpts'], + face_kpts=ann['face_kpts'], + lefthand_kpts=ann['lefthand_kpts'], + righthand_kpts=ann['righthand_kpts'], + iscrowd=ann['iscrowd'], + ) + if self.use_area: + assert 'area' in ann, \ + '`area` is required when `self.use_area` is `True`' + annotation['area'] = ann['area'] + + annotations.append(annotation) + ann_ids.append(ann['id']) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmpose CocoMetric.') + coco_json: dict = dict( + info=info, + images=image_infos, + categories=self.dataset_meta['CLASSES'], + licenses=None, + annotations=annotations, + ) + converted_json_path = f'{outfile_prefix}.gt.json' + dump(coco_json, converted_json_path, sort_keys=True, indent=4) + return converted_json_path + def results2json(self, keypoints: Dict[int, list], outfile_prefix: str) -> str: """Dump the keypoint detection results to a COCO style json file. diff --git a/mmpose/evaluation/metrics/keypoint_2d_metrics.py b/mmpose/evaluation/metrics/keypoint_2d_metrics.py index 204000bdf7..c6a63f1e51 100644 --- a/mmpose/evaluation/metrics/keypoint_2d_metrics.py +++ b/mmpose/evaluation/metrics/keypoint_2d_metrics.py @@ -37,8 +37,35 @@ class PCKAccuracy(BaseMetric): names to disambiguate homonymous metrics of different evaluators. 
If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. + + Examples: + + >>> from mmpose.evaluation.metrics import PCKAccuracy + >>> import numpy as np + >>> from mmengine.structures import InstanceData + >>> num_keypoints = 15 + >>> keypoints = np.random.random((1, num_keypoints, 2)) * 10 + >>> gt_instances = InstanceData() + >>> gt_instances.keypoints = keypoints + >>> gt_instances.keypoints_visible = np.ones( + ... (1, num_keypoints, 1)).astype(bool) + >>> gt_instances.bboxes = np.random.random((1, 4)) * 20 + >>> pred_instances = InstanceData() + >>> pred_instances.keypoints = keypoints + >>> data_sample = { + ... 'gt_instances': gt_instances.to_dict(), + ... 'pred_instances': pred_instances.to_dict(), + ... } + >>> data_samples = [data_sample] + >>> data_batch = [{'inputs': None}] + >>> pck_metric = PCKAccuracy(thr=0.5, norm_item='bbox') + ...: UserWarning: The prefix is not set in metric class PCKAccuracy. + >>> pck_metric.process(data_batch, data_samples) + >>> pck_metric.evaluate(1) + 10/26 15:37:57 - mmengine - INFO - Evaluating PCKAccuracy (normalized by ``"bbox_size"``)... # noqa + {'PCK': 1.0} + """ - default_prefix: Optional[str] = 'pck' def __init__(self, thr: float = 0.05, @@ -126,6 +153,10 @@ def compute_metrics(self, results: list) -> Dict[str, float]: Returns: Dict[str, float]: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. + The returned result dict may have the following keys: + - 'PCK': The pck accuracy normalized by `bbox_size`. + - 'PCKh': The pck accuracy normalized by `head_size`. + - 'tPCK': The pck accuracy normalized by `torso_size`. 
""" logger: MMLogger = MMLogger.get_current_instance() @@ -147,7 +178,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: _, pck, _ = keypoint_pck_accuracy(pred_coords, gt_coords, mask, self.thr, norm_size_bbox) - metrics[f'PCK@{self.thr}'] = pck + metrics['PCK'] = pck if 'head' in self.norm_item: norm_size_head = np.concatenate( @@ -158,7 +189,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: _, pckh, _ = keypoint_pck_accuracy(pred_coords, gt_coords, mask, self.thr, norm_size_head) - metrics[f'PCKh@{self.thr}'] = pckh + metrics['PCKh'] = pckh if 'torso' in self.norm_item: norm_size_torso = np.concatenate( @@ -169,7 +200,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: _, tpck, _ = keypoint_pck_accuracy(pred_coords, gt_coords, mask, self.thr, norm_size_torso) - metrics[f'tPCK@{self.thr}'] = tpck + metrics['tPCK'] = tpck return metrics @@ -195,7 +226,7 @@ class MpiiPCKAccuracy(PCKAccuracy): thr(float): Threshold of PCK calculation. Default: 0.05. norm_item (str | Sequence[str]): The item used for normalization. Valid items include 'bbox', 'head', 'torso', which correspond - to 'PCK', 'PCKh' and 'tPCK' respectively. Default: ``'bbox'``. + to 'PCK', 'PCKh' and 'tPCK' respectively. Default: ``'head'``. collect_device (str): Device name used for collecting results from different ranks during distributed training. Must be ``'cpu'`` or ``'gpu'``. Default: ``'cpu'``. @@ -203,8 +234,36 @@ class MpiiPCKAccuracy(PCKAccuracy): names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. 
+ + Examples: + + >>> from mmpose.evaluation.metrics import MpiiPCKAccuracy + >>> import numpy as np + >>> from mmengine.structures import InstanceData + >>> num_keypoints = 16 + >>> keypoints = np.random.random((1, num_keypoints, 2)) * 10 + >>> gt_instances = InstanceData() + >>> gt_instances.keypoints = keypoints + 1.0 + >>> gt_instances.keypoints_visible = np.ones( + ... (1, num_keypoints, 1)).astype(bool) + >>> gt_instances.head_size = np.random.random((1, 1)) * 10 + >>> pred_instances = InstanceData() + >>> pred_instances.keypoints = keypoints + >>> data_sample = { + ... 'gt_instances': gt_instances.to_dict(), + ... 'pred_instances': pred_instances.to_dict(), + ... } + >>> data_samples = [data_sample] + >>> data_batch = [{'inputs': None}] + >>> mpii_pck_metric = MpiiPCKAccuracy(thr=0.3, norm_item='head') + ... UserWarning: The prefix is not set in metric class MpiiPCKAccuracy. + >>> mpii_pck_metric.process(data_batch, data_samples) + >>> mpii_pck_metric.evaluate(1) + 10/26 17:43:39 - mmengine - INFO - Evaluating MpiiPCKAccuracy (normalized by ``"head_size"``)... # noqa + {'Head PCK': 100.0, 'Shoulder PCK': 100.0, 'Elbow PCK': 100.0, + 'Wrist PCK': 100.0, 'Hip PCK': 100.0, 'Knee PCK': 100.0, + 'Ankle PCK': 100.0, 'PCK': 100.0, 'PCK@0.1': 100.0} """ - default_prefix: Optional[str] = 'pck' def __init__(self, thr: float = 0.5, @@ -226,6 +285,17 @@ def compute_metrics(self, results: list) -> Dict[str, float]: Returns: Dict[str, float]: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. 
+ If `'head'` in `self.norm_item`, the returned results are the pck + accuracy normalized by `head_size`, which have the following keys: + - 'Head PCK': The PCK of head + - 'Shoulder PCK': The PCK of shoulder + - 'Elbow PCK': The PCK of elbow + - 'Wrist PCK': The PCK of wrist + - 'Hip PCK': The PCK of hip + - 'Knee PCK': The PCK of knee + - 'Ankle PCK': The PCK of ankle + - 'PCK': The mean PCK over all keypoints + - 'PCK@0.1': The mean PCK at threshold 0.1 """ logger: MMLogger = MMLogger.get_current_instance() @@ -241,8 +311,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # convert 0-based index to 1-based index pred_coords = pred_coords + 1.0 - metrics = super().compute_metrics(results) - + metrics = {} if 'head' in self.norm_item: norm_size_head = np.concatenate( [result['head_size'] for result in results]) @@ -281,18 +350,17 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # lkne 4 rkne 1 # lank 5 rank 0 stats = { - 'Head': PCKh[9], - 'Shoulder': 0.5 * (PCKh[13] + PCKh[12]), - 'Elbow': 0.5 * (PCKh[14] + PCKh[11]), - 'Wrist': 0.5 * (PCKh[15] + PCKh[10]), - 'Hip': 0.5 * (PCKh[3] + PCKh[2]), - 'Knee': 0.5 * (PCKh[4] + PCKh[1]), - 'Ankle': 0.5 * (PCKh[5] + PCKh[0]), - 'PCKh': np.sum(PCKh * jnt_ratio), - 'PCKh@0.1': np.sum(pckAll[10, :] * jnt_ratio) + 'Head PCK': PCKh[9], + 'Shoulder PCK': 0.5 * (PCKh[13] + PCKh[12]), + 'Elbow PCK': 0.5 * (PCKh[14] + PCKh[11]), + 'Wrist PCK': 0.5 * (PCKh[15] + PCKh[10]), + 'Hip PCK': 0.5 * (PCKh[3] + PCKh[2]), + 'Knee PCK': 0.5 * (PCKh[4] + PCKh[1]), + 'Ankle PCK': 0.5 * (PCKh[5] + PCKh[0]), + 'PCK': np.sum(PCKh * jnt_ratio), + 'PCK@0.1': np.sum(pckAll[10, :] * jnt_ratio) } - del metrics[f'PCKh@{self.thr}'] for stats_name, stat in stats.items(): metrics[stats_name] = stat @@ -328,8 +396,39 @@ class JhmdbPCKAccuracy(PCKAccuracy): names to disambiguate homonymous metrics of different evaluators. If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. 
+ + Examples: + + >>> from mmpose.evaluation.metrics import JhmdbPCKAccuracy + >>> import numpy as np + >>> from mmengine.structures import InstanceData + >>> num_keypoints = 15 + >>> keypoints = np.random.random((1, num_keypoints, 2)) * 10 + >>> gt_instances = InstanceData() + >>> gt_instances.keypoints = keypoints + >>> gt_instances.keypoints_visible = np.ones( + ... (1, num_keypoints, 1)).astype(bool) + >>> gt_instances.bboxes = np.random.random((1, 4)) * 20 + >>> gt_instances.head_size = np.random.random((1, 1)) * 10 + >>> pred_instances = InstanceData() + >>> pred_instances.keypoints = keypoints + >>> data_sample = { + ... 'gt_instances': gt_instances.to_dict(), + ... 'pred_instances': pred_instances.to_dict(), + ... } + >>> data_samples = [data_sample] + >>> data_batch = [{'inputs': None}] + >>> jhmdb_pck_metric = JhmdbPCKAccuracy(thr=0.2, norm_item=['bbox', 'torso']) + ... UserWarning: The prefix is not set in metric class JhmdbPCKAccuracy. + >>> jhmdb_pck_metric.process(data_batch, data_samples) + >>> jhmdb_pck_metric.evaluate(1) + 10/26 17:48:09 - mmengine - INFO - Evaluating JhmdbPCKAccuracy (normalized by ``"bbox_size"``)... # noqa + 10/26 17:48:09 - mmengine - INFO - Evaluating JhmdbPCKAccuracy (normalized by ``"torso_size"``)... # noqa + {'Head PCK': 1.0, 'Sho PCK': 1.0, 'Elb PCK': 1.0, 'Wri PCK': 1.0, + 'Hip PCK': 1.0, 'Knee PCK': 1.0, 'Ank PCK': 1.0, 'PCK': 1.0, + 'Head tPCK': 1.0, 'Sho tPCK': 1.0, 'Elb tPCK': 1.0, 'Wri tPCK': 1.0, + 'Hip tPCK': 1.0, 'Knee tPCK': 1.0, 'Ank tPCK': 1.0, 'tPCK': 1.0} """ - default_prefix: Optional[str] = 'pck' def __init__(self, thr: float = 0.05, @@ -351,6 +450,26 @@ def compute_metrics(self, results: list) -> Dict[str, float]: Returns: Dict[str, float]: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. 
+ If `'bbox'` in `self.norm_item`, the returned results are the pck + accuracy normalized by `bbox_size`, which have the following keys: + - 'Head PCK': The PCK of head + - 'Sho PCK': The PCK of shoulder + - 'Elb PCK': The PCK of elbow + - 'Wri PCK': The PCK of wrist + - 'Hip PCK': The PCK of hip + - 'Knee PCK': The PCK of knee + - 'Ank PCK': The PCK of ankle + - 'PCK': The mean PCK over all keypoints + If `'torso'` in `self.norm_item`, the returned results are the pck + accuracy normalized by `torso_size`, which have the following keys: + - 'Head tPCK': The PCK of head + - 'Sho tPCK': The PCK of shoulder + - 'Elb tPCK': The PCK of elbow + - 'Wri tPCK': The PCK of wrist + - 'Hip tPCK': The PCK of hip + - 'Knee tPCK': The PCK of knee + - 'Ank tPCK': The PCK of ankle + - 'tPCK': The mean PCK over all keypoints """ logger: MMLogger = MMLogger.get_current_instance() @@ -362,8 +481,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # mask: [N, K] mask = np.concatenate([result['mask'] for result in results]) - metrics = super().compute_metrics(results) - + metrics = dict() if 'bbox' in self.norm_item: norm_size_bbox = np.concatenate( [result['bbox_size'] for result in results]) @@ -373,22 +491,19 @@ def compute_metrics(self, results: list) -> Dict[str, float]: pck_p, pck, _ = keypoint_pck_accuracy(pred_coords, gt_coords, mask, self.thr, norm_size_bbox) - metrics[f'@{self.thr}'] = pck - stats = { - 'Head': pck_p[2], - 'Sho': 0.5 * pck_p[3] + 0.5 * pck_p[4], - 'Elb': 0.5 * pck_p[7] + 0.5 * pck_p[8], - 'Wri': 0.5 * pck_p[11] + 0.5 * pck_p[12], - 'Hip': 0.5 * pck_p[5] + 0.5 * pck_p[6], - 'Knee': 0.5 * pck_p[9] + 0.5 * pck_p[10], - 'Ank': 0.5 * pck_p[13] + 0.5 * pck_p[14], - 'Mean': pck + 'Head PCK': pck_p[2], + 'Sho PCK': 0.5 * pck_p[3] + 0.5 * pck_p[4], + 'Elb PCK': 0.5 * pck_p[7] + 0.5 * pck_p[8], + 'Wri PCK': 0.5 * pck_p[11] + 0.5 * pck_p[12], + 'Hip PCK': 0.5 * pck_p[5] + 0.5 * pck_p[6], + 'Knee PCK': 0.5 * pck_p[9] + 0.5 * pck_p[10], + 'Ank PCK': 0.5 * 
pck_p[13] + 0.5 * pck_p[14], + 'PCK': pck } - del metrics[f'PCK@{self.thr}'] for stats_name, stat in stats.items(): - metrics[f'{stats_name} PCK'] = stat + metrics[stats_name] = stat if 'torso' in self.norm_item: norm_size_torso = np.concatenate( @@ -401,19 +516,18 @@ def compute_metrics(self, results: list) -> Dict[str, float]: self.thr, norm_size_torso) stats = { - 'Head': pck_p[2], - 'Sho': 0.5 * pck_p[3] + 0.5 * pck_p[4], - 'Elb': 0.5 * pck_p[7] + 0.5 * pck_p[8], - 'Wri': 0.5 * pck_p[11] + 0.5 * pck_p[12], - 'Hip': 0.5 * pck_p[5] + 0.5 * pck_p[6], - 'Knee': 0.5 * pck_p[9] + 0.5 * pck_p[10], - 'Ank': 0.5 * pck_p[13] + 0.5 * pck_p[14], - 'Mean': pck + 'Head tPCK': pck_p[2], + 'Sho tPCK': 0.5 * pck_p[3] + 0.5 * pck_p[4], + 'Elb tPCK': 0.5 * pck_p[7] + 0.5 * pck_p[8], + 'Wri tPCK': 0.5 * pck_p[11] + 0.5 * pck_p[12], + 'Hip tPCK': 0.5 * pck_p[5] + 0.5 * pck_p[6], + 'Knee tPCK': 0.5 * pck_p[9] + 0.5 * pck_p[10], + 'Ank tPCK': 0.5 * pck_p[13] + 0.5 * pck_p[14], + 'tPCK': pck } - del metrics[f'tPCK@{self.thr}'] for stats_name, stat in stats.items(): - metrics[f'{stats_name} tPCK'] = stat + metrics[stats_name] = stat return metrics @@ -443,7 +557,6 @@ class AUC(BaseMetric): If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. """ - default_prefix: Optional[str] = 'auc' def __init__(self, norm_factor: float = 30, @@ -510,7 +623,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: self.num_thrs) metrics = dict() - metrics[f'@{self.num_thrs}thrs'] = auc + metrics['AUC'] = auc return metrics @@ -535,7 +648,6 @@ class EPE(BaseMetric): If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. 
""" - default_prefix: Optional[str] = 'epe' def process(self, data_batch: Sequence[dict], data_samples: Sequence[dict]) -> None: @@ -592,7 +704,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: epe = keypoint_epe(pred_coords, gt_coords, mask) metrics = dict() - metrics['epe'] = epe + metrics['EPE'] = epe return metrics @@ -635,7 +747,6 @@ class NME(BaseMetric): If prefix is not provided in the argument, ``self.default_prefix`` will be used instead. Default: ``None``. """ - default_prefix: Optional[str] = 'nme' DEFAULT_KEYPOINT_INDICES = { # horse10: corresponding to `nose` and `eye` keypoints @@ -748,7 +859,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # normalize_factor: [N, 2] normalize_factor = np.tile(normalize_factor_, [1, 2]) nme = keypoint_nme(pred_coords, gt_coords, mask, normalize_factor) - metrics[f'@{self.norm_item}'] = nme + metrics['NME'] = nme else: if self.keypoint_indices is None: @@ -774,7 +885,7 @@ def compute_metrics(self, results: list) -> Dict[str, float]: # normalize_factor: [N, 2] normalize_factor = self._get_normalize_factor(gt_coords=gt_coords) nme = keypoint_nme(pred_coords, gt_coords, mask, normalize_factor) - metrics[f'@{self.keypoint_indices}'] = nme + metrics['NME'] = nme return metrics diff --git a/mmpose/evaluation/metrics/keypoint_partition_metric.py b/mmpose/evaluation/metrics/keypoint_partition_metric.py new file mode 100644 index 0000000000..fb30eca0d5 --- /dev/null +++ b/mmpose/evaluation/metrics/keypoint_partition_metric.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from copy import deepcopy +from typing import Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric + +from mmpose.registry import METRICS + + +@METRICS.register_module() +class KeypointPartitionMetric(BaseMetric): + """Wrapper metric for evaluating pose metric on user-defined body parts. 
+ + Sometimes one may be interested in the performance of a pose model on + certain body parts rather than on all the keypoints. For example, + ``CocoWholeBodyMetric`` evaluates coco metric on body, foot, face, + lefthand and righthand. However, ``CocoWholeBodyMetric`` cannot be + applied to arbitrary custom datasets. This wrapper metric solves this + problem. + + Supported metrics: + ``CocoMetric`` Note 1: all keypoint ground truth should be stored in + `keypoints` not other data fields. Note 2: `ann_file` is not + supported, it will be ignored. Note 3: `score_mode` other than + 'bbox' may produce results different from the + ``CocoWholeBodyMetric``. Note 4: `nms_mode` other than 'none' may + produce results different from the ``CocoWholeBodyMetric``. + ``PCKAccuracy`` Note 1: data fields required by ``PCKAccuracy`` should + be provided, such as bbox, head_size, etc. Note 2: In terms of + 'torso', since it is specifically designed for ``JhmdbDataset``, it is + not recommended to use it for other datasets. + ``AUC`` supported without limitations. + ``EPE`` supported without limitations. + ``NME`` only `norm_mode` = 'use_norm_item' is supported, + 'keypoint_distance' is incompatible with ``KeypointPartitionMetric``. + + Incompatible metrics: + The following metrics are dataset specific metrics: + ``CocoWholeBodyMetric`` + ``MpiiPCKAccuracy`` + ``JhmdbPCKAccuracy`` + ``PoseTrack18Metric`` + Keypoint partitioning is included in these metrics. + + Args: + metric (dict): arguments to instantiate a metric, please refer to the + arguments required by the metric of your choice. + partitions (dict): definition of body partitions. For example, if we + have 10 keypoints in total, the first 7 keypoints belong to body + and the last 3 keypoints belong to foot, this field can be like + this: + dict( + body=[0, 1, 2, 3, 4, 5, 6], + foot=[7, 8, 9], + all=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + ) + where the numbers are the indices of keypoints and they can be + discontinuous.
+ """ + + def __init__( + self, + metric: dict, + partitions: dict, + ) -> None: + super().__init__() + # check metric type + supported_metric_types = [ + 'CocoMetric', 'PCKAccuracy', 'AUC', 'EPE', 'NME' + ] + if metric['type'] not in supported_metric_types: + raise ValueError( + 'Metrics supported by KeypointPartitionMetric are CocoMetric, ' + 'PCKAccuracy, AUC, EPE and NME, ' + f"but got {metric['type']}") + + # check CocoMetric arguments + if metric['type'] == 'CocoMetric': + if 'ann_file' in metric: + warnings.warn( + 'KeypointPartitionMetric does not support the ann_file ' + 'argument of CocoMetric, this argument will be ignored.') + metric['ann_file'] = None + score_mode = metric.get('score_mode', 'bbox_keypoint') + if score_mode != 'bbox': + warnings.warn( + 'When using KeypointPartitionMetric with CocoMetric, ' + "if score_mode is not 'bbox', pose scores will be " + "calculated part by part rather than by 'wholebody'. " + 'Therefore, this may produce results different from the ' + 'CocoWholebodyMetric.') + nms_mode = metric.get('nms_mode', 'oks_nms') + if nms_mode != 'none': + warnings.warn( + 'When using KeypointPartitionMetric with CocoMetric, ' + 'oks_nms and soft_oks_nms will be calculated part by part ' + "rather than by 'wholebody'. Therefore, this may produce " + 'results different from the CocoWholebodyMetric.') + + # check PCKAccuracy arguments + if metric['type'] == 'PCKAccuracy': + norm_item = metric.get('norm_item', 'bbox') + if norm_item == 'torso' or 'torso' in norm_item: + warnings.warn( + 'norm_item torso is used in JhmdbDataset, it may not be ' + 'compatible with other datasets, use at your own risk.') + + # check NME arguments + if metric['type'] == 'NME': + assert 'norm_mode' in metric, \ + 'Missing norm_mode required by the NME metric.' 
+ if metric['norm_mode'] != 'use_norm_item': + raise ValueError( + "NME norm_mode 'keypoint_distance' is incompatible with " + 'KeypointPartitionMetric.') + + # check partitions + assert len(partitions) > 0, 'There should be at least one partition.' + for partition_name, partition in partitions.items(): + assert isinstance(partition, Sequence), \ + 'Each partition should be a sequence.' + assert len(partition) > 0, \ + 'Each partition should have at least one element.' + self.partitions = partitions + + # instantiate metrics for each partition + self.metrics = {} + for partition_name in partitions.keys(): + _metric = deepcopy(metric) + if 'outfile_prefix' in _metric: + _metric['outfile_prefix'] = _metric[ + 'outfile_prefix'] + '.' + partition_name + self.metrics[partition_name] = METRICS.build(_metric) + + @BaseMetric.dataset_meta.setter + def dataset_meta(self, dataset_meta: dict) -> None: + """Set the dataset meta info to the metric.""" + self._dataset_meta = dataset_meta + # sigmas required by coco metric have to be split as well + for partition_name, keypoint_ids in self.partitions.items(): + _dataset_meta = deepcopy(dataset_meta) + _dataset_meta['num_keypoints'] = len(keypoint_ids) + _dataset_meta['sigmas'] = _dataset_meta['sigmas'][keypoint_ids] + self.metrics[partition_name].dataset_meta = _dataset_meta + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]) -> None: + """Split data samples by partitions, then call metric.process part by + part.""" + parted_data_samples = { + partition_name: [] + for partition_name in self.partitions.keys() + } + for data_sample in data_samples: + for partition_name, keypoint_ids in self.partitions.items(): + _data_sample = deepcopy(data_sample) + if 'keypoint_scores' in _data_sample['pred_instances']: + _data_sample['pred_instances'][ + 'keypoint_scores'] = _data_sample['pred_instances'][ + 'keypoint_scores'][:, keypoint_ids] + _data_sample['pred_instances']['keypoints'] = _data_sample[ + 
'pred_instances']['keypoints'][:, keypoint_ids] + _data_sample['gt_instances']['keypoints'] = _data_sample[ + 'gt_instances']['keypoints'][:, keypoint_ids] + _data_sample['gt_instances'][ + 'keypoints_visible'] = _data_sample['gt_instances'][ + 'keypoints_visible'][:, keypoint_ids] + + # for coco metric + if 'raw_ann_info' in _data_sample: + raw_ann_info = _data_sample['raw_ann_info'] + anns = raw_ann_info if isinstance( + raw_ann_info, list) else [raw_ann_info] + for ann in anns: + if 'keypoints' in ann: + keypoints = np.array(ann['keypoints']).reshape( + -1, 3) + keypoints = keypoints[keypoint_ids] + num_keypoints = np.sum(keypoints[:, 2] > 0) + ann['keypoints'] = keypoints.flatten().tolist() + ann['num_keypoints'] = num_keypoints + + parted_data_samples[partition_name].append(_data_sample) + + for partition_name, metric in self.metrics.items(): + metric.process(data_batch, parted_data_samples[partition_name]) + + def compute_metrics(self, results: list) -> dict: + pass + + def evaluate(self, size: int) -> dict: + """Run evaluation for each partition.""" + eval_results = OrderedDict() + for partition_name, metric in self.metrics.items(): + _eval_results = metric.evaluate(size) + for key in list(_eval_results.keys()): + new_key = partition_name + '/' + key + _eval_results[new_key] = _eval_results.pop(key) + eval_results.update(_eval_results) + return eval_results diff --git a/mmpose/evaluation/metrics/posetrack18_metric.py b/mmpose/evaluation/metrics/posetrack18_metric.py index a9e36652db..86f801455a 100644 --- a/mmpose/evaluation/metrics/posetrack18_metric.py +++ b/mmpose/evaluation/metrics/posetrack18_metric.py @@ -28,7 +28,9 @@ class PoseTrack18Metric(CocoMetric): for more details. Args: - ann_file (str): Path to the annotation file. + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. 
Defaults to None score_mode (str): The mode to score the prediction results which should be one of the following options: @@ -37,7 +39,7 @@ class PoseTrack18Metric(CocoMetric): - ``'bbox_keypoint'``: Use keypoint score to rescore the prediction results. - Defaults to ``'bbox'` + Defaults to ``'bbox_keypoint'`` keypoint_score_thr (float): The threshold of keypoint score. The keypoints with score lower than it will not be included to rescore the prediction results. Valid only when ``score_mode`` is @@ -61,22 +63,16 @@ class PoseTrack18Metric(CocoMetric): doing quantitative evaluation. This is designed for the need of test submission when the ground truth annotations are absent. If set to ``True``, ``outfile_prefix`` should specify the path to - store the output results. Default: ``False``. + store the output results. Defaults to ``False`` outfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., ``'a/b/prefix'``. - If not specified, a temp file will be created. Default: ``None``. - collect_device (str): Device name used for collecting results from - different ranks during distributed training. Must be ``'cpu'`` or - ``'gpu'``. Default: ``'cpu'``. - prefix (str, optional): The prefix that will be added in the metric - names to disambiguate homonymous metrics of different evaluators. - If prefix is not provided in the argument, ``self.default_prefix`` - will be used instead. Default: ``None``. + If not specified, a temp file will be created.
Defaults to ``None`` + **kwargs: Keyword parameters passed to :class:`mmeval.BaseMetric` """ default_prefix: Optional[str] = 'posetrack18' def __init__(self, - ann_file: str, + ann_file: Optional[str] = None, score_mode: str = 'bbox_keypoint', keypoint_score_thr: float = 0.2, nms_mode: str = 'oks_nms', @@ -90,39 +86,16 @@ def __init__(self, raise ImportError('Please install ``poseval`` package for ' 'evaluation on PoseTrack dataset ' '(see `requirements/optional.txt`)') - super(CocoMetric, self).__init__( - collect_device=collect_device, prefix=prefix) - self.ann_file = ann_file - - allowed_score_modes = ['bbox', 'bbox_keypoint'] - if score_mode not in allowed_score_modes: - raise ValueError( - "`score_mode` should be one of 'bbox', 'bbox_keypoint', " - f"'bbox_rle', but got {score_mode}") - self.score_mode = score_mode - self.keypoint_score_thr = keypoint_score_thr - - allowed_nms_modes = ['oks_nms', 'soft_oks_nms', 'none'] - if nms_mode not in allowed_nms_modes: - raise ValueError( - "`nms_mode` should be one of 'oks_nms', 'soft_oks_nms', " - f"'none', but got {nms_mode}") - self.nms_mode = nms_mode - self.nms_thr = nms_thr - - if format_only: - assert outfile_prefix is not None, '`outfile_prefix` can not be '\ - 'None when `format_only` is True, otherwise the result file '\ - 'will be saved to a temp directory which will be cleaned up '\ - 'in the end.' - else: - # do evaluation only if the ground truth annotations exist - assert 'annotations' in load(ann_file), \ - 'Ground truth annotations are required for evaluation '\ - 'when `format_only` is False.' 
- self.format_only = format_only - - self.outfile_prefix = outfile_prefix + super().__init__( + ann_file=ann_file, + score_mode=score_mode, + keypoint_score_thr=keypoint_score_thr, + nms_mode=nms_mode, + nms_thr=nms_thr, + format_only=format_only, + outfile_prefix=outfile_prefix, + collect_device=collect_device, + prefix=prefix) def results2json(self, keypoints: Dict[int, list], outfile_prefix: str) -> str: @@ -239,7 +212,7 @@ def _do_python_keypoint_eval(self, outfile_prefix: str) -> List[tuple]: stats_names = [ 'Head AP', 'Shou AP', 'Elb AP', 'Wri AP', 'Hip AP', 'Knee AP', - 'Ankl AP', 'Total AP' + 'Ankl AP', 'AP' ] info_str = list(zip(stats_names, stats)) diff --git a/mmpose/models/backbones/swin.py b/mmpose/models/backbones/swin.py index a3462931b2..a8f7c97278 100644 --- a/mmpose/models/backbones/swin.py +++ b/mmpose/models/backbones/swin.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from collections import OrderedDict from copy import deepcopy import torch @@ -668,13 +667,11 @@ def init_weights(self, pretrained=None): and self.init_cfg['type'] == 'Pretrained'): # Suppress zero_init_residual if use pretrained model. logger = get_root_logger() - _state_dict = get_state_dict( + state_dict = get_state_dict( self.init_cfg['checkpoint'], map_location='cpu') if self.convert_weights: - # supported loading weight from original repo, - _state_dict = swin_converter(_state_dict) - - state_dict = OrderedDict() + # supported loading weight from original repo + state_dict = swin_converter(state_dict) # strip prefix of state_dict if list(state_dict.keys())[0].startswith('module.'): diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py index f242a0d4d6..8b4d988a5f 100644 --- a/mmpose/models/heads/__init__.py +++ b/mmpose/models/heads/__init__.py @@ -1,12 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .base_head import BaseHead -from .heatmap_heads import (CPMHead, HeatmapHead, MSPNHead, SimCCHead, - ViPNASHead) +from .coord_cls_heads import RTMCCHead, SimCCHead +from .heatmap_heads import (AssociativeEmbeddingHead, CIDHead, CPMHead, + HeatmapHead, MSPNHead, ViPNASHead) +from .hybrid_heads import DEKRHead from .regression_heads import (DSNTHead, IntegralRegressionHead, RegressionHead, RLEHead) __all__ = [ 'BaseHead', 'HeatmapHead', 'CPMHead', 'MSPNHead', 'ViPNASHead', 'RegressionHead', 'IntegralRegressionHead', 'SimCCHead', 'RLEHead', - 'DSNTHead' + 'DSNTHead', 'AssociativeEmbeddingHead', 'DEKRHead', 'CIDHead', 'RTMCCHead' ] diff --git a/mmpose/models/heads/base_head.py b/mmpose/models/heads/base_head.py index fb174f6868..40da595051 100644 --- a/mmpose/models/heads/base_head.py +++ b/mmpose/models/heads/base_head.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings from abc import ABCMeta, abstractmethod -from typing import List, Tuple, Union +from typing import List, Sequence, Tuple, Union import torch import torch.nn.functional as F @@ -8,6 +9,7 @@ from mmengine.structures import InstanceData from torch import Tensor +from mmpose.models.utils.ops import resize from mmpose.utils.tensor_utils import to_numpy from mmpose.utils.typing import (Features, InstanceList, OptConfigType, OptSampleList, Predictions) @@ -58,14 +60,22 @@ def _get_in_channels(self) -> Union[int, List[int]]: else: in_channels = [feat_channels[i] for i in self.input_index] else: - raise (ValueError, - f'Invalid input transform mode "{self.input_transform}"') + raise ValueError( + f'Invalid input transform mode "{self.input_transform}"') return in_channels - def _transform_inputs(self, feats: Tuple[Tensor] - ) -> Union[Tensor, Tuple[Tensor]]: + def _transform_inputs( + self, + feats: Union[Tensor, Sequence[Tensor]], + ) -> Union[Tensor, Tuple[Tensor]]: """Transform multi scale features into the network input.""" + if not isinstance(feats, Sequence): + 
warnings.warn(f'the input of {self._get_name()} is a tensor ' + f'instead of a tuple or list. The argument ' + f'`input_transform` will be ignored.') + return feats + if self.input_transform == 'resize_concat': inputs = [feats[i] for i in self.input_index] resized_inputs = [ @@ -79,6 +89,12 @@ def _transform_inputs(self, feats: Tuple[Tensor] elif self.input_transform == 'select': if isinstance(self.input_index, int): inputs = feats[self.input_index] + if hasattr(self, 'upsample') and self.upsample > 0: + inputs = resize( + input=F.relu(inputs), + scale_factor=self.upsample, + mode='bilinear', + align_corners=self.align_corners) else: inputs = tuple(feats[i] for i in self.input_index) else: @@ -100,6 +116,11 @@ def decode(self, batch_outputs: Union[Tensor, decoded pose information of the instances of one data sample. """ + def _pack_and_call(args, func): + if not isinstance(args, tuple): + args = (args, ) + return func(*args) + if self.decoder is None: raise RuntimeError( f'The decoder has not been set in {self.__class__.__name__}. ' @@ -107,15 +128,16 @@ def decode(self, batch_outputs: Union[Tensor, 'enable head methods `head.predict()` and `head.decode()`') if self.decoder.support_batch_decoding: - batch_keypoints, batch_scores = self.decoder.batch_decode( - batch_outputs) + batch_keypoints, batch_scores = _pack_and_call( + batch_outputs, self.decoder.batch_decode) else: batch_output_np = to_numpy(batch_outputs, unzip=True) batch_keypoints = [] batch_scores = [] for outputs in batch_output_np: - keypoints, scores = self.decoder.decode(outputs) + keypoints, scores = _pack_and_call(outputs, + self.decoder.decode) batch_keypoints.append(keypoints) batch_scores.append(scores) diff --git a/mmpose/models/heads/coord_cls_heads/__init__.py b/mmpose/models/heads/coord_cls_heads/__init__.py new file mode 100644 index 0000000000..104ff91308 --- /dev/null +++ b/mmpose/models/heads/coord_cls_heads/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .rtmcc_head import RTMCCHead +from .simcc_head import SimCCHead + +__all__ = ['SimCCHead', 'RTMCCHead'] diff --git a/mmpose/models/heads/coord_cls_heads/rtmcc_head.py b/mmpose/models/heads/coord_cls_heads/rtmcc_head.py new file mode 100644 index 0000000000..8892abffe3 --- /dev/null +++ b/mmpose/models/heads/coord_cls_heads/rtmcc_head.py @@ -0,0 +1,313 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Tuple, Union + +import torch +from mmengine.structures import PixelData +from torch import Tensor, nn + +from mmpose.evaluation.functional import simcc_pck_accuracy +from mmpose.models.utils.rtmcc_block import RTMCCBlock, ScaleNorm +from mmpose.models.utils.tta import flip_vectors +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.tensor_utils import to_numpy +from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, + OptSampleList) +from ..base_head import BaseHead + +OptIntSeq = Optional[Sequence[int]] + + +@MODELS.register_module() +class RTMCCHead(BaseHead): + """Top-down head introduced in RTMPose (2023). The head is composed of a + large-kernel convolutional layer, a fully-connected layer and a Gated + Attention Unit to generate 1d representation from low-resolution feature + maps. + + Args: + in_channels (int | sequence[int]): Number of channels in the input + feature map. + out_channels (int): Number of channels in the output heatmap. + input_size (tuple): Size of input image in shape [w, h]. + in_featuremap_size (int | sequence[int]): Size of input feature map. + simcc_split_ratio (float): Split ratio of pixels. + Default: 2.0. + final_layer_kernel_size (int): Kernel size of the convolutional layer. + Default: 1. + gau_cfg (Config): Config dict for the Gated Attention Unit. + Default: dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False). 
+ input_transform (str): Transformation of input features which should + be one of the following options: + + - ``'resize_concat'``: Resize multiple feature maps specified + by ``input_index`` to the same size as the first one and + concat these feature maps + - ``'select'``: Select feature map(s) specified by + ``input_index``. Multiple selected features will be + bundled into a tuple + + Defaults to ``'select'`` + input_index (int | sequence[int]): The feature map index used in the + input transformation. See also ``input_transform``. Defaults to -1 + align_corners (bool): `align_corners` argument of + :func:`torch.nn.functional.interpolate` used in the input + transformation. Defaults to ``False`` + loss (Config): Config of the keypoint loss. Defaults to use + :class:`KLDiscretLoss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. 
See + :attr:`default_init_cfg` for default settings + """ + + def __init__( + self, + in_channels: Union[int, Sequence[int]], + out_channels: int, + input_size: Tuple[int, int], + in_featuremap_size: Tuple[int, int], + simcc_split_ratio: float = 2.0, + final_layer_kernel_size: int = 1, + gau_cfg: ConfigType = dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False), + input_transform: str = 'select', + input_index: Union[int, Sequence[int]] = -1, + align_corners: bool = False, + loss: ConfigType = dict(type='KLDiscretLoss', use_target_weight=True), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None, + ): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.out_channels = out_channels + self.input_size = input_size + self.in_featuremap_size = in_featuremap_size + self.simcc_split_ratio = simcc_split_ratio + self.align_corners = align_corners + self.input_transform = input_transform + self.input_index = input_index + + self.loss_module = MODELS.build(loss) + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + if isinstance(in_channels, (tuple, list)): + raise ValueError( + f'{self.__class__.__name__} does not support selecting ' + 'multiple input features.') + + in_channels = self._get_in_channels() + + # Define SimCC layers + flatten_dims = self.in_featuremap_size[0] * self.in_featuremap_size[1] + + self.final_layer = nn.Conv2d( + in_channels, + out_channels, + kernel_size=final_layer_kernel_size, + stride=1, + padding=final_layer_kernel_size // 2) + self.mlp = nn.Sequential( + ScaleNorm(flatten_dims), + nn.Linear(flatten_dims, gau_cfg['hidden_dims'], bias=False)) + + W = int(self.input_size[0] * self.simcc_split_ratio) + H = int(self.input_size[1] * self.simcc_split_ratio) + + self.gau = RTMCCBlock( + self.out_channels, + 
gau_cfg['hidden_dims'], + gau_cfg['hidden_dims'], + s=gau_cfg['s'], + expansion_factor=gau_cfg['expansion_factor'], + dropout_rate=gau_cfg['dropout_rate'], + drop_path=gau_cfg['drop_path'], + attn_type='self-attn', + act_fn=gau_cfg['act_fn'], + use_rel_bias=gau_cfg['use_rel_bias'], + pos_enc=gau_cfg['pos_enc']) + + self.cls_x = nn.Linear(gau_cfg['hidden_dims'], W, bias=False) + self.cls_y = nn.Linear(gau_cfg['hidden_dims'], H, bias=False) + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor, Tensor]: + """Forward the network. + + The input is multi scale feature maps and the + output is the heatmap. + + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + pred_x (Tensor): 1d representation of x. + pred_y (Tensor): 1d representation of y. + """ + feats = self._transform_inputs(feats) + + feats = self.final_layer(feats) # -> B, K, H, W + + # flatten the output heatmap + feats = torch.flatten(feats, 2) + + feats = self.mlp(feats) # -> B, K, hidden + + feats = self.gau(feats) + + pred_x = self.cls_x(feats) + pred_y = self.cls_y(feats) + + return pred_x, pred_y + + def predict( + self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + test_cfg: OptConfigType = {}, + ) -> InstanceList: + """Predict results from features. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-stage features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. 
Defaults + to {} + + Returns: + List[InstanceData]: The pose predictions, each contains + the following fields: + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + - keypoint_x_labels (np.ndarray, optional): The predicted 1-D + intensity distribution in the x direction + - keypoint_y_labels (np.ndarray, optional): The predicted 1-D + intensity distribution in the y direction + """ + + if test_cfg.get('flip_test', False): + # TTA: flip test -> feats = [orig, flipped] + assert isinstance(feats, list) and len(feats) == 2 + flip_indices = batch_data_samples[0].metainfo['flip_indices'] + _feats, _feats_flip = feats + + _batch_pred_x, _batch_pred_y = self.forward(_feats) + + _batch_pred_x_flip, _batch_pred_y_flip = self.forward(_feats_flip) + _batch_pred_x_flip, _batch_pred_y_flip = flip_vectors( + _batch_pred_x_flip, + _batch_pred_y_flip, + flip_indices=flip_indices) + + batch_pred_x = (_batch_pred_x + _batch_pred_x_flip) * 0.5 + batch_pred_y = (_batch_pred_y + _batch_pred_y_flip) * 0.5 + else: + batch_pred_x, batch_pred_y = self.forward(feats) + + preds = self.decode((batch_pred_x, batch_pred_y)) + + if test_cfg.get('output_heatmaps', False): + B, K, _ = batch_pred_x.shape + # B, K, Wx -> B, K, 1, Wx + x = batch_pred_x.reshape(B, K, 1, -1) + # B, K, Wy -> B, K, Wy, 1 + y = batch_pred_y.reshape(B, K, -1, 1) + # B, K, Wy, Wx + batch_heatmaps = torch.matmul(y, x) + pred_fields = [ + PixelData(heatmaps=hm) for hm in batch_heatmaps.detach() + ] + + for pred_instances, pred_x, pred_y in zip(preds, + to_numpy(batch_pred_x), + to_numpy(batch_pred_y)): + + pred_instances.keypoint_x_labels = pred_x[None] + pred_instances.keypoint_y_labels = pred_y[None] + + return preds, pred_fields + else: + return preds + + def loss( + self, + feats: Tuple[Tensor], + batch_data_samples:
OptSampleList, + train_cfg: OptConfigType = {}, + ) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + + pred_x, pred_y = self.forward(feats) + + gt_x = torch.cat([ + d.gt_instance_labels.keypoint_x_labels for d in batch_data_samples + ], + dim=0) + gt_y = torch.cat([ + d.gt_instance_labels.keypoint_y_labels for d in batch_data_samples + ], + dim=0) + keypoint_weights = torch.cat( + [ + d.gt_instance_labels.keypoint_weights + for d in batch_data_samples + ], + dim=0, + ) + + pred_simcc = (pred_x, pred_y) + gt_simcc = (gt_x, gt_y) + + # calculate losses + losses = dict() + loss = self.loss_module(pred_simcc, gt_simcc, keypoint_weights) + + losses.update(loss_kpt=loss) + + # calculate accuracy + _, avg_acc, _ = simcc_pck_accuracy( + output=to_numpy(pred_simcc), + target=to_numpy(gt_simcc), + simcc_split_ratio=self.simcc_split_ratio, + mask=to_numpy(keypoint_weights) > 0, + ) + + acc_pose = torch.tensor(avg_acc, device=gt_x.device) + losses.update(acc_pose=acc_pose) + + return losses + + @property + def default_init_cfg(self): + init_cfg = [ + dict(type='Normal', layer=['Conv2d'], std=0.001), + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Normal', layer=['Linear'], std=0.01, bias=0), + ] + return init_cfg diff --git a/mmpose/models/heads/heatmap_heads/simcc_head.py b/mmpose/models/heads/coord_cls_heads/simcc_head.py similarity index 96% rename from mmpose/models/heads/heatmap_heads/simcc_head.py rename to mmpose/models/heads/coord_cls_heads/simcc_head.py index 1a3e19f625..b56f43e363 100644 --- a/mmpose/models/heads/heatmap_heads/simcc_head.py +++ b/mmpose/models/heads/coord_cls_heads/simcc_head.py @@ -3,6 +3,7 @@ import torch from mmcv.cnn import build_conv_layer +from mmengine.structures import PixelData from torch import Tensor, nn from mmpose.evaluation.functional import simcc_pck_accuracy @@ -204,6 +205,7 @@ def _make_deconv_head(self, input_transform: str = 'select', input_index: Union[int, Sequence[int]] = -1, 
align_corners: bool = False) -> nn.Module: + """Create deconvolutional layers by given parameters.""" if deconv_type == 'heatmap': deconv_head = MODELS.build( @@ -315,6 +317,17 @@ def predict( preds = self.decode((batch_pred_x, batch_pred_y)) if test_cfg.get('output_heatmaps', False): + B, K, _ = batch_pred_x.shape + # B, K, Wx -> B, K, 1, Wx + x = batch_pred_x.reshape(B, K, 1, -1) + # B, K, Wy -> B, K, Wy, 1 + y = batch_pred_y.reshape(B, K, -1, 1) + # B, K, Wy, Wx + batch_heatmaps = torch.matmul(y, x) + pred_fields = [ + PixelData(heatmaps=hm) for hm in batch_heatmaps.detach() + ] + for pred_instances, pred_x, pred_y in zip(preds, to_numpy(batch_pred_x), to_numpy(batch_pred_y)): @@ -322,7 +335,9 @@ def predict( pred_instances.keypoint_x_labels = pred_x[None] pred_instances.keypoint_y_labels = pred_y[None] - return preds + return preds, pred_fields + else: + return preds def loss( self, diff --git a/mmpose/models/heads/heatmap_heads/__init__.py b/mmpose/models/heads/heatmap_heads/__init__.py index 37302a018e..b482216b36 100644 --- a/mmpose/models/heads/heatmap_heads/__init__.py +++ b/mmpose/models/heads/heatmap_heads/__init__.py @@ -1,8 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .ae_head import AssociativeEmbeddingHead +from .cid_head import CIDHead from .cpm_head import CPMHead from .heatmap_head import HeatmapHead from .mspn_head import MSPNHead -from .simcc_head import SimCCHead from .vipnas_head import ViPNASHead -__all__ = ['HeatmapHead', 'CPMHead', 'MSPNHead', 'ViPNASHead', 'SimCCHead'] +__all__ = [ + 'HeatmapHead', 'CPMHead', 'MSPNHead', 'ViPNASHead', + 'AssociativeEmbeddingHead', 'CIDHead' +] diff --git a/mmpose/models/heads/heatmap_heads/ae_head.py b/mmpose/models/heads/heatmap_heads/ae_head.py new file mode 100644 index 0000000000..2321d56a99 --- /dev/null +++ b/mmpose/models/heads/heatmap_heads/ae_head.py @@ -0,0 +1,297 @@ +# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple, Union + +import torch +from mmengine.structures import PixelData +from mmengine.utils import is_list_of +from torch import Tensor + +from mmpose.models.utils.tta import aggregate_heatmaps, flip_heatmaps +from mmpose.registry import MODELS +from mmpose.utils.typing import (ConfigType, Features, OptConfigType, + OptSampleList, Predictions) +from .heatmap_head import HeatmapHead + +OptIntSeq = Optional[Sequence[int]] + + +@MODELS.register_module() +class AssociativeEmbeddingHead(HeatmapHead): + + def __init__(self, + in_channels: Union[int, Sequence[int]], + num_keypoints: int, + tag_dim: int = 1, + tag_per_keypoint: bool = True, + deconv_out_channels: OptIntSeq = (256, 256, 256), + deconv_kernel_sizes: OptIntSeq = (4, 4, 4), + conv_out_channels: OptIntSeq = None, + conv_kernel_sizes: OptIntSeq = None, + has_final_layer: bool = True, + input_transform: str = 'select', + input_index: Union[int, Sequence[int]] = -1, + align_corners: bool = False, + keypoint_loss: ConfigType = dict(type='KeypointMSELoss'), + tag_loss: ConfigType = dict(type='AssociativeEmbeddingLoss'), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None): + + if tag_per_keypoint: + out_channels = num_keypoints * (1 + tag_dim) + else: + out_channels = num_keypoints + tag_dim + + loss = dict( + type='CombinedLoss', + losses=dict(keypoint_loss=keypoint_loss, tag_loss=tag_loss)) + + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deconv_out_channels=deconv_out_channels, + deconv_kernel_sizes=deconv_kernel_sizes, + conv_out_channels=conv_out_channels, + conv_kernel_sizes=conv_kernel_sizes, + has_final_layer=has_final_layer, + input_transform=input_transform, + input_index=input_index, + align_corners=align_corners, + loss=loss, + decoder=decoder, + init_cfg=init_cfg) + + self.num_keypoints = num_keypoints + self.tag_dim = tag_dim + self.tag_per_keypoint = tag_per_keypoint + + def predict(self, + feats: Features, + 
batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from features. + + Args: + feats (Features): The features which could be in following forms: + + - Tuple[Tensor]: multi-stage features from the backbone + - List[Tuple[Tensor]]: multiple features for TTA where either + `flip_test` or `multiscale_test` is applied + - List[List[Tuple[Tensor]]]: multiple features for TTA where + both `flip_test` and `multiscale_test` are applied + + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. Defaults + to {} + + Returns: + Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If + ``test_cfg['output_heatmaps']==True``, return both pose and heatmap + prediction; otherwise only return the pose prediction. + + The pose prediction is a list of ``InstanceData``, each contains + the following fields: + + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + + The heatmap prediction is a list of ``PixelData``, each contains + the following fields: + + - heatmaps (Tensor): The predicted heatmaps in shape (K, h, w) + """ + # test configs + multiscale_test = test_cfg.get('multiscale_test', False) + flip_test = test_cfg.get('flip_test', False) + shift_heatmap = test_cfg.get('shift_heatmap', False) + align_corners = test_cfg.get('align_corners', False) + restore_heatmap_size = test_cfg.get('restore_heatmap_size', False) + output_heatmaps = test_cfg.get('output_heatmaps', False) + + # enable multi-scale test + if multiscale_test: + # TTA: multi-scale test + assert is_list_of(feats, list if flip_test else tuple) + else: + assert is_list_of(feats, tuple if flip_test else Tensor) + feats = [feats] + + # resize heatmaps to align with input size + if
restore_heatmap_size: + img_shape = batch_data_samples[0].metainfo['img_shape'] + assert all(d.metainfo['img_shape'] == img_shape + for d in batch_data_samples) + img_h, img_w = img_shape + heatmap_size = (img_w, img_h) + else: + heatmap_size = None + + multiscale_heatmaps = [] + multiscale_tags = [] + + for scale_idx, _feats in enumerate(feats): + if not flip_test: + _heatmaps, _tags = self.forward(_feats) + + else: + # TTA: flip test + assert isinstance(_feats, list) and len(_feats) == 2 + flip_indices = batch_data_samples[0].metainfo['flip_indices'] + # original + _feats_orig, _feats_flip = _feats + _heatmaps_orig, _tags_orig = self.forward(_feats_orig) + + # flipped + _heatmaps_flip, _tags_flip = self.forward(_feats_flip) + _heatmaps_flip = flip_heatmaps( + _heatmaps_flip, + flip_mode='heatmap', + flip_indices=flip_indices, + shift_heatmap=shift_heatmap) + _tags_flip = self._flip_tags( + _tags_flip, + flip_indices=flip_indices, + shift_heatmap=shift_heatmap) + + # aggregated heatmaps + _heatmaps = aggregate_heatmaps( + [_heatmaps_orig, _heatmaps_flip], + size=heatmap_size, + align_corners=align_corners, + mode='average') + + # aggregated tags (only at original scale) + if scale_idx == 0: + _tags = aggregate_heatmaps([_tags_orig, _tags_flip], + size=heatmap_size, + align_corners=align_corners, + mode='concat') + else: + _tags = None + + multiscale_heatmaps.append(_heatmaps) + multiscale_tags.append(_tags) + + # aggregate multi-scale heatmaps + if len(feats) > 1: + batch_heatmaps = aggregate_heatmaps( + multiscale_heatmaps, + align_corners=align_corners, + mode='average') + else: + batch_heatmaps = multiscale_heatmaps[0] + # only keep tags at original scale + batch_tags = multiscale_tags[0] + + batch_outputs = tuple([batch_heatmaps, batch_tags]) + preds = self.decode(batch_outputs) + + if output_heatmaps: + pred_fields = [] + for _heatmaps, _tags in zip(batch_heatmaps.detach(), + batch_tags.detach()): + pred_fields.append(PixelData(heatmaps=_heatmaps, 
tags=_tags)) + + return preds, pred_fields + else: + return preds + + def _flip_tags(self, + tags: Tensor, + flip_indices: List[int], + shift_heatmap: bool = True): + """Flip the tagging heatmaps horizontally for test-time augmentation. + + Args: + tags (Tensor): batched tagging heatmaps to flip + flip_indices (List[int]): The indices of each keypoint's symmetric + keypoint + shift_heatmap (bool): Shift the flipped heatmaps to align with the + original heatmaps and improve accuracy. Defaults to ``True`` + + Returns: + Tensor: flipped tagging heatmaps + """ + B, C, H, W = tags.shape + K = self.num_keypoints + L = self.tag_dim + + tags = tags.flip(-1) + + if self.tag_per_keypoint: + assert C == K * L + tags = tags.view(B, L, K, H, W) + tags = tags[:, :, flip_indices] + tags = tags.view(B, C, H, W) + + if shift_heatmap: + tags[..., 1:] = tags[..., :-1].clone() + + return tags + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor, Tensor]: + """Forward the network. The input is multi scale feature maps and the + output is the heatmaps and tags. + + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + tuple: + - heatmaps (Tensor): output heatmaps + - tags (Tensor): output tags + """ + + output = super().forward(feats) + heatmaps = output[:, :self.num_keypoints] + tags = output[:, self.num_keypoints:] + return heatmaps, tags + + def loss(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + feats (Tuple[Tensor]): The multi-stage features + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + train_cfg (dict): The runtime config for training process. + Defaults to {} + + Returns: + dict: A dictionary of losses. 
+ """ + pred_heatmaps, pred_tags = self.forward(feats) + + if not self.tag_per_keypoint: + pred_tags = pred_tags.repeat((1, self.num_keypoints, 1, 1)) + + gt_heatmaps = torch.stack( + [d.gt_fields.heatmaps for d in batch_data_samples]) + gt_masks = torch.stack( + [d.gt_fields.heatmap_mask for d in batch_data_samples]) + keypoint_weights = torch.cat([ + d.gt_instance_labels.keypoint_weights for d in batch_data_samples + ]) + keypoint_indices = [ + d.gt_instance_labels.keypoint_indices for d in batch_data_samples + ] + + loss_kpt = self.loss_module.keypoint_loss(pred_heatmaps, gt_heatmaps, + keypoint_weights, gt_masks) + + loss_pull, loss_push = self.loss_module.tag_loss( + pred_tags, keypoint_indices) + + losses = { + 'loss_kpt': loss_kpt, + 'loss_pull': loss_pull, + 'loss_push': loss_push + } + + return losses diff --git a/mmpose/models/heads/heatmap_heads/cid_head.py b/mmpose/models/heads/heatmap_heads/cid_head.py new file mode 100644 index 0000000000..67c0283e0d --- /dev/null +++ b/mmpose/models/heads/heatmap_heads/cid_head.py @@ -0,0 +1,774 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Dict, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_conv_layer +from mmengine.model import BaseModule, ModuleDict +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmpose.models.utils.tta import flip_heatmaps +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.typing import (ConfigType, Features, OptConfigType, + OptSampleList, Predictions) +from ..base_head import BaseHead + + +def smooth_heatmaps(heatmaps: Tensor, blur_kernel_size: int) -> Tensor: + """Smooth the heatmaps by blurring and averaging. + + Args: + heatmaps (Tensor): The heatmaps to smooth. + blur_kernel_size (int): The kernel size for blurring the heatmaps. + + Returns: + Tensor: The smoothed heatmaps. 
+ """ + smoothed_heatmaps = torch.nn.functional.avg_pool2d( + heatmaps, blur_kernel_size, 1, (blur_kernel_size - 1) // 2) + smoothed_heatmaps = (heatmaps + smoothed_heatmaps) / 2.0 + return smoothed_heatmaps + + +class TruncSigmoid(nn.Sigmoid): + """A sigmoid activation function that truncates the output to the given + range. + + Args: + min (float, optional): The minimum value to clamp the output to. + Defaults to 0.0 + max (float, optional): The maximum value to clamp the output to. + Defaults to 1.0 + """ + + def __init__(self, min: float = 0.0, max: float = 1.0): + super(TruncSigmoid, self).__init__() + self.min = min + self.max = max + + def forward(self, input: Tensor) -> Tensor: + """Computes the truncated sigmoid activation of the input tensor.""" + output = torch.sigmoid(input) + output = output.clamp(min=self.min, max=self.max) + return output + + +class IIAModule(BaseModule): + """Instance Information Abstraction module introduced in `CID`. This module + extracts the feature representation vectors for each instance. + + Args: + in_channels (int): Number of channels in the input feature tensor + out_channels (int): Number of channels of the output heatmaps + clamp_delta (float, optional): A small value that prevents the sigmoid + activation from becoming saturated. Defaults to 1e-4. + init_cfg (Config, optional): Config to control the initialization. 
See + :attr:`default_init_cfg` for default settings + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + clamp_delta: float = 1e-4, + init_cfg: OptConfigType = None, + ): + super().__init__(init_cfg=init_cfg) + + self.keypoint_root_conv = build_conv_layer( + dict( + type='Conv2d', + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1)) + self.sigmoid = TruncSigmoid(min=clamp_delta, max=1 - clamp_delta) + + def forward(self, feats: Tensor): + heatmaps = self.keypoint_root_conv(feats) + heatmaps = self.sigmoid(heatmaps) + return heatmaps + + def _sample_feats(self, feats: Tensor, indices: Tensor) -> Tensor: + """Extract feature vectors at the specified indices from the input + feature map. + + Args: + feats (Tensor): Input feature map. + indices (Tensor): Indices of the feature vectors to extract. + + Returns: + Tensor: Extracted feature vectors. + """ + assert indices.dtype == torch.long + if indices.shape[1] == 3: + b, w, h = [ind.squeeze(-1) for ind in indices.split(1, -1)] + instance_feats = feats[b, :, h, w] + elif indices.shape[1] == 2: + w, h = [ind.squeeze(-1) for ind in indices.split(1, -1)] + instance_feats = feats[:, :, h, w] + instance_feats = instance_feats.permute(0, 2, 1) + instance_feats = instance_feats.reshape(-1, + instance_feats.shape[-1]) + + else: + raise ValueError(f'`indices` should have 2 or 3 channels, ' + f'but got f{indices.shape[1]}') + return instance_feats + + def _hierarchical_pool(self, heatmaps: Tensor) -> Tensor: + """Conduct max pooling on the input heatmaps with different kernel size + according to the input size. + + Args: + heatmaps (Tensor): Input heatmaps. + + Returns: + Tensor: Result of hierarchical pooling. 
+ """ + map_size = (heatmaps.shape[-1] + heatmaps.shape[-2]) / 2.0 + if map_size > 300: + maxm = torch.nn.functional.max_pool2d(heatmaps, 7, 1, 3) + elif map_size > 200: + maxm = torch.nn.functional.max_pool2d(heatmaps, 5, 1, 2) + else: + maxm = torch.nn.functional.max_pool2d(heatmaps, 3, 1, 1) + return maxm + + def forward_train(self, feats: Tensor, instance_coords: Tensor, + instance_imgids: Tensor) -> Tuple[Tensor, Tensor]: + """Forward pass during training. + + Args: + feats (Tensor): Input feature tensor. + instance_coords (Tensor): Coordinates of the instance roots. + instance_imgids (Tensor): Sample indices of each instances + in the batch. + + Returns: + Tuple[Tensor, Tensor]: Extracted feature vectors and heatmaps + for the instances. + """ + heatmaps = self.forward(feats) + indices = torch.cat((instance_imgids[:, None], instance_coords), dim=1) + instance_feats = self._sample_feats(feats, indices) + + return instance_feats, heatmaps + + def forward_test( + self, feats: Tensor, test_cfg: Dict + ) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + """Forward pass during testing. + + Args: + feats (Tensor): Input feature tensor. + test_cfg (Dict): Testing configuration, including: + - blur_kernel_size (int, optional): Kernel size for blurring + the heatmaps. Defaults to 3. + - max_instances (int, optional): Maximum number of instances + to extract. Defaults to 30. + - score_threshold (float, optional): Minimum score for + extracting an instance. Defaults to 0.01. + - flip_test (bool, optional): Whether to compute the average + of the heatmaps across the batch dimension. + Defaults to False. + + Returns: + A tuple of Tensor including extracted feature vectors, + coordinates, and scores of the instances. Any of these can be + empty Tensor if no instances are extracted. 
+ """ + blur_kernel_size = test_cfg.get('blur_kernel_size', 3) + max_instances = test_cfg.get('max_instances', 30) + score_threshold = test_cfg.get('score_threshold', 0.01) + H, W = feats.shape[-2:] + + # compute heatmaps + heatmaps = self.forward(feats).narrow(1, -1, 1) + if test_cfg.get('flip_test', False): + heatmaps = heatmaps.mean(dim=0, keepdims=True) + smoothed_heatmaps = smooth_heatmaps(heatmaps, blur_kernel_size) + + # decode heatmaps + maximums = self._hierarchical_pool(smoothed_heatmaps) + maximums = torch.eq(maximums, smoothed_heatmaps).float() + maximums = (smoothed_heatmaps * maximums).reshape(-1) + scores, pos_ind = maximums.topk(max_instances, dim=0) + select_ind = (scores > (score_threshold)).nonzero().squeeze(1) + scores, pos_ind = scores[select_ind], pos_ind[select_ind] + + # sample feature vectors from feature map + instance_coords = torch.stack((pos_ind % W, pos_ind // W), dim=1) + instance_feats = self._sample_feats(feats, instance_coords) + + return instance_feats, instance_coords, scores + + +class ChannelAttention(nn.Module): + """Channel-wise attention module introduced in `CID`. + + Args: + in_channels (int): The number of channels of the input instance + vectors. + out_channels (int): The number of channels of the transformed instance + vectors. + """ + + def __init__(self, in_channels: int, out_channels: int): + super(ChannelAttention, self).__init__() + self.atn = nn.Linear(in_channels, out_channels) + + def forward(self, global_feats: Tensor, instance_feats: Tensor) -> Tensor: + """Applies attention to the channel dimension of the input tensor.""" + + instance_feats = self.atn(instance_feats).unsqueeze(2).unsqueeze(3) + return global_feats * instance_feats + + +class SpatialAttention(nn.Module): + """Spatial-wise attention module introduced in `CID`. + + Args: + in_channels (int): The number of channels of the input instance + vectors. + out_channels (int): The number of channels of the transformed instance + vectors. 
+ """ + + def __init__(self, in_channels, out_channels): + super(SpatialAttention, self).__init__() + self.atn = nn.Linear(in_channels, out_channels) + self.feat_stride = 4 + self.conv = nn.Conv2d(3, 1, 5, 1, 2) + + def _get_pixel_coords(self, heatmap_size: Tuple, device: str = 'cpu'): + """Get pixel coordinates for each element in the heatmap. + + Args: + heatmap_size (tuple): Size of the heatmap in (W, H) format. + device (str): Device to put the resulting tensor on. + + Returns: + Tensor of shape (num_pixels, 2) containing the pixel + coordinates for each element in the heatmap. + """ + w, h = heatmap_size + y, x = torch.meshgrid(torch.arange(h), torch.arange(w)) + pixel_coords = torch.stack((x, y), dim=-1).reshape(-1, 2) + pixel_coords = pixel_coords.float().to(device) + 0.5 + return pixel_coords + + def forward(self, global_feats: Tensor, instance_feats: Tensor, + instance_coords: Tensor) -> Tensor: + """Perform spatial attention. + + Args: + global_feats (Tensor): Tensor containing the global features. + instance_feats (Tensor): Tensor containing the instance feature + vectors. + instance_coords (Tensor): Tensor containing the root coordinates + of the instances. + + Returns: + Tensor containing the modulated global features. + """ + B, C, H, W = global_feats.size() + + instance_feats = self.atn(instance_feats).reshape(B, C, 1, 1) + feats = global_feats * instance_feats.expand_as(global_feats) + fsum = torch.sum(feats, dim=1, keepdim=True) + + pixel_coords = self._get_pixel_coords((W, H), feats.device) + relative_coords = instance_coords.reshape( + -1, 1, 2) - pixel_coords.reshape(1, -1, 2) + relative_coords = relative_coords.permute(0, 2, 1) / 32.0 + relative_coords = relative_coords.reshape(B, 2, H, W) + + input_feats = torch.cat((fsum, relative_coords), dim=1) + mask = self.conv(input_feats).sigmoid() + return global_feats * mask + + +class GFDModule(BaseModule): + """Global Feature Decoupling module introduced in `CID`.
This module + extracts the decoupled heatmaps for each instance. + + Args: + in_channels (int): Number of channels in the input feature map + out_channels (int): Number of channels of the output heatmaps + for each instance + gfd_channels (int): Number of channels in the transformed feature map + clamp_delta (float, optional): A small value that prevents the sigmoid + activation from becoming saturated. Defaults to 1e-4. + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + gfd_channels: int, + clamp_delta: float = 1e-4, + init_cfg: OptConfigType = None, + ): + super().__init__(init_cfg=init_cfg) + + self.conv_down = build_conv_layer( + dict( + type='Conv2d', + in_channels=in_channels, + out_channels=gfd_channels, + kernel_size=1)) + + self.channel_attention = ChannelAttention(in_channels, gfd_channels) + self.spatial_attention = SpatialAttention(in_channels, gfd_channels) + self.fuse_attention = build_conv_layer( + dict( + type='Conv2d', + in_channels=gfd_channels * 2, + out_channels=gfd_channels, + kernel_size=1)) + self.heatmap_conv = build_conv_layer( + dict( + type='Conv2d', + in_channels=gfd_channels, + out_channels=out_channels, + kernel_size=1)) + self.sigmoid = TruncSigmoid(min=clamp_delta, max=1 - clamp_delta) + + def forward( + self, + feats: Tensor, + instance_feats: Tensor, + instance_coords: Tensor, + instance_imgids: Tensor, + ) -> Tensor: + """Extract decoupled heatmaps for each instance. + + Args: + feats (Tensor): Input feature maps. + instance_feats (Tensor): Tensor containing the instance feature + vectors. + instance_coords (Tensor): Tensor containing the root coordinates + of the instances. + instance_imgids (Tensor): Sample indices of each instances + in the batch. + + Returns: + A tensor containing decoupled heatmaps. 
+ """ + + global_feats = self.conv_down(feats) + global_feats = global_feats[instance_imgids] + cond_instance_feats = torch.cat( + (self.channel_attention(global_feats, instance_feats), + self.spatial_attention(global_feats, instance_feats, + instance_coords)), + dim=1) + + cond_instance_feats = self.fuse_attention(cond_instance_feats) + cond_instance_feats = torch.nn.functional.relu(cond_instance_feats) + cond_instance_feats = self.heatmap_conv(cond_instance_feats) + heatmaps = self.sigmoid(cond_instance_feats) + + return heatmaps + + +@MODELS.register_module() +class CIDHead(BaseHead): + """Contextual Instance Decoupling head introduced in `Contextual Instance + Decoupling for Robust Multi-Person Pose Estimation (CID)`_ by Wang et al + (2022). The head is composed of an Instance Information Abstraction (IIA) + module and a Global Feature Decoupling (GFD) module. + + Args: + in_channels (int | Sequence[int]): Number of channels in the input + feature map + num_keypoints (int): Number of keypoints + gfd_channels (int): Number of filters in GFD module + max_train_instances (int): Maximum number of instances in a batch + during training. Defaults to 200 + input_transform (str): Transformation of input features which should + be one of the following options: + + - ``'resize_concat'``: Resize multiple feature maps specified + by ``input_index`` to the same size as the first one and + concat these feature maps + - ``'select'``: Select feature map(s) specified by + ``input_index``. Multiple selected features will be + bundled into a tuple + + Defaults to ``'select'`` + input_index (int | Sequence[int]): The feature map index used in the + input transformation. See also ``input_transform``. Defaults to -1 + align_corners (bool): `align_corners` argument of + :func:`torch.nn.functional.interpolate` used in the input + transformation. Defaults to ``False`` + heatmap_loss (Config): Config of the heatmap loss. 
Defaults to use + :class:`KeypointMSELoss` + coupled_heatmap_loss (Config): Config of the loss for coupled heatmaps. + Defaults to use :class:`FocalHeatmapLoss` + decoupled_heatmap_loss (Config): Config of the loss for decoupled + heatmaps. Defaults to use :class:`FocalHeatmapLoss` + contrastive_loss (Config): Config of the contrastive loss for + representation vectors of instances. Defaults to use + :class:`InfoNCELoss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + + .. _`CID`: https://openaccess.thecvf.com/content/CVPR2022/html/Wang_ + Contextual_Instance_Decoupling_for_Robust_Multi-Person_Pose_Estimation_ + CVPR_2022_paper.html + """ + _version = 2 + + def __init__(self, + in_channels: Union[int, Sequence[int]], + gfd_channels: int, + num_keypoints: int, + prior_prob: float = 0.01, + input_transform: str = 'select', + input_index: Union[int, Sequence[int]] = -1, + align_corners: bool = False, + coupled_heatmap_loss: OptConfigType = dict( + type='FocalHeatmapLoss'), + decoupled_heatmap_loss: OptConfigType = dict( + type='FocalHeatmapLoss'), + contrastive_loss: OptConfigType = dict(type='InfoNCELoss'), + decoder: OptConfigType = None, + init_cfg: OptConfigType = None): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.num_keypoints = num_keypoints + self.align_corners = align_corners + self.input_transform = input_transform + self.input_index = input_index + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + # Get model input channels according to feature + in_channels = self._get_in_channels() + if isinstance(in_channels, list): + raise ValueError( + f'{self.__class__.__name__} does not support
selecting ' + 'multiple input features.') + + # build sub-modules + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.iia_module = IIAModule( + in_channels, + num_keypoints + 1, + init_cfg=init_cfg + [ + dict( + type='Normal', + layer=['Conv2d', 'Linear'], + std=0.001, + override=dict( + name='keypoint_root_conv', + type='Normal', + std=0.001, + bias=bias_value)) + ]) + self.gfd_module = GFDModule( + in_channels, + num_keypoints, + gfd_channels, + init_cfg=init_cfg + [ + dict( + type='Normal', + layer=['Conv2d', 'Linear'], + std=0.001, + override=dict( + name='heatmap_conv', + type='Normal', + std=0.001, + bias=bias_value)) + ]) + + # build losses + self.loss_module = ModuleDict( + dict( + heatmap_coupled=MODELS.build(coupled_heatmap_loss), + heatmap_decoupled=MODELS.build(decoupled_heatmap_loss), + contrastive=MODELS.build(contrastive_loss), + )) + + # Register the hook to automatically convert old version state dicts + self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) + + @property + def default_init_cfg(self): + init_cfg = [ + dict(type='Normal', layer=['Conv2d', 'Linear'], std=0.001), + dict(type='Constant', layer='BatchNorm2d', val=1) + ] + return init_cfg + + def forward(self, feats: Tuple[Tensor]) -> Tensor: + """Forward the network. The input is multi scale feature maps and the + output is the heatmap. + + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + Tensor: output heatmap. 
+ """ + feats = self._transform_inputs(feats) + instance_info = self.iia_module.forward_test(feats, {}) + instance_feats, instance_coords, instance_scores = instance_info + instance_imgids = torch.zeros( + instance_coords.size(0), dtype=torch.long, device=feats.device) + instance_heatmaps = self.gfd_module(feats, instance_feats, + instance_coords, instance_imgids) + + return instance_heatmaps + + def predict(self, + feats: Features, + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from features. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-stage features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. Defaults + to {} + + Returns: + Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If + ``test_cfg['output_heatmaps']==True``, return both pose and heatmap + prediction; otherwise only return the pose prediction.
+ + The pose prediction is a list of ``InstanceData``, each contains + the following fields: + + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + + The heatmap prediction is a list of ``PixelData``, each contains + the following fields: + + - heatmaps (Tensor): The predicted heatmaps in shape (K, h, w) + """ + metainfo = batch_data_samples[0].metainfo + + if test_cfg.get('flip_test', False): + assert isinstance(feats, list) and len(feats) == 2 + + feats_flipped = flip_heatmaps( + self._transform_inputs(feats[1]), shift_heatmap=False) + feats = torch.cat( + (self._transform_inputs(feats[0]), feats_flipped)) + else: + feats = self._transform_inputs(feats) + + instance_info = self.iia_module.forward_test(feats, test_cfg) + instance_feats, instance_coords, instance_scores = instance_info + if len(instance_coords) > 0: + instance_imgids = torch.zeros( + instance_coords.size(0), dtype=torch.long, device=feats.device) + if test_cfg.get('flip_test', False): + instance_coords = torch.cat((instance_coords, instance_coords)) + instance_imgids = torch.cat( + (instance_imgids, instance_imgids + 1)) + instance_heatmaps = self.gfd_module(feats, instance_feats, + instance_coords, + instance_imgids) + if test_cfg.get('flip_test', False): + flip_indices = batch_data_samples[0].metainfo['flip_indices'] + instance_heatmaps, instance_heatmaps_flip = torch.chunk( + instance_heatmaps, 2, dim=0) + instance_heatmaps_flip = \ + instance_heatmaps_flip[:, flip_indices, :, :] + instance_heatmaps = (instance_heatmaps + + instance_heatmaps_flip) / 2.0 + instance_heatmaps = smooth_heatmaps( + instance_heatmaps, test_cfg.get('blur_kernel_size', 3)) + + preds = self.decode((instance_heatmaps, instance_scores[:, None])) + preds = InstanceData.cat(preds) + preds.keypoints[..., 0] += metainfo['input_size'][ 
+ 0] / instance_heatmaps.shape[-1] / 2.0 + preds.keypoints[..., 1] += metainfo['input_size'][ + 1] / instance_heatmaps.shape[-2] / 2.0 + preds = [preds] + + else: + preds = [ + InstanceData( + keypoints=np.empty((0, self.num_keypoints, 2)), + keypoint_scores=np.empty((0, self.num_keypoints))) + ] + instance_heatmaps = torch.empty(0, self.num_keypoints, + *feats.shape[-2:]) + + if test_cfg.get('output_heatmaps', False): + pred_fields = [ + PixelData( + heatmaps=instance_heatmaps.reshape( + -1, *instance_heatmaps.shape[-2:])) + ] + return preds, pred_fields + else: + return preds + + def loss(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + feats (Tuple[Tensor]): The multi-stage features + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + train_cfg (dict): The runtime config for training process. + Defaults to {} + + Returns: + dict: A dictionary of losses. 
+ """ + + # load targets + gt_heatmaps, gt_instance_coords, keypoint_weights = [], [], [] + heatmap_mask = [] + instance_imgids, gt_instance_heatmaps = [], [] + for i, d in enumerate(batch_data_samples): + gt_heatmaps.append(d.gt_fields.heatmaps) + gt_instance_coords.append(d.gt_instance_labels.instance_coords) + keypoint_weights.append(d.gt_instance_labels.keypoint_weights) + instance_imgids.append( + torch.ones( + len(d.gt_instance_labels.instance_coords), + dtype=torch.long) * i) + + instance_heatmaps = d.gt_fields.instance_heatmaps.reshape( + -1, self.num_keypoints, + *d.gt_fields.instance_heatmaps.shape[1:]) + gt_instance_heatmaps.append(instance_heatmaps) + + if 'heatmap_mask' in d.gt_fields: + heatmap_mask.append(d.gt_fields.heatmap_mask) + + gt_heatmaps = torch.stack(gt_heatmaps) + heatmap_mask = torch.stack(heatmap_mask) if heatmap_mask else None + + gt_instance_coords = torch.cat(gt_instance_coords, dim=0) + gt_instance_heatmaps = torch.cat(gt_instance_heatmaps, dim=0) + keypoint_weights = torch.cat(keypoint_weights, dim=0) + instance_imgids = torch.cat(instance_imgids).to(gt_heatmaps.device) + + # feed-forward + feats = self._transform_inputs(feats) + pred_instance_feats, pred_heatmaps = self.iia_module.forward_train( + feats, gt_instance_coords, instance_imgids) + + # compute contrastive loss + contrastive_loss = 0 + for i in range(len(batch_data_samples)): + pred_instance_feat = pred_instance_feats[instance_imgids == i] + contrastive_loss += self.loss_module['contrastive']( + pred_instance_feat) + contrastive_loss = contrastive_loss / max(1, len(instance_imgids)) + + # limit the number of instances + max_train_instances = train_cfg.get('max_train_instances', -1) + if (max_train_instances > 0 + and len(instance_imgids) > max_train_instances): + selected_indices = torch.randperm( + len(instance_imgids), + device=gt_heatmaps.device, + dtype=torch.long)[:max_train_instances] + gt_instance_coords = gt_instance_coords[selected_indices] + keypoint_weights =
keypoint_weights[selected_indices] + gt_instance_heatmaps = gt_instance_heatmaps[selected_indices] + instance_imgids = instance_imgids[selected_indices] + pred_instance_feats = pred_instance_feats[selected_indices] + + # calculate the decoupled heatmaps for each instance + pred_instance_heatmaps = self.gfd_module(feats, pred_instance_feats, + gt_instance_coords, + instance_imgids) + + # calculate losses + losses = { + 'loss/heatmap_coupled': + self.loss_module['heatmap_coupled'](pred_heatmaps, gt_heatmaps, + None, heatmap_mask) + } + if len(instance_imgids) > 0: + losses.update({ + 'loss/heatmap_decoupled': + self.loss_module['heatmap_decoupled'](pred_instance_heatmaps, + gt_instance_heatmaps, + keypoint_weights), + 'loss/contrastive': + contrastive_loss + }) + + return losses + + def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args, + **kwargs): + """A hook function to convert old-version state dict of + :class:`CIDHead` (before MMPose v1.0.0) to a compatible format + of :class:`CIDHead`. + + The hook will be automatically registered during initialization. 
+ """ + version = local_meta.get('version', None) + if version and version >= self._version: + return + + # convert old-version state dict + keys = list(state_dict.keys()) + for k in keys: + if 'keypoint_center_conv' in k: + v = state_dict.pop(k) + k = k.replace('keypoint_center_conv', + 'iia_module.keypoint_root_conv') + state_dict[k] = v + + if 'conv_down' in k: + v = state_dict.pop(k) + k = k.replace('conv_down', 'gfd_module.conv_down') + state_dict[k] = v + + if 'c_attn' in k: + v = state_dict.pop(k) + k = k.replace('c_attn', 'gfd_module.channel_attention') + state_dict[k] = v + + if 's_attn' in k: + v = state_dict.pop(k) + k = k.replace('s_attn', 'gfd_module.spatial_attention') + state_dict[k] = v + + if 'fuse_attn' in k: + v = state_dict.pop(k) + k = k.replace('fuse_attn', 'gfd_module.fuse_attention') + state_dict[k] = v + + if 'heatmap_conv' in k: + v = state_dict.pop(k) + k = k.replace('heatmap_conv', 'gfd_module.heatmap_conv') + state_dict[k] = v diff --git a/mmpose/models/heads/heatmap_heads/cpm_head.py b/mmpose/models/heads/heatmap_heads/cpm_head.py index e7636bae9f..44ad906e2c 100644 --- a/mmpose/models/heads/heatmap_heads/cpm_head.py +++ b/mmpose/models/heads/heatmap_heads/cpm_head.py @@ -94,7 +94,7 @@ def __init__(self, raise ValueError( '"deconv_out_channels" and "deconv_kernel_sizes" should ' 'be integer sequences with the same length. Got ' - f'unmatched values {deconv_out_channels} and ' + f'mismatched lengths {deconv_out_channels} and ' f'{deconv_kernel_sizes}') for _ in range(self.num_stages): diff --git a/mmpose/models/heads/heatmap_heads/heatmap_head.py b/mmpose/models/heads/heatmap_heads/heatmap_head.py index 859328f5db..02ca7a893a 100644 --- a/mmpose/models/heads/heatmap_heads/heatmap_head.py +++ b/mmpose/models/heads/heatmap_heads/heatmap_head.py @@ -64,6 +64,8 @@ class HeatmapHead(BaseHead): keypoint coordinates from the network output. Defaults to ``None`` init_cfg (Config, optional): Config to control the initialization. 
See :attr:`default_init_cfg` for default settings + extra (dict, optional): Extra configurations. + Defaults to ``None`` .. _`Simple Baselines`: https://arxiv.org/abs/1804.06208 """ @@ -84,7 +86,8 @@ def __init__(self, loss: ConfigType = dict( type='KeypointMSELoss', use_target_weight=True), decoder: OptConfigType = None, - init_cfg: OptConfigType = None): + init_cfg: OptConfigType = None, + extra=None): if init_cfg is None: init_cfg = self.default_init_cfg @@ -101,6 +104,21 @@ def __init__(self, self.decoder = KEYPOINT_CODECS.build(decoder) else: self.decoder = None + self.upsample = 0 + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + kernel_size = 1 + padding = 0 + if extra is not None: + if 'upsample' in extra: + self.upsample = extra['upsample'] + if 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + kernel_size = extra['final_conv_kernel'] # Get model input channels according to feature in_channels = self._get_in_channels() @@ -115,7 +133,7 @@ def __init__(self, raise ValueError( '"deconv_out_channels" and "deconv_kernel_sizes" should ' 'be integer sequences with the same length. Got ' - f'unmatched values {deconv_out_channels} and ' + f'mismatched lengths {deconv_out_channels} and ' f'{deconv_kernel_sizes}') self.deconv_layers = self._make_deconv_layers( @@ -132,8 +150,9 @@ def __init__(self, conv_kernel_sizes): raise ValueError( '"conv_out_channels" and "conv_kernel_sizes" should ' - 'be integer sequences with the same length. Got unmatched' - f' values {conv_out_channels} and {conv_kernel_sizes}') + 'be integer sequences with the same length. 
Got ' + f'mismatched lengths {conv_out_channels} and ' + f'{conv_kernel_sizes}') self.conv_layers = self._make_conv_layers( in_channels=in_channels, @@ -148,7 +167,8 @@ def __init__(self, type='Conv2d', in_channels=in_channels, out_channels=out_channels, - kernel_size=1) + padding=padding, + kernel_size=kernel_size) self.final_layer = build_conv_layer(cfg) else: self.final_layer = nn.Identity() diff --git a/mmpose/models/heads/heatmap_heads/vipnas_head.py b/mmpose/models/heads/heatmap_heads/vipnas_head.py index 7c21617254..c02976033f 100644 --- a/mmpose/models/heads/heatmap_heads/vipnas_head.py +++ b/mmpose/models/heads/heatmap_heads/vipnas_head.py @@ -116,14 +116,14 @@ def __init__(self, raise ValueError( '"deconv_out_channels" and "deconv_kernel_sizes" should ' 'be integer sequences with the same length. Got ' - f'unmatched values {deconv_out_channels} and ' + f'mismatched lengths {deconv_out_channels} and ' f'{deconv_kernel_sizes}') if deconv_num_groups is None or len(deconv_out_channels) != len( deconv_num_groups): raise ValueError( '"deconv_out_channels" and "deconv_num_groups" should ' 'be integer sequences with the same length. Got ' - f'unmatched values {deconv_out_channels} and ' + f'mismatched lengths {deconv_out_channels} and ' f'{deconv_num_groups}') self.deconv_layers = self._make_deconv_layers( @@ -141,8 +141,9 @@ def __init__(self, conv_kernel_sizes): raise ValueError( '"conv_out_channels" and "conv_kernel_sizes" should ' - 'be integer sequences with the same length. Got unmatched' - f' values {conv_out_channels} and {conv_kernel_sizes}') + 'be integer sequences with the same length. 
Got ' + f'mismatched lengths {conv_out_channels} and ' + f'{conv_kernel_sizes}') self.conv_layers = self._make_conv_layers( in_channels=in_channels, diff --git a/mmpose/models/heads/hybrid_heads/__init__.py b/mmpose/models/heads/hybrid_heads/__init__.py new file mode 100644 index 0000000000..55d5a211c1 --- /dev/null +++ b/mmpose/models/heads/hybrid_heads/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .dekr_head import DEKRHead + +__all__ = [ + 'DEKRHead', +] diff --git a/mmpose/models/heads/hybrid_heads/dekr_head.py b/mmpose/models/heads/hybrid_heads/dekr_head.py new file mode 100644 index 0000000000..56f215f939 --- /dev/null +++ b/mmpose/models/heads/hybrid_heads/dekr_head.py @@ -0,0 +1,605 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple, Union + +import torch +from mmcv.cnn import (ConvModule, build_activation_layer, build_conv_layer, + build_norm_layer) +from mmengine.model import BaseModule, ModuleDict, Sequential +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmpose.evaluation.functional.nms import nearby_joints_nms +from mmpose.models.utils.tta import flip_heatmaps +from mmpose.registry import KEYPOINT_CODECS, MODELS +from mmpose.utils.tensor_utils import to_numpy +from mmpose.utils.typing import (ConfigType, Features, InstanceList, + OptConfigType, OptSampleList, Predictions) +from ...backbones.resnet import BasicBlock +from ..base_head import BaseHead + +try: + from mmcv.ops import DeformConv2d + has_mmcv_full = True +except (ImportError, ModuleNotFoundError): + has_mmcv_full = False + + +class AdaptiveActivationBlock(BaseModule): + """Adaptive activation convolution block. "Bottom-up human pose estimation + via disentangled keypoint regression", CVPR'2021. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + groups (int): Number of groups. Generally equal to the + number of joints. 
+ norm_cfg (dict): Config for normalization layers. + act_cfg (dict): Config for activation layers. + """ + + def __init__(self, + in_channels, + out_channels, + groups=1, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(AdaptiveActivationBlock, self).__init__(init_cfg=init_cfg) + + assert in_channels % groups == 0 and out_channels % groups == 0 + self.groups = groups + + regular_matrix = torch.tensor([[-1, -1, -1, 0, 0, 0, 1, 1, 1], + [-1, 0, 1, -1, 0, 1, -1, 0, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1]]) + self.register_buffer('regular_matrix', regular_matrix.float()) + + self.transform_matrix_conv = build_conv_layer( + dict(type='Conv2d'), + in_channels=in_channels, + out_channels=6 * groups, + kernel_size=3, + padding=1, + groups=groups, + bias=True) + + if has_mmcv_full: + self.adapt_conv = DeformConv2d( + in_channels, + out_channels, + kernel_size=3, + padding=1, + bias=False, + groups=groups, + deform_groups=groups) + else: + raise ImportError('Please install the full version of mmcv ' + 'to use `DeformConv2d`.') + + self.norm = build_norm_layer(norm_cfg, out_channels)[1] + self.act = build_activation_layer(act_cfg) + + def forward(self, x): + B, _, H, W = x.size() + residual = x + + affine_matrix = self.transform_matrix_conv(x) + affine_matrix = affine_matrix.permute(0, 2, 3, 1).contiguous() + affine_matrix = affine_matrix.view(B, H, W, self.groups, 2, 3) + offset = torch.matmul(affine_matrix, self.regular_matrix) + offset = offset.transpose(4, 5).reshape(B, H, W, self.groups * 18) + offset = offset.permute(0, 3, 1, 2).contiguous() + + x = self.adapt_conv(x, offset) + x = self.norm(x) + x = self.act(x + residual) + + return x + + +class RescoreNet(BaseModule): + """Rescore net used to predict the OKS score of predicted pose. We use the + off-the-shelf rescore net pretrained by authors of DEKR. 
+ + Args: + in_channels (int): Input channels + norm_indexes (Tuple(int)): Indices of torso in skeleton + init_cfg (dict, optional): Initialization config dict + """ + + def __init__( + self, + in_channels, + norm_indexes, + init_cfg=None, + ): + super(RescoreNet, self).__init__(init_cfg=init_cfg) + + self.norm_indexes = norm_indexes + + hidden = 256 + + self.l1 = torch.nn.Linear(in_channels, hidden, bias=True) + self.l2 = torch.nn.Linear(hidden, hidden, bias=True) + self.l3 = torch.nn.Linear(hidden, 1, bias=True) + self.relu = torch.nn.ReLU() + + def make_feature(self, keypoints, keypoint_scores, skeleton): + """Combine original scores, joint distance and relative distance to + make feature. + + Args: + keypoints (torch.Tensor): predicetd keypoints + keypoint_scores (torch.Tensor): predicetd keypoint scores + skeleton (list(list(int))): joint links + + Returns: + torch.Tensor: feature for each instance + """ + joint_1, joint_2 = zip(*skeleton) + num_link = len(skeleton) + + joint_relate = (keypoints[:, joint_1] - + keypoints[:, joint_2])[:, :, :2] + joint_length = joint_relate.norm(dim=2) + + # To use the torso distance to normalize + normalize = (joint_length[:, self.norm_indexes[0]] + + joint_length[:, self.norm_indexes[1]]) / 2 + normalize = normalize.unsqueeze(1).expand(normalize.size(0), num_link) + normalize = normalize.clamp(min=1).contiguous() + + joint_length = joint_length / normalize[:, :] + joint_relate = joint_relate / normalize.unsqueeze(-1) + joint_relate = joint_relate.flatten(1) + + feature = torch.cat((joint_relate, joint_length, keypoint_scores), + dim=1).float() + return feature + + def forward(self, keypoints, keypoint_scores, skeleton): + feature = self.make_feature(keypoints, keypoint_scores, skeleton) + x = self.relu(self.l1(feature)) + x = self.relu(self.l2(x)) + x = self.l3(x) + return x.squeeze(1) + + +@MODELS.register_module() +class DEKRHead(BaseHead): + """DisEntangled Keypoint Regression head introduced in `Bottom-up human + pose 
estimation via disentangled keypoint regression`_ by Geng et al + (2021). The head is composed of a heatmap branch and a displacement branch. + + Args: + in_channels (int | Sequence[int]): Number of channels in the input + feature map + num_joints (int): Number of joints + num_heatmap_filters (int): Number of filters for heatmap branch. + Defaults to 32 + num_offset_filters_per_joint (int): Number of filters for each joint + in displacement branch. Defaults to 15 + input_transform (str): Transformation of input features which should + be one of the following options: + + - ``'resize_concat'``: Resize multiple feature maps specified + by ``input_index`` to the same size as the first one and + concat these feature maps + - ``'select'``: Select feature map(s) specified by + ``input_index``. Multiple selected features will be + bundled into a tuple + + Defaults to ``'select'`` + input_index (int | Sequence[int]): The feature map index used in the + input transformation. See also ``input_transform``. Defaults to -1 + align_corners (bool): `align_corners` argument of + :func:`torch.nn.functional.interpolate` used in the input + transformation. Defaults to ``False`` + heatmap_loss (Config): Config of the heatmap loss. Defaults to use + :class:`KeypointMSELoss` + displacement_loss (Config): Config of the displacement regression loss. + Defaults to use :class:`SoftWeightSmoothL1Loss` + decoder (Config, optional): The decoder config that controls decoding + keypoint coordinates from the network output. Defaults to ``None`` + rescore_cfg (Config, optional): The config for rescore net which + estimates OKS via predicted keypoints and keypoint scores. + Defaults to ``None`` + init_cfg (Config, optional): Config to control the initialization. See + :attr:`default_init_cfg` for default settings + + .. 
_`Bottom-up human pose estimation via disentangled keypoint regression`: + https://arxiv.org/abs/2104.02300 + """ + + _version = 2 + + def __init__(self, + in_channels: Union[int, Sequence[int]], + num_keypoints: int, + num_heatmap_filters: int = 32, + num_displacement_filters_per_keypoint: int = 15, + input_transform: str = 'select', + input_index: Union[int, Sequence[int]] = -1, + align_corners: bool = False, + heatmap_loss: ConfigType = dict( + type='KeypointMSELoss', use_target_weight=True), + displacement_loss: ConfigType = dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False), + decoder: OptConfigType = None, + rescore_cfg: OptConfigType = None, + init_cfg: OptConfigType = None): + + if init_cfg is None: + init_cfg = self.default_init_cfg + + super().__init__(init_cfg) + + self.in_channels = in_channels + self.num_keypoints = num_keypoints + self.input_transform = input_transform + self.input_index = input_index + self.align_corners = align_corners + + in_channels = self._get_in_channels() + + # build heatmap branch + self.heatmap_conv_layers = self._make_heatmap_conv_layers( + in_channels=in_channels, + out_channels=1 + num_keypoints, + num_filters=num_heatmap_filters, + ) + + # build displacement branch + self.displacement_conv_layers = self._make_displacement_conv_layers( + in_channels=in_channels, + out_channels=2 * num_keypoints, + num_filters=num_keypoints * num_displacement_filters_per_keypoint, + groups=num_keypoints) + + # build losses + self.loss_module = ModuleDict( + dict( + heatmap=MODELS.build(heatmap_loss), + displacement=MODELS.build(displacement_loss), + )) + + # build decoder + if decoder is not None: + self.decoder = KEYPOINT_CODECS.build(decoder) + else: + self.decoder = None + + # build rescore net + if rescore_cfg is not None: + self.rescore_net = RescoreNet(**rescore_cfg) + else: + self.rescore_net = None + + # Register the hook to automatically convert old version state dicts + 
self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) + + @property + def default_init_cfg(self): + init_cfg = [ + dict( + type='Normal', layer=['Conv2d', 'ConvTranspose2d'], std=0.001), + dict(type='Constant', layer='BatchNorm2d', val=1) + ] + return init_cfg + + def _make_heatmap_conv_layers(self, in_channels: int, out_channels: int, + num_filters: int): + """Create convolutional layers of heatmap branch by given + parameters.""" + layers = [ + ConvModule( + in_channels=in_channels, + out_channels=num_filters, + kernel_size=1, + norm_cfg=dict(type='BN')), + BasicBlock(num_filters, num_filters), + build_conv_layer( + dict(type='Conv2d'), + in_channels=num_filters, + out_channels=out_channels, + kernel_size=1), + ] + + return Sequential(*layers) + + def _make_displacement_conv_layers(self, in_channels: int, + out_channels: int, num_filters: int, + groups: int): + """Create convolutional layers of displacement branch by given + parameters.""" + layers = [ + ConvModule( + in_channels=in_channels, + out_channels=num_filters, + kernel_size=1, + norm_cfg=dict(type='BN')), + AdaptiveActivationBlock(num_filters, num_filters, groups=groups), + AdaptiveActivationBlock(num_filters, num_filters, groups=groups), + build_conv_layer( + dict(type='Conv2d'), + in_channels=num_filters, + out_channels=out_channels, + kernel_size=1, + groups=groups) + ] + + return Sequential(*layers) + + def forward(self, feats: Tuple[Tensor]) -> Tensor: + """Forward the network. The input is multi scale feature maps and the + output is a tuple of heatmap and displacement. + + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + Tuple[Tensor]: output heatmap and displacement. 
+ """ + x = self._transform_inputs(feats) + + heatmaps = self.heatmap_conv_layers(x) + displacements = self.displacement_conv_layers(x) + + return heatmaps, displacements + + def loss(self, + feats: Tuple[Tensor], + batch_data_samples: OptSampleList, + train_cfg: ConfigType = {}) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + feats (Tuple[Tensor]): The multi-stage features + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + train_cfg (dict): The runtime config for training process. + Defaults to {} + + Returns: + dict: A dictionary of losses. + """ + pred_heatmaps, pred_displacements = self.forward(feats) + gt_heatmaps = torch.stack( + [d.gt_fields.heatmaps for d in batch_data_samples]) + heatmap_weights = torch.stack( + [d.gt_fields.heatmap_weights for d in batch_data_samples]) + gt_displacements = torch.stack( + [d.gt_fields.displacements for d in batch_data_samples]) + displacement_weights = torch.stack( + [d.gt_fields.displacement_weights for d in batch_data_samples]) + + if 'heatmap_mask' in batch_data_samples[0].gt_fields.keys(): + heatmap_mask = torch.stack( + [d.gt_fields.heatmap_mask for d in batch_data_samples]) + else: + heatmap_mask = None + + # calculate losses + losses = dict() + heatmap_loss = self.loss_module['heatmap'](pred_heatmaps, gt_heatmaps, + heatmap_weights, + heatmap_mask) + displacement_loss = self.loss_module['displacement']( + pred_displacements, gt_displacements, displacement_weights) + + losses.update({ + 'loss/heatmap': heatmap_loss, + 'loss/displacement': displacement_loss, + }) + + return losses + + def predict(self, + feats: Features, + batch_data_samples: OptSampleList, + test_cfg: ConfigType = {}) -> Predictions: + """Predict results from features. 
+ + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-scale features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + test_cfg (dict): The runtime config for testing process. Defaults + to {} + + Returns: + Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If + ``test_cfg['output_heatmap']==True``, return both pose and heatmap + prediction; otherwise only return the pose prediction. + + The pose prediction is a list of ``InstanceData``, each contains + the following fields: + + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + + The heatmap prediction is a list of ``PixelData``, each contains + the following fields: + + - heatmaps (Tensor): The predicted heatmaps in shape (1, h, w) + or (K+1, h, w) if keypoint heatmaps are predicted + - displacements (Tensor): The predicted displacement fields + in shape (K*2, h, w) + """ + + assert len(batch_data_samples) == 1, f'DEKRHead only supports ' \ + f'prediction with batch_size 1, but got {len(batch_data_samples)}' + + multiscale_test = test_cfg.get('multiscale_test', False) + flip_test = test_cfg.get('flip_test', False) + metainfo = batch_data_samples[0].metainfo + aug_scales = [1] + + if not multiscale_test: + feats = [feats] + else: + aug_scales = aug_scales + metainfo['aug_scales'] + + heatmaps, displacements = [], [] + for feat, s in zip(feats, aug_scales): + if flip_test: + assert isinstance(feat, list) and len(feat) == 2 + flip_indices = metainfo['flip_indices'] + _feat, _feat_flip = feat + _heatmaps, _displacements = self.forward(_feat) + _heatmaps_flip, _displacements_flip = self.forward(_feat_flip) + + _heatmaps_flip = flip_heatmaps( + _heatmaps_flip, + flip_mode='heatmap', + flip_indices=flip_indices + [len(flip_indices)], 
+ shift_heatmap=test_cfg.get('shift_heatmap', False)) + _heatmaps = (_heatmaps + _heatmaps_flip) / 2.0 + + _displacements_flip = flip_heatmaps( + _displacements_flip, + flip_mode='offset', + flip_indices=flip_indices, + shift_heatmap=False) + + # this is a coordinate amendment. + x_scale_factor = s * ( + metainfo['input_size'][0] / _heatmaps.shape[-1]) + _displacements_flip[:, ::2] += (x_scale_factor - 1) / ( + x_scale_factor) + _displacements = (_displacements + _displacements_flip) / 2.0 + + else: + _heatmaps, _displacements = self.forward(feat) + + heatmaps.append(_heatmaps) + displacements.append(_displacements) + + preds = self.decode(heatmaps, displacements, test_cfg, metainfo) + + if test_cfg.get('output_heatmaps', False): + heatmaps = [hm.detach() for hm in heatmaps] + displacements = [dm.detach() for dm in displacements] + B = heatmaps[0].shape[0] + pred_fields = [] + for i in range(B): + pred_fields.append( + PixelData( + heatmaps=heatmaps[0][i], + displacements=displacements[0][i])) + return preds, pred_fields + else: + return preds + + def decode(self, + heatmaps: Tuple[Tensor], + displacements: Tuple[Tensor], + test_cfg: ConfigType = {}, + metainfo: dict = {}) -> InstanceList: + """Decode keypoints from outputs. + + Args: + heatmaps (Tuple[Tensor]): The output heatmaps inferred from one + image or multi-scale images. + displacements (Tuple[Tensor]): The output displacement fields + inferred from one image or multi-scale images. + test_cfg (dict): The runtime config for testing process. Defaults + to {} + metainfo (dict): The metainfo of test dataset. Defaults to {} + + Returns: + List[InstanceData]: A list of InstanceData, each contains the + decoded pose information of the instances of one data sample. + """ + + if self.decoder is None: + raise RuntimeError( + f'The decoder has not been set in {self.__class__.__name__}. 
' + 'Please set the decoder configs in the init parameters to ' + 'enable head methods `head.predict()` and `head.decode()`') + + multiscale_test = test_cfg.get('multiscale_test', False) + skeleton = metainfo.get('skeleton_links', None) + + preds = [] + batch_size = heatmaps[0].shape[0] + + for b in range(batch_size): + if multiscale_test: + raise NotImplementedError + else: + keypoints, (root_scores, + keypoint_scores) = self.decoder.decode( + heatmaps[0][b], displacements[0][b]) + + # rescore each instance + if self.rescore_net is not None and skeleton and len( + keypoints) > 0: + instance_scores = self.rescore_net(keypoints, keypoint_scores, + skeleton) + instance_scores[torch.isnan(instance_scores)] = 0 + root_scores = root_scores * instance_scores + + # nms + keypoints, keypoint_scores = to_numpy((keypoints, keypoint_scores)) + scores = to_numpy(root_scores)[..., None] * keypoint_scores + if len(keypoints) > 0 and test_cfg.get('nms_dist_thr', 0) > 0: + kpts_db = [] + for i in range(len(keypoints)): + kpts_db.append( + dict(keypoints=keypoints[i], score=keypoint_scores[i])) + keep_instance_inds = nearby_joints_nms( + kpts_db, + test_cfg['nms_dist_thr'], + test_cfg.get('nms_joints_thr', None), + score_per_joint=True, + max_dets=test_cfg.get('max_num_people', 30)) + keypoints = keypoints[keep_instance_inds] + scores = scores[keep_instance_inds] + + # pack outputs + preds.append( + InstanceData(keypoints=keypoints, keypoint_scores=scores)) + + return preds + + def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args, + **kwargs): + """A hook function to convert old-version state dict of + :class:`DEKRHead` (before MMPose v1.0.0) to a compatible format + of :class:`DEKRHead`. + + The hook will be automatically registered during initialization. 
+ """ + version = local_meta.get('version', None) + if version and version >= self._version: + return + + # convert old-version state dict + keys = list(state_dict.keys()) + for k in keys: + if 'offset_conv_layer' in k: + v = state_dict.pop(k) + k = k.replace('offset_conv_layers', 'displacement_conv_layers') + if 'displacement_conv_layers.3.' in k: + # the source and target of displacement vectors are + # opposite between two versions. + v = -v + state_dict[k] = v + + if 'heatmap_conv_layers.2' in k: + # root heatmap is at the first/last channel of the + # heatmap tensor in MMPose v0.x/1.x, respectively. + v = state_dict.pop(k) + state_dict[k] = torch.cat((v[1:], v[:1])) + + if 'rescore_net' in k: + v = state_dict.pop(k) + k = k.replace('rescore_net', 'head.rescore_net') + state_dict[k] = v diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py index ca7a5b509a..f21071e156 100644 --- a/mmpose/models/losses/__init__.py +++ b/mmpose/models/losses/__init__.py @@ -1,18 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .ae_loss import AssociativeEmbeddingLoss from .classification_loss import BCELoss, JSDiscretLoss, KLDiscretLoss -from .heatmap_loss import AdaptiveWingLoss -from .loss_wrappers import MultipleLossWrapper -from .mse_loss import (CombinedTargetMSELoss, KeypointMSELoss, - KeypointOHKMMSELoss) -from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory +from .heatmap_loss import (AdaptiveWingLoss, KeypointMSELoss, + KeypointOHKMMSELoss) +from .loss_wrappers import CombinedLoss, MultipleLossWrapper from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, RLELoss, - SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, - WingLoss) + SemiSupervisionLoss, SmoothL1Loss, + SoftWeightSmoothL1Loss, SoftWingLoss, WingLoss) __all__ = [ - 'KeypointMSELoss', 'KeypointOHKMMSELoss', 'CombinedTargetMSELoss', - 'HeatmapLoss', 'AELoss', 'MultiLossFactory', 'SmoothL1Loss', 'WingLoss', + 'KeypointMSELoss', 'KeypointOHKMMSELoss', 'SmoothL1Loss', 'WingLoss', 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss', - 'KLDiscretLoss', 'MultipleLossWrapper', 'JSDiscretLoss' + 'KLDiscretLoss', 'MultipleLossWrapper', 'JSDiscretLoss', 'CombinedLoss', + 'AssociativeEmbeddingLoss', 'SoftWeightSmoothL1Loss' ] diff --git a/mmpose/models/losses/ae_loss.py b/mmpose/models/losses/ae_loss.py new file mode 100644 index 0000000000..1f1e08181b --- /dev/null +++ b/mmpose/models/losses/ae_loss.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import List, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmpose.registry import MODELS + + +@MODELS.register_module() +class AssociativeEmbeddingLoss(nn.Module): + """Associative Embedding loss. 
+ + Details can be found in + `Associative Embedding `_ + + Note: + + - batch size: B + - instance number: N + - keypoint number: K + - keypoint dimension: D + - embedding tag dimension: L + - heatmap size: [W, H] + + Args: + loss_weight (float): Weight of the loss. Defaults to 1.0 + push_loss_factor (float): A factor that controls the weight between + the push loss and the pull loss. Defaults to 0.5 + """ + + def __init__(self, + loss_weight: float = 1.0, + push_loss_factor: float = 0.5) -> None: + super().__init__() + self.loss_weight = loss_weight + self.push_loss_factor = push_loss_factor + + def _ae_loss_per_image(self, tags: Tensor, keypoint_indices: Tensor): + """Compute associative embedding loss for one image. + + Args: + tags (Tensor): Tagging heatmaps in shape (K*L, H, W) + keypoint_indices (Tensor): Ground-truth keypint position indices + in shape (N, K, 2) + """ + K = keypoint_indices.shape[1] + C, H, W = tags.shape + L = C // K + + tags = tags.view(L, K, H * W) + instance_tags = [] + instance_kpt_tags = [] + + for keypoint_indices_n in keypoint_indices: + _kpt_tags = [] + for k in range(K): + if keypoint_indices_n[k, 1]: + _kpt_tags.append(tags[:, k, keypoint_indices_n[k, 0]]) + + if _kpt_tags: + kpt_tags = torch.stack(_kpt_tags) + instance_kpt_tags.append(kpt_tags) + instance_tags.append(kpt_tags.mean(dim=0)) + + N = len(instance_kpt_tags) # number of instances with valid keypoints + + if N == 0: + pull_loss = tags.new_zeros(size=(), requires_grad=True) + push_loss = tags.new_zeros(size=(), requires_grad=True) + else: + pull_loss = sum( + F.mse_loss(_kpt_tags, _tag.expand_as(_kpt_tags)) + for (_kpt_tags, _tag) in zip(instance_kpt_tags, instance_tags)) + + if N == 1: + push_loss = tags.new_zeros(size=(), requires_grad=True) + else: + tag_mat = torch.stack(instance_tags) # (N, L) + diff = tag_mat[None] - tag_mat[:, None] # (N, N, L) + push_loss = torch.sum(torch.exp(-diff.pow(2))) + + # normalization + eps = 1e-6 + pull_loss = pull_loss / (N + eps) + 
push_loss = push_loss / ((N - 1) * N + eps) + + return pull_loss, push_loss + + def forward(self, tags: Tensor, keypoint_indices: Union[List[Tensor], + Tensor]): + """Compute associative embedding loss on a batch of data. + + Args: + tags (Tensor): Tagging heatmaps in shape (B, L*K, H, W) + keypoint_indices (Tensor|List[Tensor]): Ground-truth keypint + position indices represented by a Tensor in shape + (B, N, K, 2), or a list of B Tensors in shape (N_i, K, 2) + Each keypoint's index is represented as [i, v], where i is the + position index in the heatmap (:math:`i=y*w+x`) and v is the + visibility + + Returns: + tuple: + - pull_loss (Tensor) + - push_loss (Tensor) + """ + + assert tags.shape[0] == len(keypoint_indices) + + pull_loss = 0. + push_loss = 0. + + for i in range(tags.shape[0]): + _pull, _push = self._ae_loss_per_image(tags[i], + keypoint_indices[i]) + pull_loss += _pull * self.loss_weight + push_loss += _push * self.loss_weight * self.push_loss_factor + + return pull_loss, push_loss diff --git a/mmpose/models/losses/classification_loss.py b/mmpose/models/losses/classification_loss.py index 6e4a07f014..5755edd4c1 100644 --- a/mmpose/models/losses/classification_loss.py +++ b/mmpose/models/losses/classification_loss.py @@ -72,16 +72,32 @@ def __init__( self.kl_loss = nn.KLDivLoss(reduction='none') def kl(self, p, q): + """Kullback-Leibler Divergence.""" + eps = 1e-24 kl_values = self.kl_loss((q + eps).log(), p) return kl_values def js(self, pred_hm, gt_hm): + """Jensen-Shannon Divergence.""" + m = 0.5 * (pred_hm + gt_hm) js_values = 0.5 * (self.kl(pred_hm, m) + self.kl(gt_hm, m)) return js_values def forward(self, pred_hm, gt_hm, target_weight=None): + """Forward function. + + Args: + pred_hm (torch.Tensor[N, K, H, W]): Predicted heatmaps. + gt_hm (torch.Tensor[N, K, H, W]): Target heatmaps. + target_weight (torch.Tensor[N, K] or torch.Tensor[N]): + Weights across different labels. + + Returns: + torch.Tensor: Loss value. 
+ """ + if self.use_target_weight: assert target_weight is not None assert pred_hm.ndim >= target_weight.ndim @@ -102,24 +118,31 @@ def forward(self, pred_hm, gt_hm, target_weight=None): @MODELS.register_module() class KLDiscretLoss(nn.Module): """Discrete KL Divergence loss for SimCC with Gaussian Label Smoothing. + Modified from `the official implementation. - Modified from `the official implementation `_. - Args: + beta (float): Temperature factor of Softmax. + label_softmax (bool): Whether to use Softmax on labels. use_target_weight (bool): Option to use weighted loss. Different joint types may have different target weights. """ - def __init__(self, use_target_weight=True): + def __init__(self, beta=1.0, label_softmax=False, use_target_weight=True): super(KLDiscretLoss, self).__init__() - + self.beta = beta + self.label_softmax = label_softmax self.use_target_weight = use_target_weight - self.log_softmax = nn.LogSoftmax(dim=1) # [B,LOGITS] + + self.log_softmax = nn.LogSoftmax(dim=1) self.kl_loss = nn.KLDivLoss(reduction='none') def criterion(self, dec_outs, labels): - scores = self.log_softmax(dec_outs) + """Criterion function.""" + + scores = self.log_softmax(dec_outs * self.beta) + if self.label_softmax: + labels = F.softmax(labels * self.beta, dim=1) loss = torch.mean(self.kl_loss(scores, labels), dim=1) return loss @@ -127,9 +150,11 @@ def forward(self, pred_simcc, gt_simcc, target_weight): """Forward function. Args: - pred_simcc (Tuple[Tensor, Tensor]): _description_ - gt_simcc (Tuple[Tensor, Tensor]): _description_ - target_weight (Tensor): _description_ + pred_simcc (Tuple[Tensor, Tensor]): Predicted SimCC vectors of + x-axis and y-axis. + gt_simcc (Tuple[Tensor, Tensor]): Target representations. + target_weight (torch.Tensor[N, K] or torch.Tensor[N]): + Weights across different labels. 
""" output_x, output_y = pred_simcc target_x, target_y = gt_simcc @@ -153,3 +178,44 @@ def forward(self, pred_simcc, gt_simcc, target_weight): self.criterion(coord_y_pred, coord_y_gt).mul(weight).sum()) return loss / num_joints + + +@MODELS.register_module() +class InfoNCELoss(nn.Module): + """InfoNCE loss for training a discriminative representation space with a + contrastive manner. + + `Representation Learning with Contrastive Predictive Coding + arXiv: `_. + + Args: + temperature (float, optional): The temperature to use in the softmax + function. Higher temperatures lead to softer probability + distributions. Defaults to 1.0. + loss_weight (float, optional): The weight to apply to the loss. + Defaults to 1.0. + """ + + def __init__(self, temperature: float = 1.0, loss_weight=1.0) -> None: + super(InfoNCELoss, self).__init__() + assert temperature > 0, f'the argument `temperature` must be ' \ + f'positive, but got {temperature}' + self.temp = temperature + self.loss_weight = loss_weight + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """Computes the InfoNCE loss. + + Args: + features (Tensor): A tensor containing the feature + representations of different samples. + + Returns: + Tensor: A tensor of shape (1,) containing the InfoNCE loss. + """ + n = features.size(0) + features_norm = F.normalize(features, dim=1) + logits = features_norm.mm(features_norm.t()) / self.temp + targets = torch.arange(n, dtype=torch.long, device=features.device) + loss = F.cross_entropy(logits, targets, reduction='sum') + return loss * self.loss_weight diff --git a/mmpose/models/losses/heatmap_loss.py b/mmpose/models/losses/heatmap_loss.py index 065755d2fc..a105149468 100644 --- a/mmpose/models/losses/heatmap_loss.py +++ b/mmpose/models/losses/heatmap_loss.py @@ -1,10 +1,284 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + import torch import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor from mmpose.registry import MODELS +@MODELS.register_module() +class KeypointMSELoss(nn.Module): + """MSE loss for heatmaps. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + Defaults to ``False`` + skip_empty_channel (bool): If ``True``, heatmap channels with no + non-zero value (which means no visible ground-truth keypoint + in the image) will not be used to calculate the loss. Defaults to + ``False`` + loss_weight (float): Weight of the loss. Defaults to 1.0 + """ + + def __init__(self, + use_target_weight: bool = False, + skip_empty_channel: bool = False, + loss_weight: float = 1.): + super().__init__() + self.use_target_weight = use_target_weight + self.skip_empty_channel = skip_empty_channel + self.loss_weight = loss_weight + + def forward(self, + output: Tensor, + target: Tensor, + target_weights: Optional[Tensor] = None, + mask: Optional[Tensor] = None) -> Tensor: + """Forward function of loss. + + Note: + - batch_size: B + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (Tensor): The output heatmaps with shape [B, K, H, W] + target (Tensor): The target heatmaps with shape [B, K, H, W] + target_weights (Tensor, optional): The target weights of differet + keypoints, with shape [B, K] (keypoint-wise) or + [B, K, H, W] (pixel-wise). + mask (Tensor, optional): The masks of valid heatmap pixels in + shape [B, K, H, W] or [B, 1, H, W]. If ``None``, no mask will + be applied. Defaults to ``None`` + + Returns: + Tensor: The calculated loss. 
def _get_mask(self, target: Tensor, target_weights: Optional[Tensor],
              mask: Optional[Tensor]) -> Optional[Tensor]:
    """Build the loss mask from the spatial mask, the target weights and
    the ``skip_empty_channel`` setting.

    The three sources are optional; every one that is present is multiplied
    into the result.

    Args:
        target (Tensor): The target heatmaps, e.g. with shape [B, K, H, W].
        target_weights (Tensor, optional): Keypoint-wise weights with shape
            [B, K] or pixel-wise weights with shape [B, K, H, W].
        mask (Tensor, optional): Spatial mask with the same number of
            dimensions as ``target``; every dimension must either match
            ``target`` or be 1.

    Returns:
        Tensor | None: A mask broadcastable to ``target`` (shape
        (B, K, *)), or ``None`` if no masking is required.

    Raises:
        AssertionError: If ``mask`` or ``target_weights`` has a shape that
            is incompatible with ``target``.
    """
    # Given spatial mask
    if mask is not None:
        # The mask must be broadcastable to the target dimension-wise.
        assert (mask.ndim == target.ndim and all(
            d_m == d_t or d_m == 1
            for d_m, d_t in zip(mask.shape, target.shape))), (
                f'mask and target have mismatched shapes {mask.shape} v.s.'
                f'{target.shape}')

    # Mask by target weights (keypoint-wise or pixel-wise)
    if target_weights is not None:
        # The weights must match the leading dimensions of the target.
        assert (target_weights.ndim in (2, 4) and target_weights.shape
                == target.shape[:target_weights.ndim]), (
                    'target_weights and target have mismatched shapes '
                    f'{target_weights.shape} v.s. {target.shape}')

        # Append singleton dims so [B, K] weights broadcast over H and W.
        ndim_pad = target.ndim - target_weights.ndim
        _mask = target_weights.view(target_weights.shape +
                                    (1, ) * ndim_pad)
        mask = _mask if mask is None else mask * _mask

    # Mask by ``skip_empty_channel``
    if self.skip_empty_channel:
        # Reduce over the flattened spatial axis ONLY, so that every
        # (sample, keypoint) channel gets its own flag. A bare ``.any()``
        # would collapse the whole batch into a single boolean and the
        # empty channels would never be skipped individually.
        _mask = (target != 0).flatten(2).any(dim=2)
        ndim_pad = target.ndim - _mask.ndim
        _mask = _mask.view(_mask.shape + (1, ) * ndim_pad)
        mask = _mask if mask is None else mask * _mask

    return mask
def forward(self, output: Tensor, target: Tensor,
            target_weights: Tensor) -> Tensor:
    """Compute the combined classification + offset-regression MSE loss.

    Note:
        - batch_size: B
        - num_channels: C
        - heatmaps height: H
        - heatmaps width: W
        - num_keypoints: K
        Here, C = 3 * K. For every keypoint the channels are laid out as
        (response map, x-offset map, y-offset map).

    Args:
        output (Tensor): The output feature maps with shape [B, C, H, W].
        target (Tensor): The target feature maps with shape [B, C, H, W].
        target_weights (Tensor): The target weights of different
            keypoints, with shape [B, K]. Only read when
            ``use_target_weight`` is set.

    Returns:
        Tensor: The loss averaged over keypoints and scaled by
        ``loss_weight``.
    """
    n_batch, n_chan = output.size(0), output.size(1)
    # Flatten the spatial dimensions: [B, C, H, W] -> [B, C, H*W]
    flat_pred = output.reshape((n_batch, n_chan, -1))
    flat_gt = target.reshape((n_batch, n_chan, -1))

    num_joints = n_chan // 3
    total = 0.
    for base in range(0, num_joints * 3, 3):
        cls_pred, dx_pred, dy_pred = (
            flat_pred[:, base + k].squeeze() for k in range(3))
        cls_gt, dx_gt, dy_gt = (
            flat_gt[:, base + k].squeeze() for k in range(3))

        if self.use_target_weight:
            joint_weight = target_weights[:, base // 3, None]
            cls_pred = cls_pred * joint_weight
            cls_gt = cls_gt * joint_weight

        # classification loss on the response map
        total += 0.5 * self.criterion(cls_pred, cls_gt)
        # regression loss: offsets are gated by the (weighted) GT response
        total += 0.5 * self.criterion(cls_gt * dx_pred, cls_gt * dx_gt)
        total += 0.5 * self.criterion(cls_gt * dy_pred, cls_gt * dy_gt)

    return total / num_joints * self.loss_weight
def forward(self, output: Tensor, target: Tensor,
            target_weights: Tensor) -> Tensor:
    """Compute the per-keypoint MSE and reduce it with hard keypoint mining.

    Note:
        - batch_size: B
        - num_keypoints: K
        - heatmaps height: H
        - heatmaps width: W

    Args:
        output (Tensor): The output heatmaps with shape [B, K, H, W].
        target (Tensor): The target heatmaps with shape [B, K, H, W].
        target_weights (Tensor): The target weights of different
            keypoints, with shape [B, K]. Only read when
            ``use_target_weight`` is set.

    Returns:
        Tensor: The value returned by ``self._ohkm`` for the [B, K] loss
        matrix, scaled by ``loss_weight``.

    Raises:
        ValueError: If there are fewer keypoint channels than ``topk``.
    """
    num_keypoints = output.size(1)
    if num_keypoints < self.topk:
        raise ValueError(f'topk ({self.topk}) should not be '
                         f'larger than num_keypoints ({num_keypoints}).')

    per_keypoint = []
    for idx in range(num_keypoints):
        pred_k, gt_k = output[:, idx], target[:, idx]
        if self.use_target_weight:
            # [B] -> [B, 1, 1] so the weight broadcasts over H and W
            weight_k = target_weights[:, idx, None, None]
            pred_k, gt_k = pred_k * weight_k, gt_k * weight_k
        # Element-wise loss -> mean over H and W -> column of shape [B, 1]
        column = self.criterion(pred_k, gt_k).mean(dim=(1, 2))
        per_keypoint.append(column.unsqueeze(dim=1))

    losses = torch.cat(per_keypoint, dim=1)

    return self._ohkm(losses) * self.loss_weight
def forward(self,
            output: Tensor,
            target: Tensor,
            target_weights: Optional[Tensor] = None,
            mask: Optional[Tensor] = None) -> Tensor:
    """Compute the modified (CornerNet-style) focal loss for heatmaps.

    Pixels whose target equals 1 are treated as positives, pixels whose
    target is below 1 as negatives; negatives are down-weighted by
    ``(1 - target) ** beta``.

    Note:
        - batch_size: B
        - num_keypoints: K
        - heatmaps height: H
        - heatmaps width: W

    Args:
        output (Tensor): The output heatmaps with shape [B, K, H, W]
        target (Tensor): The target heatmaps with shape [B, K, H, W]
        target_weights (Tensor, optional): The target weights of different
            keypoints, with shape [B, K] (keypoint-wise) or
            [B, K, H, W] (pixel-wise).
        mask (Tensor, optional): The masks of valid heatmap pixels in
            shape [B, K, H, W] or [B, 1, H, W]. If ``None``, no mask will
            be applied. Defaults to ``None``

    Returns:
        Tensor: The loss, normalised by the (masked) number of positive
        pixels when there is at least one, and scaled by ``loss_weight``.
    """
    valid = self._get_mask(target, target_weights, mask)

    is_peak = target.eq(1).float()
    is_background = target.lt(1).float()
    if valid is not None:
        is_peak = is_peak * valid
        is_background = is_background * valid

    # Negatives close to a peak (target near 1) are penalised less.
    background_decay = torch.pow(1 - target, self.beta)

    pos_loss = torch.log(output) * torch.pow(1 - output,
                                             self.alpha) * is_peak
    neg_loss = torch.log(1 - output) * torch.pow(
        output, self.alpha) * background_decay * is_background

    num_pos = is_peak.float().sum()
    if num_pos == 0:
        # No positives: pos_loss is all zero, so skip the normalisation.
        return -neg_loss.sum() * self.loss_weight
    return -(pos_loss.sum() + neg_loss.sum()) / num_pos * self.loss_weight
@MODELS.register_module()
class CombinedLoss(nn.ModuleDict):
    """A container that holds several named loss modules.

    The wrapped losses may take different kinds of input (e.g. heatmaps or
    regression values), so this wrapper defines no joint forward pass: each
    loss can only be invoked individually and explicitly through its name.

    Args:
        losses (Dict[str, ConfigType]): Mapping from the name of each loss
            to the config it is built from via ``MODELS.build``.

    Example::
        >>> heatmap_loss_cfg = dict(type='KeypointMSELoss')
        >>> ae_loss_cfg = dict(type='AssociativeEmbeddingLoss')
        >>> loss_module = CombinedLoss(
        ...     losses=dict(
        ...         heatmap_loss=heatmap_loss_cfg,
        ...         ae_loss=ae_loss_cfg))
        >>> loss_hm = loss_module.heatmap_loss(pred_heatmap, gt_heatmap)
        >>> loss_ae = loss_module.ae_loss(pred_tags, keypoint_indices)
    """

    def __init__(self, losses: Dict[str, ConfigType]):
        super().__init__()
        # Register every built loss as a sub-module under its config name.
        for loss_name, loss_cfg in losses.items():
            built_loss = MODELS.build(loss_cfg)
            self.add_module(loss_name, built_loss)
Defaults to 1.0 - """ - - def __init__(self, - use_target_weight: bool = False, - loss_weight: float = 1.): - super().__init__() - self.criterion = nn.MSELoss() - self.use_target_weight = use_target_weight - self.loss_weight = loss_weight - - def forward(self, output: Tensor, target: Tensor, - target_weights: Tensor) -> Tensor: - """Forward function of loss. - - Note: - - batch_size: B - - num_keypoints: K - - heatmaps height: H - - heatmaps weight: W - - Args: - output (Tensor): The output heatmaps with shape [B, K, H, W]. - target (Tensor): The target heatmaps with shape [B, K, H, W]. - target_weights (Tensor): The target weights of differet keypoints, - with shape [B, K]. - - Returns: - Tensor: The calculated loss. - """ - if self.use_target_weight: - assert target_weights is not None - assert output.ndim >= target_weights.ndim - - for i in range(output.ndim - target_weights.ndim): - target_weights = target_weights.unsqueeze(-1) - - loss = self.criterion(output * target_weights, - target * target_weights) - else: - loss = self.criterion(output, target) - - return loss * self.loss_weight - - -@MODELS.register_module() -class CombinedTargetMSELoss(nn.Module): - """MSE loss for combined target. - - CombinedTarget: The combination of classification target - (response map) and regression target (offset map). - Paper ref: Huang et al. The Devil is in the Details: Delving into - Unbiased Data Processing for Human Pose Estimation (CVPR 2020). - - Args: - use_target_weight (bool): Option to use weighted MSE loss. - Different joint types may have different target weights. - Defaults to ``False`` - loss_weight (float): Weight of the loss. 
Defaults to 1.0 - """ - - def __init__(self, - use_target_weight: bool = False, - loss_weight: float = 1.): - super().__init__() - self.criterion = nn.MSELoss(reduction='mean') - self.use_target_weight = use_target_weight - self.loss_weight = loss_weight - - def forward(self, output: Tensor, target: Tensor, - target_weights: Tensor) -> Tensor: - """Forward function of loss. - - Note: - - batch_size: B - - num_channels: C - - heatmaps height: H - - heatmaps weight: W - - num_keypoints: K - Here, C = 3 * K - - Args: - output (Tensor): The output feature maps with shape [B, C, H, W]. - target (Tensor): The target feature maps with shape [B, C, H, W]. - target_weights (Tensor): The target weights of differet keypoints, - with shape [B, K]. - - Returns: - Tensor: The calculated loss. - """ - batch_size = output.size(0) - num_channels = output.size(1) - heatmaps_pred = output.reshape( - (batch_size, num_channels, -1)).split(1, 1) - heatmaps_gt = target.reshape( - (batch_size, num_channels, -1)).split(1, 1) - loss = 0. 
- num_joints = num_channels // 3 - for idx in range(num_joints): - heatmap_pred = heatmaps_pred[idx * 3].squeeze() - heatmap_gt = heatmaps_gt[idx * 3].squeeze() - offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze() - offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze() - offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze() - offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze() - if self.use_target_weight: - target_weight = target_weights[:, idx, None] - heatmap_pred = heatmap_pred * target_weight - heatmap_gt = heatmap_gt * target_weight - # classification loss - loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt) - # regression loss - loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred, - heatmap_gt * offset_x_gt) - loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred, - heatmap_gt * offset_y_gt) - return loss / num_joints * self.loss_weight - - -@MODELS.register_module() -class KeypointOHKMMSELoss(nn.Module): - """MSE loss with online hard keypoint mining. - - Args: - use_target_weight (bool): Option to use weighted MSE loss. - Different joint types may have different target weights. - Defaults to ``False`` - topk (int): Only top k joint losses are kept. Defaults to 8 - loss_weight (float): Weight of the loss. Defaults to 1.0 - """ - - def __init__(self, - use_target_weight: bool = False, - topk: int = 8, - loss_weight: float = 1.): - super().__init__() - assert topk > 0 - self.criterion = nn.MSELoss(reduction='none') - self.use_target_weight = use_target_weight - self.topk = topk - self.loss_weight = loss_weight - - def _ohkm(self, losses: Tensor) -> Tensor: - """Online hard keypoint mining. - - Note: - - batch_size: B - - num_keypoints: K - - Args: - loss (Tensor): The losses with shape [B, K] - - Returns: - Tensor: The calculated loss. - """ - ohkm_loss = 0. 
- B = losses.shape[0] - for i in range(B): - sub_loss = losses[i] - _, topk_idx = torch.topk( - sub_loss, k=self.topk, dim=0, sorted=False) - tmp_loss = torch.gather(sub_loss, 0, topk_idx) - ohkm_loss += torch.sum(tmp_loss) / self.topk - ohkm_loss /= B - return ohkm_loss - - def forward(self, output: Tensor, target: Tensor, - target_weights: Tensor) -> Tensor: - """Forward function of loss. - - Note: - - batch_size: B - - num_keypoints: K - - heatmaps height: H - - heatmaps weight: W - - Args: - output (Tensor): The output heatmaps with shape [B, K, H, W]. - target (Tensor): The target heatmaps with shape [B, K, H, W]. - target_weights (Tensor): The target weights of differet keypoints, - with shape [B, K]. - - Returns: - Tensor: The calculated loss. - """ - num_keypoints = output.size(1) - if num_keypoints < self.topk: - raise ValueError(f'topk ({self.topk}) should not be ' - f'larger than num_keypoints ({num_keypoints}).') - - losses = [] - for idx in range(num_keypoints): - if self.use_target_weight: - target_weight = target_weights[:, idx, None, None] - losses.append( - self.criterion(output[:, idx] * target_weight, - target[:, idx] * target_weight)) - else: - losses.append(self.criterion(output[:, idx], target[:, idx])) - - losses = [loss.mean(dim=(1, 2)).unsqueeze(dim=1) for loss in losses] - losses = torch.cat(losses, dim=1) - - return self._ohkm(losses) * self.loss_weight diff --git a/mmpose/models/losses/multi_loss_factory.py b/mmpose/models/losses/multi_loss_factory.py deleted file mode 100644 index 4fef842efc..0000000000 --- a/mmpose/models/losses/multi_loss_factory.py +++ /dev/null @@ -1,281 +0,0 @@ -# ------------------------------------------------------------------------------ -# Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation -# Original licence: Copyright (c) Microsoft, under the MIT License. 
-# ------------------------------------------------------------------------------ - -import torch -import torch.nn as nn - -from mmpose.registry import MODELS - - -def _make_input(t, requires_grad=False, device=torch.device('cpu')): - """Make zero inputs for AE loss. - - Args: - t (torch.Tensor): input - requires_grad (bool): Option to use requires_grad. - device: torch device - - Returns: - torch.Tensor: zero input. - """ - inp = torch.autograd.Variable(t, requires_grad=requires_grad) - inp = inp.sum() - inp = inp.to(device) - return inp - - -@MODELS.register_module() -class HeatmapLoss(nn.Module): - """Accumulate the heatmap loss for each image in the batch. - - Args: - supervise_empty (bool): Whether to supervise empty channels. - """ - - def __init__(self, supervise_empty=True): - super().__init__() - self.supervise_empty = supervise_empty - - def forward(self, pred, gt, mask): - """Forward function. - - Note: - - batch_size: N - - heatmaps weight: W - - heatmaps height: H - - max_num_people: M - - num_keypoints: K - - Args: - pred (torch.Tensor[N,K,H,W]):heatmap of output. - gt (torch.Tensor[N,K,H,W]): target heatmap. - mask (torch.Tensor[N,H,W]): mask of target. - """ - assert pred.size() == gt.size( - ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}' - - if not self.supervise_empty: - empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float() - loss = ((pred - gt)**2) * empty_mask.expand_as( - pred) * mask[:, None, :, :].expand_as(pred) - else: - loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred) - loss = loss.mean(dim=3).mean(dim=2).mean(dim=1) - return loss - - -@MODELS.register_module() -class AELoss(nn.Module): - """Associative Embedding loss. - - `Associative Embedding: End-to-End Learning for Joint Detection and - Grouping `_. - """ - - def __init__(self, loss_type): - super().__init__() - self.loss_type = loss_type - - def singleTagLoss(self, pred_tag, joints): - """Associative embedding loss for one image. 
- - Note: - - heatmaps weight: W - - heatmaps height: H - - max_num_people: M - - num_keypoints: K - - Args: - pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image. - joints (torch.Tensor[M,K,2]): joints information for one image. - """ - tags = [] - pull = 0 - for joints_per_person in joints: - tmp = [] - for joint in joints_per_person: - if joint[1] > 0: - tmp.append(pred_tag[joint[0]]) - if len(tmp) == 0: - continue - tmp = torch.stack(tmp) - tags.append(torch.mean(tmp, dim=0)) - pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2) - - num_tags = len(tags) - if num_tags == 0: - return ( - _make_input(torch.zeros(1).float(), device=pred_tag.device), - _make_input(torch.zeros(1).float(), device=pred_tag.device)) - elif num_tags == 1: - return (_make_input( - torch.zeros(1).float(), device=pred_tag.device), pull) - - tags = torch.stack(tags) - - size = (num_tags, num_tags) - A = tags.expand(*size) - B = A.permute(1, 0) - - diff = A - B - - if self.loss_type == 'exp': - diff = torch.pow(diff, 2) - push = torch.exp(-diff) - push = torch.sum(push) - num_tags - elif self.loss_type == 'max': - diff = 1 - torch.abs(diff) - push = torch.clamp(diff, min=0).sum() - num_tags - else: - raise ValueError('Unknown ae loss type') - - push_loss = push / ((num_tags - 1) * num_tags) * 0.5 - pull_loss = pull / (num_tags) - - return push_loss, pull_loss - - def forward(self, tags, joints): - """Accumulate the tag loss for each image in the batch. - - Note: - - batch_size: N - - heatmaps weight: W - - heatmaps height: H - - max_num_people: M - - num_keypoints: K - - Args: - tags (torch.Tensor[N,KxHxW,1]): tag channels of output. - joints (torch.Tensor[N,M,K,2]): joints information. 
- """ - pushes, pulls = [], [] - joints = joints.cpu().data.numpy() - batch_size = tags.size(0) - for i in range(batch_size): - push, pull = self.singleTagLoss(tags[i], joints[i]) - pushes.append(push) - pulls.append(pull) - return torch.stack(pushes), torch.stack(pulls) - - -@MODELS.register_module() -class MultiLossFactory(nn.Module): - """Loss for bottom-up models. - - Args: - num_joints (int): Number of keypoints. - num_stages (int): Number of stages. - ae_loss_type (str): Type of ae loss. - with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap. - push_loss_factor (list[float]): - Parameter of push loss in multi-heatmap. - pull_loss_factor (list[float]): - Parameter of pull loss in multi-heatmap. - with_heatmap_loss (list[bool]): - Use heatmap loss or not in multi-heatmap. - heatmaps_loss_factor (list[float]): - Parameter of heatmap loss in multi-heatmap. - supervise_empty (bool): Whether to supervise empty channels. - """ - - def __init__(self, - num_joints, - num_stages, - ae_loss_type, - with_ae_loss, - push_loss_factor, - pull_loss_factor, - with_heatmaps_loss, - heatmaps_loss_factor, - supervise_empty=True): - super().__init__() - - assert isinstance(with_heatmaps_loss, (list, tuple)), \ - 'with_heatmaps_loss should be a list or tuple' - assert isinstance(heatmaps_loss_factor, (list, tuple)), \ - 'heatmaps_loss_factor should be a list or tuple' - assert isinstance(with_ae_loss, (list, tuple)), \ - 'with_ae_loss should be a list or tuple' - assert isinstance(push_loss_factor, (list, tuple)), \ - 'push_loss_factor should be a list or tuple' - assert isinstance(pull_loss_factor, (list, tuple)), \ - 'pull_loss_factor should be a list or tuple' - - self.num_joints = num_joints - self.num_stages = num_stages - self.ae_loss_type = ae_loss_type - self.with_ae_loss = with_ae_loss - self.push_loss_factor = push_loss_factor - self.pull_loss_factor = pull_loss_factor - self.with_heatmaps_loss = with_heatmaps_loss - self.heatmaps_loss_factor = 
heatmaps_loss_factor - - self.heatmaps_loss = \ - nn.ModuleList( - [ - HeatmapLoss(supervise_empty) - if with_heatmaps_loss else None - for with_heatmaps_loss in self.with_heatmaps_loss - ] - ) - - self.ae_loss = \ - nn.ModuleList( - [ - AELoss(self.ae_loss_type) if with_ae_loss else None - for with_ae_loss in self.with_ae_loss - ] - ) - - def forward(self, outputs, heatmaps, masks, joints): - """Forward function to calculate losses. - - Note: - - batch_size: N - - heatmaps weight: W - - heatmaps height: H - - max_num_people: M - - num_keypoints: K - - output_channel: C C=2K if use ae loss else K - - Args: - outputs (list(torch.Tensor[N,C,H,W])): outputs of stages. - heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps. - masks (list(torch.Tensor[N,H,W])): masks of heatmaps. - joints (list(torch.Tensor[N,M,K,2])): joints of ae loss. - """ - heatmaps_losses = [] - push_losses = [] - pull_losses = [] - for idx in range(len(outputs)): - offset_feat = 0 - if self.heatmaps_loss[idx]: - heatmaps_pred = outputs[idx][:, :self.num_joints] - offset_feat = self.num_joints - heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred, - heatmaps[idx], - masks[idx]) - heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx] - heatmaps_losses.append(heatmaps_loss) - else: - heatmaps_losses.append(None) - - if self.ae_loss[idx]: - tags_pred = outputs[idx][:, offset_feat:] - batch_size = tags_pred.size()[0] - tags_pred = tags_pred.contiguous().view(batch_size, -1, 1) - - push_loss, pull_loss = self.ae_loss[idx](tags_pred, - joints[idx]) - push_loss = push_loss * self.push_loss_factor[idx] - pull_loss = pull_loss * self.pull_loss_factor[idx] - - push_losses.append(push_loss) - pull_losses.append(pull_loss) - else: - push_losses.append(None) - pull_losses.append(None) - - return heatmaps_losses, push_losses, pull_losses diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index 2fa6e182da..9a64a4adfe 100644 --- 
def forward(self, output, target, target_weight=None):
    """Compute the (optionally soft-weighted) smooth L1 loss.

    Note:
        - batch_size: N
        - num_keypoints: K
        - dimension of keypoints: D (D=2 or D=3)

    Args:
        output (torch.Tensor[N, K, D]): Output regression.
        target (torch.Tensor[N, K, D]): Target regression.
        target_weight (torch.Tensor[N, K, D]): Weights across different
            joint types. May have fewer trailing dimensions than
            ``output``; required when ``use_target_weight`` is set.

    Returns:
        torch.Tensor: The loss scaled by ``loss_weight``.
    """
    if not self.use_target_weight:
        return self.criterion(output, target) * self.loss_weight

    assert target_weight is not None
    assert output.ndim >= target_weight.ndim

    # Pad trailing singleton dims so the weight broadcasts against output.
    missing_dims = output.ndim - target_weight.ndim
    for _ in range(missing_dims):
        target_weight = target_weight.unsqueeze(-1)

    weighted = self.criterion(output, target) * target_weight
    if self.supervise_empty:
        # Zero-weight entries still count in the denominator.
        loss = weighted.mean()
    else:
        # Normalise by the number of positive-weight entries (at least 1).
        num_elements = torch.nonzero(target_weight > 0).size()[0]
        loss = weighted.sum() / max(num_elements, 1.0)

    return loss * self.loss_weight
+from .bottomup import BottomupPoseEstimator from .topdown import TopdownPoseEstimator -__all__ = ['TopdownPoseEstimator'] +__all__ = ['TopdownPoseEstimator', 'BottomupPoseEstimator'] diff --git a/mmpose/models/pose_estimators/base.py b/mmpose/models/pose_estimators/base.py index 049b37756c..b97232b344 100644 --- a/mmpose/models/pose_estimators/base.py +++ b/mmpose/models/pose_estimators/base.py @@ -1,12 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod +from typing import Tuple, Union import torch from mmengine.model import BaseModel from torch import Tensor -from mmpose.utils.typing import (ForwardResults, OptConfigType, OptMultiConfig, - OptSampleList, SampleList) +from mmpose.datasets.datasets.utils import parse_pose_metainfo +from mmpose.registry import MODELS +from mmpose.utils.typing import (ConfigType, ForwardResults, OptConfigType, + Optional, OptMultiConfig, OptSampleList, + SampleList) class BasePoseEstimator(BaseModel, metaclass=ABCMeta): @@ -14,16 +18,44 @@ class BasePoseEstimator(BaseModel, metaclass=ABCMeta): Args: data_preprocessor (dict | ConfigDict, optional): The pre-processing - config of :class:`BaseDataPreprocessor`. Defaults to ``None``. + config of :class:`BaseDataPreprocessor`. Defaults to ``None`` init_cfg (dict | ConfigDict): The model initialization config. Defaults to ``None`` + metainfo (dict): Meta information for dataset, such as keypoints + definition and properties. If set, the metainfo of the input data + batch will be overridden. For more details, please refer to + https://mmpose.readthedocs.io/en/1.x/user_guides/ + prepare_datasets.html#create-a-custom-dataset-info- + config-file-for-the-dataset. 
Defaults to ``None`` """ + _version = 2 def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None): + init_cfg: OptMultiConfig = None, + metainfo: Optional[dict] = None): super().__init__( data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.metainfo = self._load_metainfo(metainfo) + + self.backbone = MODELS.build(backbone) + + if neck is not None: + self.neck = MODELS.build(neck) + + if head is not None: + self.head = MODELS.build(head) + + self.train_cfg = train_cfg if train_cfg else {} + self.test_cfg = test_cfg if test_cfg else {} + + # Register the hook to automatically convert old version state dicts + self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) @property def with_neck(self) -> bool: @@ -35,6 +67,27 @@ def with_head(self) -> bool: """bool: whether the pose estimator has a head.""" return hasattr(self, 'head') and self.head is not None + @staticmethod + def _load_metainfo(metainfo: dict = None) -> dict: + """Collect meta information from the dictionary of meta. + + Args: + metainfo (dict): Raw data of pose meta information. + + Returns: + dict: Parsed meta information. 
+ """ + + if metainfo is None: + return None + + if not isinstance(metainfo, dict): + raise TypeError( + f'metainfo should be a dict, but got {type(metainfo)}') + + metainfo = parse_pose_metainfo(metainfo) + return metainfo + def forward(self, inputs: torch.Tensor, data_samples: OptSampleList, @@ -73,6 +126,10 @@ def forward(self, if mode == 'loss': return self.loss(inputs, data_samples) elif mode == 'predict': + # use customed metainfo to override the default metainfo + if self.metainfo is not None: + for data_sample in data_samples: + data_sample.set_metainfo(self.metainfo) return self.predict(inputs, data_samples) elif mode == 'tensor': return self._forward(inputs) @@ -89,14 +146,58 @@ def predict(self, inputs: Tensor, data_samples: SampleList) -> SampleList: """Predict results from a batch of inputs and data samples with post- processing.""" - @abstractmethod - def _forward(self, inputs: Tensor, data_samples: OptSampleList = None): - """Network forward process. + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None + ) -> Union[Tensor, Tuple[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. - Usually includes backbone, neck and head forward without any post- - processing. + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + Union[Tensor | Tuple[Tensor]]: forward output of the network. """ - @abstractmethod - def extract_feat(self, inputs: Tensor): - """Extract features.""" + x = self.extract_feat(inputs) + if self.with_head: + x = self.head.forward(x) + + return x + + def extract_feat(self, inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have various + resolutions. 
+ """ + x = self.backbone(inputs) + if self.with_neck: + x = self.neck(x) + + return x + + def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args, + **kwargs): + """A hook function to convert old-version state dict of + :class:`TopdownHeatmapSimpleHead` (before MMPose v1.0.0) to a + compatible format of :class:`HeatmapHead`. + + The hook will be automatically registered during initialization. + """ + version = local_meta.get('version', None) + if version and version >= self._version: + return + + # convert old-version state dict + keys = list(state_dict.keys()) + for k in keys: + if 'keypoint_head' in k: + v = state_dict.pop(k) + k = k.replace('keypoint_head', 'head') + state_dict[k] = v diff --git a/mmpose/models/pose_estimators/bottomup.py b/mmpose/models/pose_estimators/bottomup.py new file mode 100644 index 0000000000..5400f2478e --- /dev/null +++ b/mmpose/models/pose_estimators/bottomup.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from itertools import zip_longest +from typing import List, Optional, Union + +from mmengine.utils import is_list_of +from torch import Tensor + +from mmpose.registry import MODELS +from mmpose.utils.typing import (ConfigType, InstanceList, OptConfigType, + OptMultiConfig, PixelDataList, SampleList) +from .base import BasePoseEstimator + + +@MODELS.register_module() +class BottomupPoseEstimator(BasePoseEstimator): + """Base class for bottom-up pose estimators. + + Args: + backbone (dict): The backbone config + neck (dict, optional): The neck config. Defaults to ``None`` + head (dict, optional): The head config. Defaults to ``None`` + train_cfg (dict, optional): The runtime config for training process. + Defaults to ``None`` + test_cfg (dict, optional): The runtime config for testing process. + Defaults to ``None`` + data_preprocessor (dict, optional): The data preprocessing config to + build the instance of :class:`BaseDataPreprocessor`. Defaults to + ``None``. 
+ init_cfg (dict, optional): The config to control the initialization. + Defaults to ``None`` + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + head=head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`PoseDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of losses. + """ + feats = self.extract_feat(inputs) + + losses = dict() + + if self.with_head: + losses.update( + self.head.loss(feats, data_samples, train_cfg=self.train_cfg)) + + return losses + + def predict(self, inputs: Union[Tensor, List[Tensor]], + data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Tensor | List[Tensor]): Input image in tensor or image + pyramid as a list of tensors. Each tensor is in shape + [B, C, H, W] + data_samples (List[:obj:`PoseDataSample`]): The batch + data samples + + Returns: + list[:obj:`PoseDataSample`]: The pose estimation results of the + input images. 
The return value is `PoseDataSample` instances with + ``pred_instances`` and ``pred_fields``(optional) field , and + ``pred_instances`` usually contains the following keys: + + - keypoints (Tensor): predicted keypoint coordinates in shape + (num_instances, K, D) where K is the keypoint number and D + is the keypoint dimension + - keypoint_scores (Tensor): predicted keypoint scores in shape + (num_instances, K) + """ + assert self.with_head, ( + 'The model must have head to perform prediction.') + + multiscale_test = self.test_cfg.get('multiscale_test', False) + flip_test = self.test_cfg.get('flip_test', False) + + # enable multi-scale test + aug_scales = data_samples[0].metainfo.get('aug_scales', None) + if multiscale_test: + assert isinstance(aug_scales, list) + assert is_list_of(inputs, Tensor) + # `inputs` includes images in original and augmented scales + assert len(inputs) == len(aug_scales) + 1 + else: + assert isinstance(inputs, Tensor) + # single-scale test + inputs = [inputs] + + feats = [] + for _inputs in inputs: + if flip_test: + _feats_orig = self.extract_feat(_inputs) + _feats_flip = self.extract_feat(_inputs.flip(-1)) + _feats = [_feats_orig, _feats_flip] + else: + _feats = self.extract_feat(_inputs) + + feats.append(_feats) + + if not multiscale_test: + feats = feats[0] + + preds = self.head.predict(feats, data_samples, test_cfg=self.test_cfg) + + if isinstance(preds, tuple): + batch_pred_instances, batch_pred_fields = preds + else: + batch_pred_instances = preds + batch_pred_fields = None + + results = self.add_pred_to_datasample(batch_pred_instances, + batch_pred_fields, data_samples) + + return results + + def add_pred_to_datasample(self, batch_pred_instances: InstanceList, + batch_pred_fields: Optional[PixelDataList], + batch_data_samples: SampleList) -> SampleList: + """Add predictions into data samples. 
+ + Args: + batch_pred_instances (List[InstanceData]): The predicted instances + of the input data batch + batch_pred_fields (List[PixelData], optional): The predicted + fields (e.g. heatmaps) of the input batch + batch_data_samples (List[PoseDataSample]): The input data batch + + Returns: + List[PoseDataSample]: A list of data samples where the predictions + are stored in the ``pred_instances`` field of each data sample. + The length of the list is the batch size when ``merge==False``, or + 1 when ``merge==True``. + """ + assert len(batch_pred_instances) == len(batch_data_samples) + if batch_pred_fields is None: + batch_pred_fields = [] + + for pred_instances, pred_fields, data_sample in zip_longest( + batch_pred_instances, batch_pred_fields, batch_data_samples): + + # convert keypoint coordinates from input space to image space + input_size = data_sample.metainfo['input_size'] + input_center = data_sample.metainfo['input_center'] + input_scale = data_sample.metainfo['input_scale'] + + pred_instances.keypoints = pred_instances.keypoints / input_size \ + * input_scale + input_center - 0.5 * input_scale + + data_sample.pred_instances = pred_instances + + if pred_fields is not None: + data_sample.pred_fields = pred_fields + + return batch_data_samples diff --git a/mmpose/models/pose_estimators/topdown.py b/mmpose/models/pose_estimators/topdown.py index 0aacdffab1..521827ff2a 100644 --- a/mmpose/models/pose_estimators/topdown.py +++ b/mmpose/models/pose_estimators/topdown.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from itertools import zip_longest -from typing import Optional, Tuple +from typing import Optional from torch import Tensor @@ -24,13 +24,17 @@ class TopdownPoseEstimator(BasePoseEstimator): Defaults to ``None`` data_preprocessor (dict, optional): The data preprocessing config to build the instance of :class:`BaseDataPreprocessor`. Defaults to - ``None``. + ``None`` init_cfg (dict, optional): The config to control the initialization. 
Defaults to ``None`` + metainfo (dict): Meta information for dataset, such as keypoints + definition and properties. If set, the metainfo of the input data + batch will be overridden. For more details, please refer to + https://mmpose.readthedocs.io/en/1.x/user_guides/ + prepare_datasets.html#create-a-custom-dataset-info- + config-file-for-the-dataset. Defaults to ``None`` """ - _version = 2 - def __init__(self, backbone: ConfigType, neck: OptConfigType = None, @@ -38,56 +42,17 @@ def __init__(self, train_cfg: OptConfigType = None, test_cfg: OptConfigType = None, data_preprocessor: OptConfigType = None, - init_cfg: OptMultiConfig = None): - super().__init__(data_preprocessor, init_cfg) - - self.backbone = MODELS.build(backbone) - - if neck is not None: - self.neck = MODELS.build(neck) - - if head is not None: - self.head = MODELS.build(head) - - self.train_cfg = train_cfg if train_cfg else {} - self.test_cfg = test_cfg if test_cfg else {} - - # Register the hook to automatically convert old version state dicts - self._register_load_state_dict_pre_hook(self._load_state_dict_pre_hook) - - def extract_feat(self, inputs: Tensor) -> Tuple[Tensor]: - """Extract features. - - Args: - inputs (Tensor): Image tensor with shape (N, C, H ,W). - - Returns: - tuple[Tensor]: Multi-level features that may have various - resolutions. - """ - x = self.backbone(inputs) - if self.with_neck: - x = self.neck(x) - - return x - - def _forward(self, inputs: Tensor): - """Network forward process. Usually includes backbone, neck and head - forward without any post-processing. - - Args: - inputs (Tensor): Inputs with shape (N, C, H, W). - - Returns: - tuple: A tuple of features from ``rpn_head`` and ``roi_head`` - forward. 
- """ - - x = self.extract_feat(inputs) - if self.with_head: - x = self.head.forward(x) - - return x + init_cfg: OptMultiConfig = None, + metainfo: Optional[dict] = None): + super().__init__( + backbone=backbone, + neck=neck, + head=head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg, + metainfo=metainfo) def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: """Calculate losses from a batch of inputs and data samples. @@ -143,7 +108,7 @@ def predict(self, inputs: Tensor, data_samples: SampleList) -> SampleList: preds = self.head.predict(feats, data_samples, test_cfg=self.test_cfg) - if isinstance(preds, Tuple): + if isinstance(preds, tuple): batch_pred_instances, batch_pred_fields = preds else: batch_pred_instances = preds @@ -165,19 +130,16 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, batch_pred_fields (List[PixelData], optional): The predicted fields (e.g. heatmaps) of the input batch batch_data_samples (List[PoseDataSample]): The input data batch - merge (bool): Whether merge all predictions into a single - `PoseDataSample`. This is useful when the input batch is - instances (bboxes) from the same image. Defaults to ``False`` Returns: List[PoseDataSample]: A list of data samples where the predictions are stored in the ``pred_instances`` field of each data sample. - The length of the list is the batch size when ``merge==False``, or - 1 when ``merge==True``. 
""" assert len(batch_pred_instances) == len(batch_data_samples) if batch_pred_fields is None: batch_pred_fields = [] + output_keypoint_indices = self.test_cfg.get('output_keypoint_indices', + None) for pred_instances, pred_fields, data_sample in zip_longest( batch_pred_instances, batch_pred_fields, batch_data_samples): @@ -192,6 +154,14 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, pred_instances.keypoints = pred_instances.keypoints / input_size \ * bbox_scales + bbox_centers - 0.5 * bbox_scales + if output_keypoint_indices is not None: + # select output keypoints with given indices + num_keypoints = pred_instances.keypoints.shape[1] + for key, value in pred_instances.all_items(): + if key.startswith('keypoint'): + pred_instances.set_field( + value[:, output_keypoint_indices], key) + # add bbox information into pred_instances pred_instances.bboxes = gt_instances.bboxes pred_instances.bbox_scores = gt_instances.bbox_scores @@ -199,26 +169,14 @@ def add_pred_to_datasample(self, batch_pred_instances: InstanceList, data_sample.pred_instances = pred_instances if pred_fields is not None: + if output_keypoint_indices is not None: + # select output heatmap channels with keypoint indices + # when the number of heatmap channel matches num_keypoints + for key, value in pred_fields.all_items(): + if value.shape[0] != num_keypoints: + continue + pred_fields.set_field(value[output_keypoint_indices], + key) data_sample.pred_fields = pred_fields return batch_data_samples - - def _load_state_dict_pre_hook(self, state_dict, prefix, local_meta, *args, - **kwargs): - """A hook function to convert old-version state dict of - :class:`TopdownHeatmapSimpleHead` (before MMPose v1.0.0) to a - compatible format of :class:`HeatmapHead`. - - The hook will be automatically registered during initialization. 
- """ - version = local_meta.get('version', None) - if version and version >= self._version: - return - - # convert old-version state dict - keys = list(state_dict.keys()) - for k in keys: - if 'keypoint_head' in k: - v = state_dict.pop(k) - k = k.replace('keypoint_head', 'head') - state_dict[k] = v diff --git a/mmpose/models/utils/__init__.py b/mmpose/models/utils/__init__.py index d6f260d37c..730d43aca0 100644 --- a/mmpose/models/utils/__init__.py +++ b/mmpose/models/utils/__init__.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .ckpt_convert import pvt_convert +from .rtmcc_block import RTMCCBlock, rope from .transformer import PatchEmbed, nchw_to_nlc, nlc_to_nchw -__all__ = ['PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'pvt_convert'] +__all__ = [ + 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'pvt_convert', 'RTMCCBlock', + 'rope' +] diff --git a/mmpose/models/utils/rtmcc_block.py b/mmpose/models/utils/rtmcc_block.py new file mode 100644 index 0000000000..0e317376b2 --- /dev/null +++ b/mmpose/models/utils/rtmcc_block.py @@ -0,0 +1,299 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn.bricks import DropPath +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION + + +def rope(x, dim): + """Applies Rotary Position Embedding to input tensor. + + Args: + x (torch.Tensor): Input tensor. + dim (int | list[int]): The spatial dimension(s) to apply + rotary position embedding. + + Returns: + torch.Tensor: The tensor after applying rotary position + embedding. 
+ + Reference: + `RoFormer: Enhanced Transformer with Rotary + Position Embedding `_ + """ + shape = x.shape + if isinstance(dim, int): + dim = [dim] + + spatial_shape = [shape[i] for i in dim] + total_len = 1 + for i in spatial_shape: + total_len *= i + + position = torch.reshape( + torch.arange(total_len, dtype=torch.int, device=x.device), + spatial_shape) + + for i in range(dim[-1] + 1, len(shape) - 1, 1): + position = torch.unsqueeze(position, dim=-1) + + half_size = shape[-1] // 2 + freq_seq = -torch.arange( + half_size, dtype=torch.int, device=x.device) / float(half_size) + inv_freq = 10000**-freq_seq + + sinusoid = position[..., None] * inv_freq[None, None, :] + + sin = torch.sin(sinusoid) + cos = torch.cos(sinusoid) + x1, x2 = torch.chunk(x, 2, dim=-1) + + return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1) + + +class Scale(nn.Module): + """Scale vector by element multiplications. + + Args: + dim (int): The dimension of the scale vector. + init_value (float, optional): The initial value of the scale vector. + Defaults to 1.0. + trainable (bool, optional): Whether the scale vector is trainable. + Defaults to True. + """ + + def __init__(self, dim, init_value=1., trainable=True): + super().__init__() + self.scale = nn.Parameter( + init_value * torch.ones(dim), requires_grad=trainable) + + def forward(self, x): + """Forward function.""" + + return x * self.scale + + +class ScaleNorm(nn.Module): + """Scale Norm. + + Args: + dim (int): The dimension of the scale vector. + eps (float, optional): The minimum value in clamp. Defaults to 1e-5. + + Reference: + `Transformers without Tears: Improving the Normalization + of Self-Attention `_ + """ + + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: The tensor after applying scale norm. 
+ """ + + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RTMCCBlock(nn.Module): + """Gated Attention Unit (GAU) in RTMBlock. + + Args: + num_token (int): The number of tokens. + in_token_dims (int): The input token dimension. + out_token_dims (int): The output token dimension. + expansion_factor (int, optional): The expansion factor of the + intermediate token dimension. Defaults to 2. + s (int, optional): The self-attention feature dimension. + Defaults to 128. + eps (float, optional): The minimum value in clamp. Defaults to 1e-5. + dropout_rate (float, optional): The dropout rate. Defaults to 0.0. + drop_path (float, optional): The drop path rate. Defaults to 0.0. + attn_type (str, optional): Type of attention which should be one of + the following options: + + - 'self-attn': Self-attention. + - 'cross-attn': Cross-attention. + + Defaults to 'self-attn'. + act_fn (str, optional): The activation function which should be one + of the following options: + + - 'ReLU': ReLU activation. + - 'SiLU': SiLU activation. + + Defaults to 'SiLU'. + bias (bool, optional): Whether to use bias in linear layers. + Defaults to False. + use_rel_bias (bool, optional): Whether to use relative bias. + Defaults to True. + pos_enc (bool, optional): Whether to use rotary position + embedding. Defaults to False. + + Reference: + `Transformer Quality in Linear Time + `_ + """ + + def __init__(self, + num_token, + in_token_dims, + out_token_dims, + expansion_factor=2, + s=128, + eps=1e-5, + dropout_rate=0., + drop_path=0., + attn_type='self-attn', + act_fn='SiLU', + bias=False, + use_rel_bias=True, + pos_enc=False): + + super(RTMCCBlock, self).__init__() + self.s = s + self.num_token = num_token + self.use_rel_bias = use_rel_bias + self.attn_type = attn_type + self.pos_enc = pos_enc + self.drop_path = DropPath(drop_path) \ + if drop_path > 0. 
else nn.Identity() + + self.e = int(in_token_dims * expansion_factor) + if use_rel_bias: + if attn_type == 'self-attn': + self.w = nn.Parameter( + torch.rand([2 * num_token - 1], dtype=torch.float)) + else: + self.a = nn.Parameter(torch.rand([1, s], dtype=torch.float)) + self.b = nn.Parameter(torch.rand([1, s], dtype=torch.float)) + self.o = nn.Linear(self.e, out_token_dims, bias=bias) + + if attn_type == 'self-attn': + self.uv = nn.Linear(in_token_dims, 2 * self.e + self.s, bias=bias) + self.gamma = nn.Parameter(torch.rand((2, self.s))) + self.beta = nn.Parameter(torch.rand((2, self.s))) + else: + self.uv = nn.Linear(in_token_dims, self.e + self.s, bias=bias) + self.k_fc = nn.Linear(in_token_dims, self.s, bias=bias) + self.v_fc = nn.Linear(in_token_dims, self.e, bias=bias) + nn.init.xavier_uniform_(self.k_fc.weight) + nn.init.xavier_uniform_(self.v_fc.weight) + + self.ln = ScaleNorm(in_token_dims, eps=eps) + + nn.init.xavier_uniform_(self.uv.weight) + + if act_fn == 'SiLU': + assert digit_version(TORCH_VERSION) >= digit_version('1.7.0'), \ + 'SiLU activation requires PyTorch version >= 1.7' + + self.act_fn = nn.SiLU(True) + else: + self.act_fn = nn.ReLU(True) + + if in_token_dims == out_token_dims: + self.shortcut = True + self.res_scale = Scale(in_token_dims) + else: + self.shortcut = False + + self.sqrt_s = math.sqrt(s) + + self.dropout_rate = dropout_rate + + if dropout_rate > 0.: + self.dropout = nn.Dropout(dropout_rate) + + def rel_pos_bias(self, seq_len, k_len=None): + """Add relative position bias.""" + + if self.attn_type == 'self-attn': + t = F.pad(self.w[:2 * seq_len - 1], [0, seq_len]).repeat(seq_len) + t = t[..., :-seq_len].reshape(-1, seq_len, 3 * seq_len - 2) + r = (2 * seq_len - 1) // 2 + t = t[..., r:-r] + else: + a = rope(self.a.repeat(seq_len, 1), dim=0) + b = rope(self.b.repeat(k_len, 1), dim=0) + t = torch.bmm(a, b.permute(0, 2, 1)) + return t + + def _forward(self, inputs): + """GAU Forward function.""" + + if self.attn_type == 'self-attn': + 
x = inputs + else: + x, k, v = inputs + + x = self.ln(x) + + uv = self.uv(x) + + if self.attn_type == 'self-attn': + u, v, base = torch.split( + self.act_fn(uv), [self.e, self.e, self.s], dim=-1) + + base = base.unsqueeze(2) * self.gamma[None, None, :] + self.beta + + if self.pos_enc: + base = rope(base, dim=1) + + q, k = torch.unbind(base, dim=-2) + + else: + u, q = torch.split(self.act_fn(uv), [self.e, self.s], dim=-1) + + k = self.k_fc(k) + v = self.v_fc(v) + + if self.pos_enc: + q = rope(q, 1) + k = rope(k, 1) + + qk = torch.bmm(q, k.permute(0, 2, 1)) + + if self.use_rel_bias: + if self.attn_type == 'self-attn': + bias = self.rel_pos_bias(q.size(1)) + else: + bias = self.rel_pos_bias(q.size(1), k.size(1)) + qk += bias[:, :q.size(1), :k.size(1)] + + kernel = torch.square(F.relu(qk / self.sqrt_s)) + + if self.dropout_rate > 0.: + kernel = self.dropout(kernel) + + x = u * torch.bmm(kernel, v) + x = self.o(x) + + return x + + def forward(self, x): + """Forward function.""" + + if self.shortcut: + if self.attn_type == 'cross-attn': + res_shortcut = x[0] + else: + res_shortcut = x + main_branch = self.drop_path(self._forward(x)) + return self.res_scale(res_shortcut) + main_branch + else: + return self.drop_path(self._forward(x)) diff --git a/mmpose/models/utils/transformer.py b/mmpose/models/utils/transformer.py index d33d696ad2..103b9e9970 100644 --- a/mmpose/models/utils/transformer.py +++ b/mmpose/models/utils/transformer.py @@ -87,6 +87,8 @@ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): self.dilation = dilation def get_pad_shape(self, input_shape): + """Get horizontal and vertical padding shapes.""" + input_h, input_w = input_shape kernel_h, kernel_w = self.kernel_size stride_h, stride_w = self.stride @@ -99,6 +101,8 @@ def get_pad_shape(self, input_shape): return pad_h, pad_w def forward(self, x): + """Forward function.""" + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) if pad_h > 0 or pad_w > 0: if self.padding == 'corner': diff 
--git a/mmpose/models/utils/tta.py b/mmpose/models/utils/tta.py index c3ef43070e..0add48a422 100644 --- a/mmpose/models/utils/tta.py +++ b/mmpose/models/utils/tta.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Tuple +from typing import List, Optional, Tuple +import torch +import torch.nn.functional as F from torch import Tensor def flip_heatmaps(heatmaps: Tensor, - flip_indices: List[int], + flip_indices: Optional[List[int]] = None, flip_mode: str = 'heatmap', shift_heatmap: bool = True): """Flip heatmaps for test-time augmentation. @@ -14,13 +16,16 @@ def flip_heatmaps(heatmaps: Tensor, heatmaps (Tensor): The heatmaps to flip. Should be a tensor in shape [B, C, H, W] flip_indices (List[int]): The indices of each keypoint's symmetric - keypoint + keypoint. Defaults to ``None`` flip_mode (str): Specify the flipping mode. Options are: - ``'heatmap'``: horizontally flip the heatmaps and swap heatmaps of symmetric keypoints according to ``flip_indices`` - ``'udp_combined'``: similar to ``'heatmap'`` mode but further flip the x_offset values + - ``'offset'``: horizontally flip the offset fields and swap + heatmaps of symmetric keypoints according to + ``flip_indices``. x_offset values are also reversed shift_heatmap (bool): Shift the flipped heatmaps to align with the original heatmaps and improve accuracy. 
Defaults to ``True`` @@ -29,16 +34,30 @@ def flip_heatmaps(heatmaps: Tensor, """ if flip_mode == 'heatmap': - assert len(flip_indices) == heatmaps.shape[1] - heatmaps = heatmaps[:, flip_indices].flip(-1) - + heatmaps = heatmaps.flip(-1) + if flip_indices is not None: + assert len(flip_indices) == heatmaps.shape[1] + heatmaps = heatmaps[:, flip_indices] elif flip_mode == 'udp_combined': B, C, H, W = heatmaps.shape - assert len(flip_indices) * 3 == C heatmaps = heatmaps.view(B, C // 3, 3, H, W) - heatmaps = heatmaps[:, flip_indices].flip(-1) + heatmaps = heatmaps.flip(-1) + if flip_indices is not None: + assert len(flip_indices) == C // 3 + heatmaps = heatmaps[:, flip_indices] heatmaps[:, :, 1] = -heatmaps[:, :, 1] heatmaps = heatmaps.view(B, C, H, W) + + elif flip_mode == 'offset': + B, C, H, W = heatmaps.shape + heatmaps = heatmaps.view(B, C // 2, -1, H, W) + heatmaps = heatmaps.flip(-1) + if flip_indices is not None: + assert len(flip_indices) == C // 2 + heatmaps = heatmaps[:, flip_indices] + heatmaps[:, :, 0] = -heatmaps[:, :, 0] + heatmaps = heatmaps.view(B, C, H, W) + else: raise ValueError(f'Invalid flip_mode value "{flip_mode}"') @@ -93,3 +112,57 @@ def flip_coordinates(coords: Tensor, flip_indices: List[int], coords = coords[:, flip_indices] return coords + + +def aggregate_heatmaps(heatmaps: List[Tensor], + size: Optional[Tuple[int, int]], + align_corners: bool = False, + mode: str = 'average'): + """Aggregate multiple heatmaps. + + Args: + heatmaps (List[Tensor]): Multiple heatmaps to aggregate. Each should + be in shape (B, C, H, W) + size (Tuple[int, int], optional): The target size in (w, h). All + heatmaps will be resized to the target size. If not given, the + first heatmap tensor's width and height will be used as the target + size. Defaults to ``None`` + align_corners (bool): Whether align corners when resizing heatmaps. + Defaults to ``False`` + mode (str): Aggregation mode in one of the following: + + - ``'average'``: Get average of heatmaps. 
All heatmaps mush have + the same channel number + - ``'concat'``: Concate the heatmaps at the channel dim + """ + + if mode not in {'average', 'concat'}: + raise ValueError(f'Invalid aggregation mode `{mode}`') + + if size is None: + h, w = heatmaps[0].shape[2:4] + else: + w, h = size + + for i, _heatmaps in enumerate(heatmaps): + assert _heatmaps.ndim == 4 + if mode == 'average': + assert _heatmaps.shape[:2] == heatmaps[0].shape[:2] + else: + assert _heatmaps.shape[0] == heatmaps[0].shape[0] + + if _heatmaps.shape[2:4] != (h, w): + heatmaps[i] = F.interpolate( + _heatmaps, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + + if mode == 'average': + output = sum(heatmaps).div(len(heatmaps)) + elif mode == 'concat': + output = torch.cat(heatmaps, dim=1) + else: + raise ValueError() + + return output diff --git a/mmpose/registry.py b/mmpose/registry.py index 310f5da257..f1c080565f 100644 --- a/mmpose/registry.py +++ b/mmpose/registry.py @@ -7,32 +7,128 @@ https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
""" +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS +from mmengine.registry import LOOPS as MMENGINE_LOOPS from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS from mmengine.registry import Registry +# Registries For Runner and the related +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry('runner', parent=MMENGINE_RUNNERS) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', parent=MMENGINE_RUNNER_CONSTRUCTORS) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop', parent=MMENGINE_LOOPS) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 
'hook', parent=MMENGINE_HOOKS, locations=['mmpose.engine.hooks']) + +# Registries For Data and the related # manage data-related modules -DATASETS = Registry('dataset', parent=MMENGINE_DATASETS) -TRANSFORMS = Registry('transform', parent=MMENGINE_TRANSFORMS) +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmpose.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmpose.datasets.samplers']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmpose.datasets.transforms']) # manage all kinds of modules inheriting `nn.Module` -MODELS = Registry('model', parent=MMENGINE_MODELS) +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmpose.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmpose.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmpose.models']) +# manage all kinds of batch augmentations like Mixup and CutMix. +BATCH_AUGMENTS = Registry('batch augment', locations=['mmpose.models']) + +# Registries For Optimizer and the related +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmpose.engine.optimizers']) +# manage optimizer wrapper +OPTIM_WRAPPERS = Registry( + 'optimizer_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmpose.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. 
+OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer wrapper constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmpose.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmpose.engine.schedulers']) + +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmpose.evaluation.metrics']) +# manage all kinds of evaluators +EVALUATORS = Registry( + 'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmpose.evaluation']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmpose.models']) +# Registries For Visualizer and the related # manage visualizer -VISUALIZERS = Registry('visualizer', parent=MMENGINE_VISUALIZERS) +VISUALIZERS = Registry( + 'visualizer', + parent=MMENGINE_VISUALIZERS, + locations=['mmpose.visualization']) # manage visualizer backend -VISBACKENDS = Registry('vis_backend', parent=MMENGINE_VISBACKENDS) +VISBACKENDS = Registry( + 'vis_backend', + parent=MMENGINE_VISBACKENDS, + locations=['mmpose.visualization']) -# manage all kinds of metrics -METRICS = Registry('metric', parent=MMENGINE_METRICS) +# manage all kinds log processors +LOG_PROCESSORS = Registry( + 'log processor', + parent=MMENGINE_LOG_PROCESSORS, + locations=['mmpose.visualization']) # manager keypoint encoder/decoder -KEYPOINT_CODECS = Registry('KEYPOINT_CODECS') +KEYPOINT_CODECS = Registry('KEYPOINT_CODECS', locations=['mmpose.codecs']) -# manage all kinds of hooks like `CheckpointHook` -HOOKS = Registry('hook', parent=MMENGINE_HOOKS) +# manage inferencer +INFERENCERS = Registry( + 'inferencer', + parent=MMENGINE_INFERENCERS, + locations=['mmpose.apis.inferencers']) diff --git a/mmpose/structures/__init__.py b/mmpose/structures/__init__.py index 28718c0b99..e4384af1cd 100644 --- 
a/mmpose/structures/__init__.py +++ b/mmpose/structures/__init__.py @@ -5,11 +5,11 @@ from .keypoint import flip_keypoints from .multilevel_pixel_data import MultilevelPixelData from .pose_data_sample import PoseDataSample -from .utils import merge_data_samples, revert_heatmap +from .utils import merge_data_samples, revert_heatmap, split_instances __all__ = [ 'PoseDataSample', 'MultilevelPixelData', 'bbox_cs2xywh', 'bbox_cs2xyxy', 'bbox_xywh2cs', 'bbox_xywh2xyxy', 'bbox_xyxy2cs', 'bbox_xyxy2xywh', 'flip_bbox', 'get_udp_warp_matrix', 'get_warp_matrix', 'flip_keypoints', - 'merge_data_samples', 'revert_heatmap' + 'merge_data_samples', 'revert_heatmap', 'split_instances' ] diff --git a/mmpose/structures/keypoint/transforms.py b/mmpose/structures/keypoint/transforms.py index d35e22d8b6..99adaa1306 100644 --- a/mmpose/structures/keypoint/transforms.py +++ b/mmpose/structures/keypoint/transforms.py @@ -38,7 +38,7 @@ def flip_keypoints(keypoints: np.ndarray, """ assert keypoints.shape[:-1] == keypoints_visible.shape, ( - f'Unmatched shapes of keypoints {keypoints.shape} and ' + f'Mismatched shapes of keypoints {keypoints.shape} and ' f'keypoints_visible {keypoints_visible.shape}') direction_options = {'horizontal', 'vertical', 'diagonal'} diff --git a/mmpose/structures/utils.py b/mmpose/structures/utils.py index 413e9f5cf6..882cda8603 100644 --- a/mmpose/structures/utils.py +++ b/mmpose/structures/utils.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import warnings from typing import List import cv2 @@ -31,7 +32,9 @@ def merge_data_samples(data_samples: List[PoseDataSample]) -> PoseDataSample: raise ValueError('Invalid input type, should be a list of ' ':obj:`PoseDataSample`') - assert len(data_samples) > 0 + if len(data_samples) == 0: + warnings.warn('Try to merge an empty list of data samples.') + return PoseDataSample() merged = PoseDataSample(metainfo=data_samples[0].metainfo) @@ -110,3 +113,26 @@ def revert_heatmap(heatmap, bbox_center, bbox_scale, img_shape): heatmap = heatmap.transpose(2, 0, 1) return heatmap + + +def split_instances(instances: InstanceData) -> List[InstanceData]: + """Convert instances into a list where each element is a dict that contains + information about one instance.""" + results = [] + + # return an empty list if there is no instance detected by the model + if instances is None: + return results + + for i in range(len(instances.keypoints)): + result = dict( + keypoints=instances.keypoints[i].tolist(), + keypoint_scores=instances.keypoint_scores[i].tolist(), + ) + if 'bboxes' in instances: + result['bbox'] = instances.bboxes[i].tolist(), + if 'bbox_scores' in instances: + result['bbox_score'] = instances.bbox_scores[i] + results.append(result) + + return results diff --git a/mmpose/testing/__init__.py b/mmpose/testing/__init__.py index d6f194d638..5612dac6c6 100644 --- a/mmpose/testing/__init__.py +++ b/mmpose/testing/__init__.py @@ -1,4 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from ._utils import get_coco_sample, get_packed_inputs +from ._utils import (get_coco_sample, get_config_file, get_packed_inputs, + get_pose_estimator_cfg, get_repo_dir) -__all__ = ['get_packed_inputs', 'get_coco_sample'] +__all__ = [ + 'get_packed_inputs', 'get_coco_sample', 'get_config_file', + 'get_pose_estimator_cfg', 'get_repo_dir' +] diff --git a/mmpose/testing/_utils.py b/mmpose/testing/_utils.py index 0758daa674..1908129be8 100644 --- a/mmpose/testing/_utils.py +++ b/mmpose/testing/_utils.py @@ -1,8 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from copy import deepcopy from typing import Optional import numpy as np import torch +from mmengine.config import Config +from mmengine.dataset import pseudo_collate from mmengine.structures import InstanceData, PixelData from mmpose.structures import MultilevelPixelData, PoseDataSample @@ -53,6 +57,7 @@ def get_coco_sample( data = { 'img': img, 'img_shape': img_shape, + 'ori_shape': img_shape, 'bbox': bbox, 'keypoints': keypoints, 'keypoints_visible': keypoints_visible, @@ -77,7 +82,7 @@ def get_packed_inputs(batch_size=2, num_instances=1, num_keypoints=17, num_levels=1, - img_shape=(128, 128), + img_shape=(256, 192), input_size=(192, 256), heatmap_size=(48, 64), simcc_split_ratio=2.0, @@ -87,7 +92,7 @@ def get_packed_inputs(batch_size=2, """Create a dummy batch of model inputs and data samples.""" rng = np.random.RandomState(0) - packed_inputs = [] + inputs_list = [] for idx in range(batch_size): inputs = dict() @@ -114,7 +119,6 @@ def get_packed_inputs(batch_size=2, # gt_instance gt_instances = InstanceData() gt_instance_labels = InstanceData() - bboxes = _rand_bboxes(rng, num_instances, w, h) bbox_centers, bbox_scales = bbox_xyxy2cs(bboxes) @@ -175,9 +179,10 @@ def get_packed_inputs(batch_size=2, data_sample.gt_instances = gt_instances data_sample.gt_instance_labels = gt_instance_labels - inputs['data_sample'] = data_sample - packed_inputs.append(inputs) + 
inputs['data_samples'] = data_sample + inputs_list.append(inputs) + packed_inputs = pseudo_collate(inputs_list) return packed_inputs @@ -196,7 +201,8 @@ def _rand_simcc_label(rng, num_instances, num_keypoints, len_feats): def _rand_bboxes(rng, num_instances, img_w, img_h): - cx, cy, bw, bh = rng.rand(num_instances, 4).T + cx, cy = rng.rand(num_instances, 2).T + bw, bh = 0.2 + 0.8 * rng.rand(num_instances, 2).T tl_x = ((cx * img_w) - (img_w * bw / 2)).clip(0, img_w) tl_y = ((cy * img_h) - (img_h * bh / 2)).clip(0, img_h) @@ -205,3 +211,38 @@ def _rand_bboxes(rng, num_instances, img_w, img_h): bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T return bboxes + + +def get_repo_dir(): + """Return the path of the MMPose repo directory.""" + try: + # Assume the function in invoked is the source mmpose repo + repo_dir = osp.dirname(osp.dirname(osp.dirname(__file__))) + except NameError: + # For IPython development when __file__ is not defined + import mmpose + repo_dir = osp.dirname(osp.dirname(mmpose.__file__)) + + return repo_dir + + +def get_config_file(fn: str): + """Return full path of a config file from the given relative path.""" + repo_dir = get_repo_dir() + if fn.startswith('configs'): + fn_config = osp.join(repo_dir, fn) + else: + fn_config = osp.join(repo_dir, 'configs', fn) + + if not osp.isfile(fn_config): + raise FileNotFoundError(f'Cannot find config file {fn_config}') + + return fn_config + + +def get_pose_estimator_cfg(fn: str): + """Load model config from a config file.""" + + fn_config = get_config_file(fn) + config = Config.fromfile(fn_config) + return deepcopy(config.model) diff --git a/mmpose/utils/__init__.py b/mmpose/utils/__init__.py index 044bb286c4..c48ca01cea 100644 --- a/mmpose/utils/__init__.py +++ b/mmpose/utils/__init__.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .camera import SimpleCamera, SimpleCameraTorch from .collect_env import collect_env +from .config_utils import adapt_mmdet_pipeline from .logger import get_root_logger from .setup_env import register_all_modules, setup_multi_processes from .timer import StopWatch __all__ = [ 'get_root_logger', 'collect_env', 'StopWatch', 'setup_multi_processes', - 'register_all_modules', 'SimpleCamera', 'SimpleCameraTorch' + 'register_all_modules', 'SimpleCamera', 'SimpleCameraTorch', + 'adapt_mmdet_pipeline' ] diff --git a/mmpose/utils/config_utils.py b/mmpose/utils/config_utils.py new file mode 100644 index 0000000000..2f54d2ef24 --- /dev/null +++ b/mmpose/utils/config_utils.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmpose.utils.typing import ConfigDict + + +def adapt_mmdet_pipeline(cfg: ConfigDict) -> ConfigDict: + """Converts pipeline types in MMDetection's test dataloader to use the + 'mmdet' namespace. + + Args: + cfg (ConfigDict): Configuration dictionary for MMDetection. + + Returns: + ConfigDict: Configuration dictionary with updated pipeline types. + """ + # use lazy import to avoid hard dependence on mmdet + from mmdet.datasets import transforms + + if 'test_dataloader' not in cfg: + return cfg + + pipeline = cfg.test_dataloader.dataset.pipeline + for trans in pipeline: + if trans['type'] in dir(transforms): + trans['type'] = 'mmdet.' + trans['type'] + + return cfg diff --git a/mmpose/utils/tensor_utils.py b/mmpose/utils/tensor_utils.py index ceab5d141f..1be73f8991 100644 --- a/mmpose/utils/tensor_utils.py +++ b/mmpose/utils/tensor_utils.py @@ -18,7 +18,7 @@ def to_numpy(x: Union[Tensor, Sequence[Tensor]], tensors return_device (bool): Whether return the tensor device. Defaults to ``False`` - unzip (bool): Whether unzip the input sequence. Ddfaults to ``False`` + unzip (bool): Whether unzip the input sequence. 
Defaults to ``False`` Returns: np.ndarray | tuple: If ``return_device`` is ``True``, return a tuple diff --git a/mmpose/utils/typing.py b/mmpose/utils/typing.py index b4039d3f05..557891b3b9 100644 --- a/mmpose/utils/typing.py +++ b/mmpose/utils/typing.py @@ -25,4 +25,5 @@ # Type hint of features # - Tuple[Tensor]: multi-level features extracted by the network # - List[Tuple[Tensor]]: multiple feature pyramids for TTA -Features = Union[Tuple[Tensor], List[Tuple[Tensor]]] +# - List[List[Tuple[Tensor]]]: multi-scale feature pyramids +Features = Union[Tuple[Tensor], List[Tuple[Tensor]], List[List[Tuple[Tensor]]]] diff --git a/mmpose/version.py b/mmpose/version.py index 85e350f846..390f9399d3 100644 --- a/mmpose/version.py +++ b/mmpose/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. -__version__ = '1.0.0rc0' +__version__ = '1.0.0rc1' short_version = __version__ diff --git a/mmpose/visualization/local_visualizer.py b/mmpose/visualization/local_visualizer.py index 3c4a3ed4e3..1743ef7997 100644 --- a/mmpose/visualization/local_visualizer.py +++ b/mmpose/visualization/local_visualizer.py @@ -12,6 +12,7 @@ from mmpose.registry import VISUALIZERS from mmpose.structures import PoseDataSample +from .simcc_vis import SimCCVisualizer def _get_adaptive_scales(areas: np.ndarray, @@ -174,7 +175,7 @@ def _draw_instances_bbox(self, image: np.ndarray, return self.get_image() if 'labels' in instances and self.text_color is not None: - classes = self.dataset_meta.get('CLASSES', None) + classes = self.dataset_meta.get('classes', None) labels = instances.labels positions = bboxes[:, :2] @@ -210,7 +211,8 @@ def _draw_instances_bbox(self, image: np.ndarray, def _draw_instances_kpts(self, image: np.ndarray, instances: InstanceData, - kpt_score_thr: float = 0.3): + kpt_score_thr: float = 0.3, + show_kpt_idx: bool = False): """Draw keypoints and skeletons (optional) of GT or prediction. 
Args: @@ -275,6 +277,14 @@ def _draw_instances_kpts(self, edge_colors=color, alpha=transparency, line_widths=self.radius) + if show_kpt_idx: + self.draw_texts( + str(kid), + kpt, + colors=color, + font_sizes=self.radius * 3, + vertical_alignments='bottom', + horizontal_alignments='center') # draw links if self.skeleton is not None and self.link_color is not None: @@ -362,6 +372,34 @@ def _draw_instance_heatmap( out_image = self.draw_featmap(heatmaps, overlaid_image) return out_image + def _draw_instance_xy_heatmap( + self, + fields: PixelData, + overlaid_image: Optional[np.ndarray] = None, + n: int = 20, + ): + """Draw heatmaps of GT or prediction. + + Args: + fields (:obj:`PixelData`): Data structure for + pixel-level annotations or predictions. + overlaid_image (np.ndarray): The image to draw. + n (int): Number of keypoint, up to 20. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + if 'heatmaps' not in fields: + return None + heatmaps = fields.heatmaps + _, h, w = heatmaps.shape + if isinstance(heatmaps, np.ndarray): + heatmaps = torch.from_numpy(heatmaps) + out_image = SimCCVisualizer().draw_instance_xy_heatmap( + heatmaps, overlaid_image, n) + out_image = cv2.resize(out_image[:, :, ::-1], (w, h)) + return out_image + @master_only def add_datasample(self, name: str, @@ -371,6 +409,7 @@ def add_datasample(self, draw_pred: bool = True, draw_heatmap: bool = False, draw_bbox: bool = False, + show_kpt_idx: bool = False, show: bool = False, wait_time: float = 0, out_file: Optional[str] = None, @@ -419,7 +458,8 @@ def add_datasample(self, # draw bboxes & keypoints if 'gt_instances' in data_sample: gt_img_data = self._draw_instances_kpts( - gt_img_data, data_sample.gt_instances, kpt_score_thr) + gt_img_data, data_sample.gt_instances, kpt_score_thr, + show_kpt_idx) if draw_bbox: gt_img_data = self._draw_instances_bbox( gt_img_data, data_sample.gt_instances) @@ -439,15 +479,20 @@ def add_datasample(self, # draw bboxes & keypoints if 
'pred_instances' in data_sample: pred_img_data = self._draw_instances_kpts( - pred_img_data, data_sample.pred_instances, kpt_score_thr) + pred_img_data, data_sample.pred_instances, kpt_score_thr, + show_kpt_idx) if draw_bbox: pred_img_data = self._draw_instances_bbox( pred_img_data, data_sample.pred_instances) # draw heatmaps if 'pred_fields' in data_sample and draw_heatmap: - pred_img_heatmap = self._draw_instance_heatmap( - data_sample.pred_fields, image) + if 'keypoint_x_labels' in data_sample.pred_instances: + pred_img_heatmap = self._draw_instance_xy_heatmap( + data_sample.pred_fields, image) + else: + pred_img_heatmap = self._draw_instance_heatmap( + data_sample.pred_fields, image) if pred_img_heatmap is not None: pred_img_data = np.concatenate( (pred_img_data, pred_img_heatmap), axis=0) @@ -479,3 +524,5 @@ def add_datasample(self, else: # save drawn_img to backends self.add_image(name, drawn_img, step) + + return self.get_image() diff --git a/mmpose/visualization/simcc_vis.py b/mmpose/visualization/simcc_vis.py new file mode 100644 index 0000000000..c2950934a6 --- /dev/null +++ b/mmpose/visualization/simcc_vis.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import cv2 as cv +import numpy as np +import torch +from torchvision.transforms import ToPILImage + + +class SimCCVisualizer: + + def draw_instance_xy_heatmap(self, + heatmap: torch.Tensor, + overlaid_image: Optional[np.ndarray], + n: int = 20, + mix: bool = True, + weight: float = 0.5): + """Draw heatmaps of GT or prediction. + + Args: + heatmap (torch.Tensor): Tensor of heatmap. + overlaid_image (np.ndarray): The image to draw. + n (int): Number of keypoint, up to 20. + mix (bool):Whether to merge heatmap and original image. + weight (float): Weight of original image during fusion. + + Returns: + np.ndarray: the drawn image which channel is RGB. 
+ """ + heatmap2d = heatmap.data.max(0, keepdim=True)[0] + xy_heatmap, K = self.split_simcc_xy(heatmap) + K = K if K <= n else n + blank_size = tuple(heatmap.size()[1:]) + maps = {'x': [], 'y': []} + for i in xy_heatmap: + x, y = self.draw_1d_heatmaps(i['x']), self.draw_1d_heatmaps(i['y']) + maps['x'].append(x) + maps['y'].append(y) + white = self.creat_blank(blank_size) + map2d = self.draw_2d_heatmaps(heatmap2d) + if mix: + map2d = cv.addWeighted(overlaid_image, 1 - weight, map2d, weight, + 0) + self.image_cover(white, map2d, int(blank_size[1] * 0.1), + int(blank_size[0] * 0.1)) + white = self.add_1d_heatmaps(maps, white, blank_size, K) + return white + + def split_simcc_xy(self, heatmap: Union[np.ndarray, torch.Tensor]): + """Extract one-dimensional heatmap from two-dimensional heatmap and + calculate the number of keypoint.""" + size = heatmap.size() + k = size[0] if size[0] <= 20 else 20 + maps = [] + for _ in range(k): + xy_dict = {} + single_heatmap = heatmap[_] + xy_dict['x'], xy_dict['y'] = self.merge_maps(single_heatmap) + maps.append(xy_dict) + return maps, k + + def merge_maps(self, map_2d): + """Synthesis of one-dimensional heatmap.""" + x = map_2d.data.max(0, keepdim=True)[0] + y = map_2d.data.max(1, keepdim=True)[0] + return x, y + + def draw_1d_heatmaps(self, heatmap_1d): + """Draw one-dimensional heatmap.""" + size = heatmap_1d.size() + length = max(size) + np_heatmap = ToPILImage()(heatmap_1d).convert('RGB') + cv_img = cv.cvtColor(np.asarray(np_heatmap), cv.COLOR_RGB2BGR) + if size[0] < size[1]: + cv_img = cv.resize(cv_img, (length, 15)) + else: + cv_img = cv.resize(cv_img, (15, length)) + single_map = cv.applyColorMap(cv_img, cv.COLORMAP_JET) + return single_map + + def creat_blank(self, size: Union[list, tuple]): + """Create the background.""" + blank = np.zeros((size[0] * 2, size[1] * 2, 3), np.uint8) + blank.fill(255) + return blank + + def draw_2d_heatmaps(self, heatmap_2d): + """Draw a two-dimensional heatmap fused with the original image.""" 
+ np_heatmap = ToPILImage()(heatmap_2d).convert('RGB') + cv_img = cv.cvtColor(np.asarray(np_heatmap), cv.COLOR_RGB2BGR) + map_2d = cv.applyColorMap(cv_img, cv.COLORMAP_JET) + return map_2d + + def image_cover(self, background: np.ndarray, foreground: np.ndarray, + x: int, y: int): + """Paste the foreground on the background.""" + fore_size = foreground.shape + background[y:y + fore_size[0], x:x + fore_size[1]] = foreground + return background + + def add_1d_heatmaps(self, + maps: dict, + background: np.ndarray, + map2d_size: Union[tuple, list], + K: int, + interval: int = 10): + """Paste one-dimensional heatmaps onto the background in turn.""" + y_startpoint, x_startpoint = [int(1.1*map2d_size[1]), + int(0.1*map2d_size[0])],\ + [int(0.1*map2d_size[1]), + int(1.1*map2d_size[0])] + x_startpoint[1] += interval * 2 + y_startpoint[0] += interval * 2 + add = interval + 10 + for i in range(K): + self.image_cover(background, maps['x'][i], x_startpoint[0], + x_startpoint[1]) + cv.putText(background, str(i), + (x_startpoint[0] - 30, x_startpoint[1] + 10), + cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) + self.image_cover(background, maps['y'][i], y_startpoint[0], + y_startpoint[1]) + cv.putText(background, str(i), + (y_startpoint[0], y_startpoint[1] - 5), + cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) + x_startpoint[1] += add + y_startpoint[0] += add + return background[:x_startpoint[1] + y_startpoint[1] + + 1, :y_startpoint[0] + x_startpoint[0] + 1] diff --git a/model-index.yml b/model-index.yml index 009ef1c8bb..549177c999 100644 --- a/model-index.yml +++ b/model-index.yml @@ -2,3 +2,11 @@ Import: - configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml - configs/body_2d_keypoint/topdown_heatmap/coco/litehrnet_coco.yml - configs/body_2d_keypoint/topdown_heatmap/coco/mspn_coco.yml +- configs/body_2d_keypoint/topdown_heatmap/coco/hourglass_coco.yml +- configs/body_2d_keypoint/topdown_heatmap/coco/vitpose_coco.yml +- configs/body_2d_keypoint/simcc/coco/resnet_coco.yml 
+- configs/body_2d_keypoint/simcc/coco/mobilenetv2_coco.yml +- configs/body_2d_keypoint/simcc/coco/vipnas_coco.yml +- configs/animal_2d_keypoint/topdown_heatmap/ap10k/resnet_ap10k.yml +- configs/face_2d_keypoint/topdown_heatmap/wflw/hrnetv2_wflw.yml +- configs/hand_2d_keypoint/topdown_heatmap/onehand10k/resnet_onehand10k.yml diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000000..982f154761 --- /dev/null +++ b/projects/README.md @@ -0,0 +1,21 @@ +# Welcome to Projects of MMPose + +In this folder, we welcome all contribution of keypoint detection techniques from community. + +Here, these requirements, e.g. code standards, are not as strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMPose. We appreciate all contributions from the community that makes MMPose greater. + +Here is an [example project](./example_project) about how to add your algorithms easily. For common questions about projects, please read our [faq](faq.md). + +We also provide some documentation listed below: + +- [New Model Guide](https://mmpose.readthedocs.io/en/1.x/migration.html#step3-model) + + The documentation of adding new models. + +- [Contribution Guide](https://mmpose.readthedocs.io/en/1.x/notes/contribution_guide.html) + + The guides for new contributors about how to add your projects to MMPose. + +- [Discussions](https://github.com/open-mmlab/mmpose/discussions) + + Welcome to start a discussion! diff --git a/projects/example_project/README.md b/projects/example_project/README.md new file mode 100644 index 0000000000..68f58b5a01 --- /dev/null +++ b/projects/example_project/README.md @@ -0,0 +1,166 @@ +# Example Project + +> A README.md template for releasing a project. +> +> All the fields in this README are **mandatory** for others to understand what you have achieved in this implementation. 
+> Please read our [Projects FAQ](../faq.md) if you still feel unclear about the requirements, or raise an [issue](https://github.com/open-mmlab/mmpose/issues) to us! + +## Description + +> Share any information you would like others to know. For example: +> +> Author: @xxx. +> +> This is an implementation of \[XXX\]. + +Author: @xxx. + +This project implements a top-down pose estimator with custom head and loss functions that have been seamlessly inherited from existing modules within MMPose. + +## Usage + +> For a typical model, this section should contain the commands for training and testing. +> You are also suggested to dump your environment specification to env.yml by `conda env export > env.yml`. + +### Prerequisites + +- Python 3.7 +- PyTorch 1.6 or higher +- [MIM](https://github.com/open-mmlab/mim) v0.33 or higher +- [MMPose](https://github.com/open-mmlab/mmpose) v1.0.0rc0 or higher + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `example_project/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the COCO dataset according to the [instruction](https://mmpose.readthedocs.io/en/1.x/dataset_zoo/2d_body_keypoint.html#coco). 
+ +### Training commands + +**To train with single GPU:** + +```shell +mim train mmpose configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py +``` + +**To train with multiple GPUs:** + +```shell +mim train mmpose configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```shell +mim train mmpose configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py --launcher slurm \ + --gpus 16 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```shell +mim test mmpose configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```shell +mim test mmpose configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```shell +mim test mmpose configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py $CHECKPOINT --launcher slurm \ + --gpus 16 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +> List the results as usually done in other model's README. Here is an [Example](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.md). 
+ +> You should claim whether this is based on the pre-trained weights, which are converted from the official release; or it's a reproduced result obtained from retraining the model in this project + +| Model | Backbone | Input Size | AP | AP50 | AP75 | AR | AR50 | Download | +| :-----------------------------------------------------------: | :-------: | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :---------------------------------------------------------------: | +| [ExampleHead + ExampleLoss](./configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py) | HRNet-w32 | 256x192 | 0.749 | 0.906 | 0.821 | 0.804 | 0.945 | [model](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192-81c58e40_20220909.pth) \| [log](https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_hrnet-w32_8xb64-210e_coco-256x192_20220909.log) | + +## Citation + +> You may remove this section if not applicable. + +```bibtex +@misc{mmpose2020, + title={OpenMMLab Pose Estimation Toolbox and Benchmark}, + author={MMPose Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmpose}}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress. And you can ignore this part if you don't plan to contribute +to MMPose projects. + +> The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR. + +> OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project meets the minimum requirement of being merged into 'projects/'. But this project is only eligible to become a part of the core package upon attaining the last milestone.
+ +> Note that keeping this section up-to-date is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed. + +> A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + > The code's design shall follow existing interfaces and convention. For example, each model component should be registered into `mmpose.registry.MODELS` and configurable via a config file. + + - [ ] Basic docstrings & proper citation + + > Each major class should contain a docstring, describing its functionality and arguments. If your code is copied or modified from other open-source projects, don't forget to cite the source project in docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under GPL license. [A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) + + - [ ] Test-time correctness + + > If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. + + - [ ] A full README + + > As this template does. + +- [ ] Milestone 2: Indicates a successful model implementation.
+ + - [ ] Training-time correctness + + > If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + > Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmpose/blob/0fb7f22000197181dc0629f767dd99d881d23d76/mmpose/utils/tensor_utils.py#L53) + + - [ ] Unit tests + + > Unit tests for the major module are required. [Example](https://github.com/open-mmlab/mmpose/blob/1.x/tests/test_models/test_heads/test_heatmap_heads/test_heatmap_head.py) + + - [ ] Code polishing + + > Refactor your code according to reviewer's comment. + + - [ ] Metafile.yml + + > It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/coco/hrnet_coco.yml) + + - [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + > In particular, you may have to refactor this README into a standard one. [Example](https://github.com/open-mmlab/mmpose/blob/dev-1.x/configs/body_2d_keypoint/topdown_heatmap/README.md) + + - [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
diff --git a/projects/example_project/configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py b/projects/example_project/configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000000..99b19d478c --- /dev/null +++ b/projects/example_project/configs/example-head-loss_hrnet-w32_8xb64-210e_coco-256x192.py @@ -0,0 +1,15 @@ +# Directly inherit the entire recipe you want to use. +_base_ = 'mmpose::body_2d_keypoint/topdown_heatmap/coco/' \ + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192.py' + +# This line is to import your own modules. +custom_imports = dict(imports='models') + +# Modify the model to use your own head and loss. +_base_['model']['head'] = dict( + type='ExampleHead', + in_channels=32, + out_channels=17, + deconv_out_channels=None, + loss=dict(type='ExampleLoss', use_target_weight=True), + decoder=_base_['codec']) diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py new file mode 100644 index 0000000000..61dc5dac0e --- /dev/null +++ b/projects/example_project/models/__init__.py @@ -0,0 +1,4 @@ +from .example_head import ExampleHead +from .example_loss import ExampleLoss + +__all__ = ['ExampleHead', 'ExampleLoss'] diff --git a/projects/example_project/models/example_head.py b/projects/example_project/models/example_head.py new file mode 100644 index 0000000000..c5da95d481 --- /dev/null +++ b/projects/example_project/models/example_head.py @@ -0,0 +1,77 @@ +from mmpose.models import HeatmapHead +from mmpose.registry import MODELS + + +# Register your head to the `MODELS`. +@MODELS.register_module() +class ExampleHead(HeatmapHead): + """Implements an example head. + + Implement the model head just like a normal pytorch module. + """ + + def __init__(self, **kwargs) -> None: + print('Initializing ExampleHead...') + super().__init__(**kwargs) + + def forward(self, feats): + """Forward the network. The input is multi scale feature maps and the + output is the coordinates. 
+ + Args: + feats (Tuple[Tensor]): Multi scale feature maps. + + Returns: + Tensor: output coordinates or heatmaps. + """ + return super().forward(feats) + + def predict(self, feats, batch_data_samples, test_cfg={}): + """Predict results from outputs. The behaviour of head during testing + should be defined in this function. + + Args: + feats (Tuple[Tensor] | List[Tuple[Tensor]]): The multi-stage + features (or multiple multi-stage features in TTA) + batch_data_samples (List[:obj:`PoseDataSample`]): A list of + data samples for instances in a batch + test_cfg (dict): The runtime config for testing process. Defaults + to {} + + Returns: + Union[InstanceList | Tuple[InstanceList | PixelDataList]]: If + ``test_cfg['output_heatmap']==True``, return both pose and heatmap + prediction; otherwise only return the pose prediction. + + The pose prediction is a list of ``InstanceData``, each contains + the following fields: + + - keypoints (np.ndarray): predicted keypoint coordinates in + shape (num_instances, K, D) where K is the keypoint number + and D is the keypoint dimension + - keypoint_scores (np.ndarray): predicted keypoint scores in + shape (num_instances, K) + + The heatmap prediction is a list of ``PixelData``, each contains + the following fields: + + - heatmaps (Tensor): The predicted heatmaps in shape (K, h, w) + """ + return super().predict(feats, batch_data_samples, test_cfg) + + def loss(self, feats, batch_data_samples, train_cfg={}) -> dict: + """Calculate losses from a batch of inputs and data samples. The + behaviour of head during training should be defined in this function. + + Args: + feats (Tuple[Tensor]): The multi-stage features + batch_data_samples (List[:obj:`PoseDataSample`]): A list of + data samples for instances in a batch + train_cfg (dict): The runtime config for training process. + Defaults to {} + + Returns: + dict: A dictionary of losses. 
+ """ + + return super().loss(feats, batch_data_samples, train_cfg) diff --git a/projects/example_project/models/example_loss.py b/projects/example_project/models/example_loss.py new file mode 100644 index 0000000000..e55d03537e --- /dev/null +++ b/projects/example_project/models/example_loss.py @@ -0,0 +1,40 @@ +from mmpose.models import KeypointMSELoss +from mmpose.registry import MODELS + + +# Register your loss to the `MODELS`. +@MODELS.register_module() +class ExampleLoss(KeypointMSELoss): + """Implements an example loss. + + Implement the loss just like a normal pytorch module. + """ + + def __init__(self, **kwargs) -> None: + print('Initializing ExampleLoss...') + super().__init__(**kwargs) + + def forward(self, output, target, target_weights=None, mask=None): + """Forward function of loss. The input arguments should match those + given in `head.loss` function. + + Note: + - batch_size: B + - num_keypoints: K + - heatmaps height: H + - heatmaps width: W + + Args: + output (Tensor): The output heatmaps with shape [B, K, H, W] + target (Tensor): The target heatmaps with shape [B, K, H, W] + target_weights (Tensor, optional): The target weights of different + keypoints, with shape [B, K] (keypoint-wise) or + [B, K, H, W] (pixel-wise). + mask (Tensor, optional): The masks of valid heatmap pixels in + shape [B, K, H, W] or [B, 1, H, W]. If ``None``, no mask will + be applied. Defaults to ``None`` + + Returns: + Tensor: The calculated loss. + """ + return super().forward(output, target, target_weights, mask) diff --git a/projects/faq.md b/projects/faq.md new file mode 100644 index 0000000000..fc53d1b5ee --- /dev/null +++ b/projects/faq.md @@ -0,0 +1,23 @@ +# FAQ + +To help users better understand the `projects/` folder and how to use it effectively, we've created this FAQ page. Here, users can find answers to common questions and learn more about various aspects of the `projects/` folder, such as its usage and contribution guidance. 
+ +## Q1: Why set up `projects/` folder? + +Implementing new models and features into OpenMMLab's algorithm libraries could be troublesome due to the rigorous requirements on code quality, which could hinder the fast iteration of SOTA models and might discourage our members from sharing their latest outcomes here. And that's why we have this `projects/` folder now, where some experimental features, frameworks and models are placed. They only need to satisfy the minimum requirement on the code quality, and can be used as standalone libraries. Users are welcome to use them if they [use MMPose from source](https://mmpose.readthedocs.io/en/dev-1.x/installation.html#best-practices). + +## Q2: Why should there be a checklist for a project? + +This checklist is crucial not only for this project's developers but also for the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed. + +## Q3: What kind of PR will be merged? + +Reaching the first milestone means that this project satisfies the minimum requirement for being merged into 'projects/'. That is, the very first PR of a project must have all the items in the first milestone checked. We do not have any extra requirements on the project's following PRs, so they can be a minor bug fix or update, and do not have to achieve one milestone at once. But keep in mind that this project is only eligible to become a part of the core package upon attaining the last milestone. + +## Q4: Compared to other models in the core packages, why do the model implementations in projects have different training/testing commands? + +Projects are organized independently from the core package, and therefore their modules cannot be directly imported by train.py and test.py. 
Each model implementation in projects should either use `mim` for training/testing as suggested in the example project or provide a custom train.py/test.py. + +## Q5: How to debug a project with a debugger? + +A debugger makes our lives easier, but using it becomes a bit tricky if we have to train/test a model via `mim`. The way to circumvent that is to take advantage of relative paths to import these modules. Assuming that we are developing a project X and the core modules are placed under `projects/X/modules`, then simply adding `custom_imports = dict(imports='projects.X.modules')` to the config allows us to debug from usual entrypoints (e.g. `tools/train.py`) from the root directory of the algorithm library. Just don't forget to remove 'projects.X' before publishing the project. diff --git a/projects/mmpose4aigc/README.md b/projects/mmpose4aigc/README.md new file mode 100644 index 0000000000..4005d906a6 --- /dev/null +++ b/projects/mmpose4aigc/README.md @@ -0,0 +1,108 @@ +# MMPose for AIGC (AI Generated Content) + +
+ +
+ +English | [简体中文](./README_CN.md) + +This project demonstrates how to use MMPose to generate skeleton images for pose-guided AI image generation. + +Currently, we support: + +- [T2I Adapter](https://huggingface.co/spaces/Adapter/T2I-Adapter) + +Please feel free to share interesting pose-guided AIGC projects with us! + +## Get Started + +### Step 1: Preparation + +**Env Requirements:** + +- GCC >= 7.5 +- cmake >= 3.14 + +Run the following commands to install the project: + +```shell +bash install_linux.sh +``` + +After installation, files are organized as follows: + +```shell +|----mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1 +| |----sdk +| |----rtmpose-ort +| | |----rtmdet-nano +| | |----rtmpose-m +| | |----000000147979.jpg +| | |----t2i-adapter_skeleton.txt +``` + +### Step 2: Generate a Skeleton Image + +Run the following command to generate a skeleton image: + +```shell +# generate a skeleton image +bash mmpose_t2i-adapter.sh \ + mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/rtmpose-ort/000000147979.jpg +``` + +For more details, you can refer to [RTMPose](../rtmpose/README.md). + +The input image and its skeleton are as follows: + +
+ +
+ +### Step 3: Upload to T2I-Adapter + +The demo page of T2I-Adapter is [here](https://huggingface.co/spaces/Adapter/T2I-Adapter). + +[![Huggingface Gradio](https://img.shields.io/static/v1?label=Demo&message=Huggingface%20Gradio&color=orange)](https://huggingface.co/spaces/ChongMou/T2I-Adapter) + +
+ +
+ +For example: + +
+ +
+ +## Gallery + +> A lady with a fish + +
+ +
+ +> An astronaut riding a bike on the moon + +
+ +
+ +> An astronaut riding a bike on Mars + +
+ +
+ +> An astronaut riding a bike on Jupiter + +
+ +
+ +> Monkey king + +
+ +
diff --git a/projects/mmpose4aigc/README_CN.md b/projects/mmpose4aigc/README_CN.md new file mode 100644 index 0000000000..bc78368261 --- /dev/null +++ b/projects/mmpose4aigc/README_CN.md @@ -0,0 +1,108 @@ +# MMPose for AIGC (AI Generated Content) + +
+ +
+ +简体中文 | [English](./README.md) + +本项目将支持使用 MMPose 来生成骨架图片,用于姿态引导的 AI 图像生成。 + +当前已支持: + +- [T2I Adapter](https://huggingface.co/spaces/Adapter/T2I-Adapter) + +欢迎分享更多姿态引导的 AIGC 项目给我们! + +## 快速上手 + +### Step 1: 准备 + +**环境要求:** + +- GCC >= 7.5 +- cmake >= 3.14 + +运行以下命令安装项目: + +```shell +bash install_linux.sh +``` + +最终的文件结构如下: + +```shell +|----mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1 +| |----sdk +| |----rtmpose-ort +| | |----rtmdet-nano +| | |----rtmpose-m +| | |----000000147979.jpg +| | |----t2i-adapter_skeleton.txt +``` + +### Step 2: 生成姿态骨架图片 + +运行以下命令生成姿态骨架图片: + +```shell +# 生成骨架图片 +bash mmpose_t2i-adapter.sh \ + mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/rtmpose-ort/000000147979.jpg +``` + +更多详细信息可以查看 [RTMPose](../rtmpose/README_CN.md)。 + +输入图片与生成骨架图片如下: + +
+ +
+ +### Step 3: 使用 T2I-Adapter + +T2I-Adapter 在线试玩请点击 [这里](https://huggingface.co/spaces/Adapter/T2I-Adapter) + +[![Huggingface Gradio](https://img.shields.io/static/v1?label=Demo&message=Huggingface%20Gradio&color=orange)](https://huggingface.co/spaces/ChongMou/T2I-Adapter) + +
+ +
+ +示例: + +
+ +
+ +## 结果展示 + +> A lady with a fish + +
+ +
+ +> An astronaut riding a bike on the moon + +
+ +
+ +> An astronaut riding a bike on Mars + +
+ +
+ +> An astronaut riding a bike on Jupiter + +
+ +
+ +> Monkey king + +
+ +
diff --git a/projects/mmpose4aigc/install_linux.sh b/projects/mmpose4aigc/install_linux.sh new file mode 100644 index 0000000000..3b91409b16 --- /dev/null +++ b/projects/mmpose4aigc/install_linux.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) OpenMMLab. All rights reserved. + +# Download pre-compiled files +wget https://github.com/open-mmlab/mmdeploy/releases/download/v1.0.0rc3/mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1.tar.gz + +# Unzip files +tar -xzvf mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1.tar.gz + +# Go to the sdk folder +cd mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/sdk + +# Init environment +source env.sh + +# If opencv 3+ is not installed on your system, execute the following command. +# If it is installed, skip this command +bash opencv.sh + +# Compile executable programs +bash build.sh + +# Go to mmdeploy folder +cd ../ + +# Download models +wget https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-cpu.zip + +# Unzip files +unzip rtmpose-cpu.zip + +# Success +echo "Installation completed." diff --git a/projects/mmpose4aigc/install_models.sh b/projects/mmpose4aigc/install_models.sh new file mode 100644 index 0000000000..c16205b66f --- /dev/null +++ b/projects/mmpose4aigc/install_models.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) OpenMMLab. All rights reserved. + +#Creating models folder +mkdir models + +#Go to models folder +cd models + +#Download det model +wget https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth + +#Download pose model +wget https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth + +#go back mmpose4aigc +cd .. + +# Success +echo "Download completed." 
diff --git a/projects/mmpose4aigc/mmpose_t2i-adapter.sh b/projects/mmpose4aigc/mmpose_t2i-adapter.sh new file mode 100644 index 0000000000..e8c07bef70 --- /dev/null +++ b/projects/mmpose4aigc/mmpose_t2i-adapter.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (c) OpenMMLab. All rights reserved. + +WORKSPACE=mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/sdk/ +export LD_LIBRARY_PATH=${WORKSPACE}/lib:${WORKSPACE}/thirdparty/onnxruntime/lib:$LD_LIBRARY_PATH + +INPUT_IMAGE=$1 + +mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/sdk/bin/pose_tracker \ + mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1//rtmpose-ort/rtmdet-nano \ + mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1//rtmpose-ort/rtmpose-m \ + $INPUT_IMAGE \ + --background black \ + --skeleton mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1//rtmpose-ort/t2i-adapter_skeleton.txt \ + --output ./skeleton_res.jpg \ + --pose_kpt_thr 0.4 \ + --show -1 diff --git a/projects/mmpose4aigc/openpose_visualization.py b/projects/mmpose4aigc/openpose_visualization.py new file mode 100644 index 0000000000..192b86fe89 --- /dev/null +++ b/projects/mmpose4aigc/openpose_visualization.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mimetypes +import os +from argparse import ArgumentParser + +import mmcv +import numpy as np +from mmengine.registry import init_default_scope + +from mmpose.apis import inference_topdown +from mmpose.apis import init_model as init_pose_estimator +from mmpose.evaluation.functional import nms +from mmpose.registry import VISUALIZERS +from mmpose.structures import merge_data_samples + +try: + from mmdet.apis import inference_detector, init_detector + has_mmdet = True +except (ImportError, ModuleNotFoundError): + has_mmdet = False + +import math + +import cv2 + + +def mmpose_to_openpose_visualization(args, img_path, detector, pose_estimator): + """Visualize predicted keypoints of one image in openpose format.""" + + # predict bbox + init_default_scope(detector.cfg.get('default_scope', 'mmdet')) + det_result = inference_detector(detector, img_path) + pred_instance = det_result.pred_instances.cpu().numpy() + bboxes = np.concatenate( + (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1) + bboxes = bboxes[np.logical_and(pred_instance.labels == args.det_cat_id, + pred_instance.scores > args.bbox_thr)] + bboxes = bboxes[nms(bboxes, args.nms_thr), :4] + + # predict keypoints + pose_results = inference_topdown(pose_estimator, img_path, bboxes) + data_samples = merge_data_samples(pose_results) + # concatenate scores and keypoints + keypoints = np.concatenate( + (data_samples.pred_instances.keypoints, + data_samples.pred_instances.keypoint_scores.reshape(-1, 17, 1)), + axis=-1) + # compute neck joint + neck = (keypoints[:, 5] + keypoints[:, 6]) / 2 + neck[:, 2] = keypoints[:, 5, 2] * keypoints[:, 6, 2] + # 17 keypoints to 18 keypoints + new_keypoints = np.insert(keypoints[:, ], 17, neck, axis=1) + # mmpose format to openpose format + new_keypoints[:, [15, 14, 17, 16, 2, 6, 3, 7, 4, 8, 12, 9, 13, 10, 1 + ], :] = new_keypoints[:, [ + 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17 + ], :] + # out file name .png + out_file = 'openpose_' + os.path.splitext( + 
os.path.basename(img_path))[0] + '.png' + # show the results + img = mmcv.imread(img_path, channel_order='rgb') + # black background + black_img = np.zeros_like(img) + + # openpose format + stickwidth = 4 + limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], + [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], + [15, 17], [1, 16], [16, 18]] + + colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], + [170, 255, 0], [85, 255, 0], [0, 255, 0], [0, 255, 85], + [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], + [0, 0, 255], [85, 0, 255], [170, 0, 255], [255, 0, 255], + [255, 0, 170], [255, 0, 85]] + + # draw keypoints + cur_black_img = black_img.copy() + for i in range(len(new_keypoints)): + for j in range(18): + x, y, conf = new_keypoints[i][j] + if conf == 0: + continue + cv2.circle( + cur_black_img, (int(x), int(y)), 4, colors[j], thickness=-1) + black_img = cv2.addWeighted(black_img, 0.4, cur_black_img, 0.6, 0) + + # draw links + cur_black_img = black_img.copy() + for i in range(len(new_keypoints)): + for link in range(17): + + Y = new_keypoints[i][np.array(limbSeq[link]) - 1, 0] + X = new_keypoints[i][np.array(limbSeq[link]) - 1, 1] + mX = np.mean(X) + mY = np.mean(Y) + length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) + polygon = cv2.ellipse2Poly( + (int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), + 0, 360, 1) + cv2.fillConvexPoly(cur_black_img, polygon, colors[link]) + black_img = cv2.addWeighted(black_img, 0.4, cur_black_img, 0.6, 0) + # save image + cv2.imwrite(out_file, black_img[:, :, [2, 1, 0]]) + + +def main(): + """Visualize the demo images. + + Using mmdet to detect the human. 
+ """ + parser = ArgumentParser() + parser.add_argument( + '--det_config', + help='Config file for detection', + default='../rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py' + ) + parser.add_argument( + '--det_checkpoint', + help='Checkpoint file for detection', + default='models/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth' + ) + parser.add_argument( + '--pose_config', + help='Config file for pose', + default='../rtmpose/rtmpose/body_2d_keypoint/ \ + rtmpose-m_8xb256-420e_coco-256x192.py') + parser.add_argument( + '--pose_checkpoint', + help='Checkpoint file for pose', + default='models/rtmpose-m_simcc-aic-coco_pt-aic- \ + coco_420e-256x192-63eb25f7_20230126.pth') + parser.add_argument('input', type=str, help='input Image file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='Category id for bounding box detection model') + parser.add_argument( + '--bbox-thr', + type=float, + default=0.3, + help='Bounding box score threshold') + parser.add_argument( + '--nms-thr', + type=float, + default=0.3, + help='IoU threshold for bounding box NMS') + parser.add_argument( + '--kpt-thr', type=float, default=0.3, help='Keypoint score threshold') + parser.add_argument( + '--draw-heatmap', + action='store_true', + default=False, + help='Draw heatmap predicted by the model') + parser.add_argument( + '--radius', + type=int, + default=3, + help='Keypoint radius for visualization') + parser.add_argument( + '--thickness', + type=int, + default=1, + help='Link thickness for visualization') + + assert has_mmdet, 'Please install mmdet to run the demo.' 
+ + args = parser.parse_args() + + assert args.input != '' + assert args.det_config is not None + assert args.det_checkpoint is not None + + # build detector + detector = init_detector( + args.det_config, args.det_checkpoint, device=args.device) + + # build pose estimator + pose_estimator = init_pose_estimator( + args.pose_config, + args.pose_checkpoint, + device=args.device, + cfg_options=dict( + model=dict(test_cfg=dict(output_heatmaps=args.draw_heatmap)))) + + # init visualizer + pose_estimator.cfg.visualizer.radius = args.radius + pose_estimator.cfg.visualizer.line_width = args.thickness + visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer) + # the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_pose_estimator + visualizer.set_dataset_meta(pose_estimator.dataset_meta) + + input_type = mimetypes.guess_type(args.input)[0].split('/')[0] + if input_type == 'image': + mmpose_to_openpose_visualization(args, args.input, detector, + pose_estimator) + + +if __name__ == '__main__': + main() diff --git a/projects/rtmpose/README.md b/projects/rtmpose/README.md new file mode 100644 index 0000000000..1a19c3381c --- /dev/null +++ b/projects/rtmpose/README.md @@ -0,0 +1,809 @@ +
+ +
+ +# RTMPose: Real-Time Multi-Person Pose Estimation toolkit based on MMPose + +
+ +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmpose-real-time-multi-person-pose/2d-human-pose-estimation-on-coco-wholebody-1)](https://paperswithcode.com/sota/2d-human-pose-estimation-on-coco-wholebody-1?p=rtmpose-real-time-multi-person-pose) + +
+ +
+ +English | [简体中文](README_CN.md) + +
+ +______________________________________________________________________ + +## Abstract + +Recent studies on 2D pose estimation have achieved excellent performance on public benchmarks, yet its application in the industrial community still suffers from heavy model parameters and high latency. +In order to bridge this gap, we empirically study five aspects that affect the performance of multi-person pose estimation algorithms: paradigm, backbone network, localization algorithm, training strategy, and deployment inference, and present a high-performance real-time multi-person pose estimation framework, **RTMPose**, based on MMPose. +Our RTMPose-m achieves **75.8% AP** on COCO with **90+ FPS** on an Intel i7-11700 CPU and **430+ FPS** on an NVIDIA GTX 1660 Ti GPU, and RTMPose-l achieves **67.0% AP** on COCO-WholeBody with **130+ FPS**. +To further evaluate RTMPose's capability in critical real-time applications, we also report the performance after deploying on the mobile device. Our RTMPose-s achieves **72.2% AP** on COCO with **70+ FPS** on a Snapdragon 865 chip, outperforming existing open-source libraries. +With the help of MMDeploy, our project supports various platforms like CPU, GPU, NVIDIA Jetson, and mobile devices and multiple inference backends such as ONNXRuntime, TensorRT, ncnn, etc. 
+ +![rtmpose_intro](https://user-images.githubusercontent.com/13503330/219269619-935499e5-bdd9-49ea-8104-3c7796dbd862.png) + +______________________________________________________________________ + +## 📄 Table of Contents + +- [🥳 🚀 What's New](#--whats-new-) +- [📖 Introduction](#-introduction-) +- [🙌 Community](#-community-) +- [⚡ Pipeline Performance](#-pipeline-performance-) +- [📊 Model Zoo](#-model-zoo-) +- [👀 Visualization](#-visualization-) +- [😎 Get Started](#-get-started-) +- [👨‍🏫 How to Train](#-how-to-train-) +- [🏗️ How to Deploy](#️-how-to-deploy-) +- [📚 Common Usage](#️-common-usage-) + - [🚀 Inference Speed Test](#-inference-speed-test-) + - [📊 Model Test](#-model-test-) +- [📜 Citation](#-citation-) + +## 🥳 🚀 What's New [🔝](#-table-of-contents) + +- Mar. 2023: RTMPose is released. RTMPose-m runs at 430+ FPS and achieves 75.8 mAP on COCO val set. + +## 📖 Introduction [🔝](#-table-of-contents) + +
+ +
+ +
+ +
+
+ +
+ +### ✨ Major Features + +- 🚀 **High efficiency and high accuracy** + + | Model | AP(COCO) | CPU-FPS | GPU-FPS | + | :---: | :------: | :-----: | :-----: | + | t | 68.5 | 300+ | 940+ | + | s | 72.2 | 200+ | 710+ | + | m | 75.8 | 90+ | 430+ | + | l | 76.5 | 50+ | 280+ | + +- 🛠️ **Easy to deploy** + + - Step-by-step deployment tutorials. + - Support various backends including + - ONNX + - TensorRT + - ncnn + - OpenVINO + - etc. + - Support various platforms including + - Linux + - Windows + - NVIDIA Jetson + - ARM + - etc. + +- 🏗️ **Design for practical applications** + + - Pipeline inference API and SDK for + - Python + - C++ + - C# + - JAVA + - etc. + +## 🙌 Community [🔝](#-table-of-contents) + +RTMPose is a long-term project dedicated to the training, optimization and deployment of high-performance real-time pose estimation algorithms in practical scenarios, so we are looking forward to the power from the community. Welcome to share the training configurations and tricks based on RTMPose in different business applications to help more community users! + +✨ ✨ ✨ + +- **If you are a new user of RTMPose, we eagerly hope you can fill out this [Google Questionnaire](https://docs.google.com/forms/d/e/1FAIpQLSfzwWr3eNlDzhU98qzk2Eph44Zio6hi5r0iSwfO9wSARkHdWg/viewform?usp=sf_link)/[Chinese version](https://uua478.fanqier.cn/f/xxmynrki), it's very important for our work!** + +✨ ✨ ✨ + +Feel free to join our community group for more help: + +- WeChat Group: + +
+ +
+ +- Discord Group: + - 🙌 https://discord.gg/raweFPmdzG 🙌 + +## ⚡ Pipeline Performance [🔝](#-table-of-contents) + +**Notes** + +- Pipeline latency is tested under skip-frame settings; the detection interval is 5 frames by default. +- Flip test is NOT used. +- Env Setup: + - torch >= 1.7.1 + - onnxruntime 1.12.1 + - TensorRT 8.4.3.1 + - ncnn 20221128 + - cuDNN 8.3.2 + - CUDA 11.3 + +| Detection Config | Pose Config | Input Size
(Det/Pose) | Model AP
(COCO) | Pipeline AP
(COCO) | Params (M)
(Det/Pose) | Flops (G)
(Det/Pose) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Download | +| :------------------------------------------------------------------ | :---------------------------------------------------------------------------- | :---------------------------: | :---------------------: | :------------------------: | :---------------------------: | :--------------------------: | :--------------------------------: | :---------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-t](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
67.1 | 64.4 | 0.99
3.34 | 0.31
0.36 | 12.403 | 2.467 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-s](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
71.1 | 68.5 | 0.99
5.47 | 0.31
0.68 | 16.658 | 2.730 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
75.3 | 73.2 | 0.99
13.59 | 0.31
1.93 | 26.613 | 4.312 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
76.3 | 74.2 | 0.99
27.66 | 0.31
4.16 | 36.311 | 4.644 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | +| [RTMDet-m](./rtmdet/person/rtmdet_m_640-8xb32_coco-person.py) | [RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 640x640
256x192 | 62.5
75.3 | 75.7 | 24.66
13.59 | 38.95
1.93 | - | 6.923 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | +| [RTMDet-m](./rtmdet/person/rtmdet_m_640-8xb32_coco-person.py) | [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 640x640
256x192 | 62.5
76.3 | 76.6 | 24.66
27.66 | 38.95
4.16 | - | 7.204 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | + +## 📊 Model Zoo [🔝](#-table-of-contents) + +**Notes** + +- Since all models are trained on multi-domain combined datasets for practical applications, results are **not** suitable for academic comparison. +- For more results of RTMPose on public benchmarks, refer to [Model Zoo](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/algorithms.html). +- Flip test is used. +- For inference speed measured on more hardware platforms, refer to [Benchmark](./benchmark/README.md). +- If you have datasets you would like us to support, feel free to [contact us](https://docs.google.com/forms/d/e/1FAIpQLSfzwWr3eNlDzhU98qzk2Eph44Zio6hi5r0iSwfO9wSARkHdWg/viewform?usp=sf_link)/[联系我们](https://uua478.fanqier.cn/f/xxmynrki). + +### Body 2d (17 Keypoints) + +| Config | Input Size | AP
(COCO) | Params(M) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | ncnn-FP16-Latency(ms)
(Snapdragon 865) | Logs | Download | +| :---------: | :--------: | :---------------: | :-------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------------------------: | :--------: | :------------: | +| [RTMPose-t](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 68.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | +| [RTMPose-s](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 72.2 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | +| [RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 75.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | +| [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 76.5 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | +| 
[RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py) | 384x288 | 77.0 | 13.72 | 4.33 | 24.78 | 3.66 | - | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth) | +| [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py) | 384x288 | 77.3 | 27.79 | 9.35 | - | 6.05 | - | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth) | + +#### Model Pruning + +**Notes** + +- Model pruning is supported by [MMRazor](https://github.com/open-mmlab/mmrazor) + +| Config | Input Size | AP
(COCO) | Params(M) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | ncnn-FP16-Latency(ms)
(Snapdragon 865) | Logs | Download | +| :---------: | :--------: | :---------------: | :-------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------------------------: | :--------: | :------------: | +| RTMPose-s-aic-coco-pruned | 256x192 | 69.4 | 3.43 | 0.35 | - | - | - | [log](https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.json) | [model](https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth) | + +For more details, please refer to [GroupFisher Pruning for RTMPose](./rtmpose/pruning/README.md). + +### WholeBody 2d (133 Keypoints) + +| Config | Input Size | Whole AP | Whole AR | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Logs | Download | +| :----------------------------- | :--------: | :------: | :------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------: | :-------------------------------: | +| [RTMPose-m](./rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 60.4 | 66.7 | 2.22 | 13.50 | 4.00 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 63.2 | 69.4 | 4.52 | 23.41 | 5.67 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 67.0 | 72.3 | 10.07 | 44.58 | 7.68 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | + +### Animal 2d (17 Keypoints) + +| Config | Input Size | AP
(AP10K) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Logs | Download | +| :---------------------------: | :--------: | :----------------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------: | :------------------------------: | +| [RTMPose-m](./rtmpose/animal_2d_keypoint/rtmpose-m_8xb64-210e_ap10k-256x256.py) | 256x256 | 72.2 | 2.57 | 14.157 | 2.404 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth) | + +### Face 2d + +Coming soon + +### Hand 2d + +Coming soon + +### Pretrained Models + +We provide the UDP pretraining configs of the CSPNeXt backbone. Find more details in the [pretrain_cspnext_udp folder](./rtmpose/pretrain_cspnext_udp/). + +| Model | Input Size | Params(M) | Flops(G) | AP
(GT) | AR
(GT) | Download | +| :----------: | :--------: | :-------: | :------: | :-------------: | :-------------: | :-----------------------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 256x192 | 6.03 | 1.43 | 65.5 | 68.9 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth) | +| CSPNeXt-s | 256x192 | 8.58 | 1.78 | 70.0 | 73.3 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth) | +| CSPNeXt-m | 256x192 | 13.05 | 3.06 | 74.8 | 77.7 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth) | +| CSPNeXt-l | 256x192 | 32.44 | 5.33 | 77.2 | 79.9 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth) | + +We also provide the ImageNet classification pre-trained weights of the CSPNeXt backbone. Find more details in [RTMDet](https://github.com/open-mmlab/mmdetection/blob/dev-3.x/configs/rtmdet/README.md#classification). 
+ +| Model | Input Size | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 224x224 | 2.73 | 0.34 | 69.44 | 89.45 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e-3a2dd350.pth) | +| CSPNeXt-s | 224x224 | 4.89 | 0.66 | 74.41 | 92.23 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e-ea671761.pth) | +| CSPNeXt-m | 224x224 | 13.05 | 1.93 | 79.27 | 94.79 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth) | +| CSPNeXt-l | 224x224 | 27.16 | 4.19 | 81.30 | 95.62 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth) | + +## 👀 Visualization [🔝](#-table-of-contents) + +
+ + +
+ +## 😎 Get Started [🔝](#-table-of-contents) + +We provide two approaches to try RTMPose: + +- Pre-compiled MMDeploy SDK (Recommended) +- MMPose demo scripts + +### Pre-compiled MMDeploy SDK (Recommended) + +MMDeploy provides a precompiled SDK for pipeline inference on RTMPose projects, where the model used for inference is the SDK version. For the tutorial of exporting the SDK version model, see [SDK Inference](#%EF%B8%8F-step3-inference-with-sdk), and for detailed parameter settings of inference, see [Pipeline Inference](#-step4-pipeline-inference). + +Env Requirements: + +- GCC >= 7.5 +- cmake >= 3.14 + +#### ONNX + +```shell +# Download pre-compiled files +wget https://github.com/open-mmlab/mmdeploy/releases/download/v1.0.0rc3/mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1.tar.gz + +# Unzip files +tar -xzvf mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1.tar.gz + +# Go to the sdk folder +cd mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/sdk + +# Init environment +source env.sh + +# If opencv 3+ is not installed on your system, execute the following command. +# If it is installed, skip this command +bash opencv.sh + +# Compile executable programs +bash build.sh + +# inference for an image +./bin/det_pose {det work-dir} {pose work-dir} {your_img.jpg} --device cpu + +# inference for a video +./bin/pose_tracker {det work-dir} {pose work-dir} {your_video.mp4} --device cpu +``` + +#### TensorRT + +```shell +# Download pre-compiled files +wget https://github.com/open-mmlab/mmdeploy/releases/download/v1.0.0rc3/mmdeploy-1.0.0rc3-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz + +# Unzip files +tar -xzvf mmdeploy-1.0.0rc3-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz + +# Go to the sdk folder +cd mmdeploy-1.0.0rc3-linux-x86_64-cuda11.1-tensorrt8.2.3.0/sdk + +# Init environment +source env.sh + +# If opencv 3+ is not installed on your system, execute the following command.
+# If it is installed, skip this command +bash opencv.sh + +# Compile executable programs +bash build.sh + +# inference for an image +./bin/det_pose {det work-dir} {pose work-dir} {your_img.jpg} --device cuda + +# inference for a video +./bin/pose_tracker {det work-dir} {pose work-dir} {your_video.mp4} --device cuda +``` + +For details, see [Pipeline Inference](#-step4-pipeline-inference). + +### MMPose demo scripts + +MMPose provides demo scripts to conduct [inference with existing models](https://mmpose.readthedocs.io/en/1.x/user_guides/inference.html). + +```shell +# go to the mmpose folder +cd ${PATH_TO_MMPOSE} + +# inference with rtmdet +python demo/topdown_demo_with_mmdet.py \ + projects/rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + {PATH_TO_CHECKPOINT}/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth \ + projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + {PATH_TO_CHECKPOINT}/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth \ + --input {YOUR_TEST_IMG_OR_VIDEO} \ + --show +``` + +Result is as follows: + +![topdown_inference_with_rtmdet](https://user-images.githubusercontent.com/13503330/220005020-06bdf37f-6817-4681-a2c8-9dd55e4fbf1e.png) + +## 👨‍🏫 How to Train [🔝](#-table-of-contents) + +Please refer to [Train and Test](https://mmpose.readthedocs.io/en/1.x/user_guides/train_and_test.html). + +**Tips**: + +- RTMPose has `drop_last=True` enabled by default, please accordingly reduce `batch_size` and `base_lr` when your dataset is small. +- Guidelines to choose a model + - m: Recommended and Preferred Use + - t/s: For mobile devices with extremely low computing power, or scenarios with stringent inference speed requirements + - l: Suitable for scenarios with strong computing power and not sensitive to speed + +## 🏗️ How to Deploy [🔝](#-table-of-contents) + +Here is a basic example of deploying RTMPose with [MMDeploy-1.x](https://github.com/open-mmlab/mmdeploy/tree/1.x). + +### 🧩 Step1. 
Install MMDeploy + +Before starting the deployment, please make sure you install MMPose-1.x and MMDeploy-1.x correctly. + +- To install MMPose-1.x, please refer to the [MMPose-1.x installation guide](https://mmpose.readthedocs.io/en/1.x/installation.html). +- To install MMDeploy-1.x, please refer to the [MMDeploy-1.x installation guide](https://mmdeploy.readthedocs.io/en/1.x/get_started.html#installation). + +Depending on the deployment backend, some backends require compilation of custom operators, so please refer to the corresponding document to ensure the environment is built correctly according to your needs: + +- [ONNX RUNTIME SUPPORT](https://mmdeploy.readthedocs.io/en/1.x/05-supported-backends/onnxruntime.html) +- [TENSORRT SUPPORT](https://mmdeploy.readthedocs.io/en/1.x/05-supported-backends/tensorrt.html) +- [OPENVINO SUPPORT](https://mmdeploy.readthedocs.io/en/1.x/05-supported-backends/openvino.html) +- [More](https://github.com/open-mmlab/mmdeploy/tree/1.x/docs/en/05-supported-backends) + +### 🛠️ Step2. Convert Model + +After the installation, you can enjoy the model deployment journey starting from converting PyTorch model to backend model by running MMDeploy's `tools/deploy.py`. + +For the detailed model conversion tutorial, please refer to the [MMDeploy document](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/convert_model.html). Here we only give the example of converting RTMPose. + +Here we take converting RTMDet-nano and RTMPose-m to ONNX/TensorRT as an example. + +- If you only want to use ONNX, please use: + - [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmdet/detection/detection_onnxruntime_static.py) for RTMDet. + - [`pose-detection_simcc_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py) for RTMPose.
+- If you want to use TensorRT, please use: + - [`detection_tensorrt_static-320x320.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmdet/detection/detection_tensorrt_static-320x320.py) for RTMDet. + - [`pose-detection_simcc_tensorrt_dynamic-256x192.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py) for RTMPose. + +If you want to customize the settings in the deployment config for your requirements, please refer to [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/write_config.html). + +In this tutorial, we organize files as follows: + +``` +|----mmdeploy +|----mmdetection +|----mmpose +|----rtmdet_nano +| |----rtmdet_nano.pth +|----rtmpose_m + |----rtmpose_m.pth +``` + +#### ONNX + +```shell +# go to the mmdeploy folder +cd ${PATH_TO_MMDEPLOY} + +# run the command to convert RTMDet +python tools/deploy.py \ + configs/mmdet/detection/detection_onnxruntime_static.py \ + {RTMPOSE_PROJECT}/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + ../rtmdet_nano/rtmdet_nano.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmdet/ort \ + --device cpu \ + --show + +# run the command to convert RTMPose +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../rtmpose_m/rtmpose_m.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmpose/ort \ + --device cpu \ + --show +``` + +The converted model file is `{work-dir}/end2end.onnx` by default.
+ +#### TensorRT + +```shell +# go to the mmdeploy folder +cd ${PATH_TO_MMDEPLOY} + +# run the command to convert RTMDet +python tools/deploy.py \ + configs/mmdet/detection/detection_tensorrt_static-320x320.py \ + {RTMPOSE_PROJECT}/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + ../rtmdet_nano/rtmdet_nano.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmdet/trt \ + --device cuda:0 \ + --show + +# run the command to convert RTMPose +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../rtmpose_m/rtmpose_m.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmpose/trt \ + --device cuda:0 \ + --show +``` + +The converted model file is `{work-dir}/end2end.engine` by default. + +🎊 If the script runs successfully, you will see the following files: + +![convert_models](https://user-images.githubusercontent.com/13503330/217726963-7815dd01-561a-4605-b0c6-07b6fe1956c3.png) + +#### Advanced Setting + +To convert the model with TRT-FP16, you can enable the fp16 mode in your deploy config: + +```Python +# in MMDeploy config +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True # enable fp16 + )) +``` + +### 🕹️ Step3. Inference with SDK + +We provide both Python and C++ inference APIs with MMDeploy SDK. + +To use the SDK, you need to dump the required info while converting the model. 
Just add `--dump-info` to the model conversion command: + +```shell +# RTMDet +python tools/deploy.py \ + configs/mmdet/detection/detection_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + ../rtmdet_nano/rtmdet_nano.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmdet/sdk \ + --device cpu \ + --show \ + --dump-info # dump sdk info + +# RTMPose +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../rtmpose_m/rtmpose_m.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmpose/sdk \ + --device cpu \ + --show \ + --dump-info # dump sdk info +``` + +After running the command, it will dump 3 json files additionally for the SDK: + +``` +|----sdk + |----end2end.onnx # ONNX model + |----end2end.engine # TensorRT engine file + + |----pipeline.json # + |----deploy.json # json files for the SDK + |----detail.json # +``` + +#### Python API + +Here is a basic example of SDK Python API: + +```Python +# Copyright (c) OpenMMLab. All rights reserved.
+import argparse + +import cv2 +import numpy as np +from mmdeploy_python import PoseDetector + + +def parse_args(): + parser = argparse.ArgumentParser( + description='show how to use sdk python api') + parser.add_argument('device_name', help='name of device, cuda or cpu') + parser.add_argument( + 'model_path', + help='path of mmdeploy SDK model dumped by model converter') + parser.add_argument('image_path', help='path of an image') + parser.add_argument( + '--bbox', + default=None, + nargs='+', + type=int, + help='bounding box of an object in format (x, y, w, h)') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + img = cv2.imread(args.image_path) + + detector = PoseDetector( + model_path=args.model_path, device_name=args.device_name, device_id=0) + + if args.bbox is None: + result = detector(img) + else: + # converter (x, y, w, h) -> (left, top, right, bottom) + print(args.bbox) + bbox = np.array(args.bbox, dtype=int) + bbox[2:] += bbox[:2] + result = detector(img, bbox) + print(result) + + _, point_num, _ = result.shape + points = result[:, :, :2].reshape(point_num, 2) + for [x, y] in points.astype(int): + cv2.circle(img, (x, y), 1, (0, 255, 0), 2) + + cv2.imwrite('output_pose.png', img) + + +if __name__ == '__main__': + main() +``` + +#### C++ API + +Here is a basic example of SDK C++ API: + +```C++ +#include "mmdeploy/detector.hpp" + +#include "opencv2/imgcodecs/imgcodecs.hpp" +#include "utils/argparse.h" +#include "utils/visualize.h" + +DEFINE_ARG_string(model, "Model path"); +DEFINE_ARG_string(image, "Input image path"); +DEFINE_string(device, "cpu", R"(Device name, e.g. 
"cpu", "cuda")"); +DEFINE_string(output, "detector_output.jpg", "Output image path"); + +DEFINE_double(det_thr, .5, "Detection score threshold"); + +int main(int argc, char* argv[]) { + if (!utils::ParseArguments(argc, argv)) { + return -1; + } + + cv::Mat img = cv::imread(ARGS_image); + if (img.empty()) { + fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str()); + return -1; + } + + // construct a detector instance + mmdeploy::Detector detector(mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device}); + + // apply the detector, the result is an array-like class holding references to + // `mmdeploy_detection_t`, will be released automatically on destruction + mmdeploy::Detector::Result dets = detector.Apply(img); + + // visualize + utils::Visualize v; + auto sess = v.get_session(img); + int count = 0; + for (const mmdeploy_detection_t& det : dets) { + if (det.score > FLAGS_det_thr) { // filter bboxes + sess.add_det(det.bbox, det.label_id, det.score, det.mask, count++); + } + } + + if (!FLAGS_output.empty()) { + cv::imwrite(FLAGS_output, sess.get()); + } + + return 0; +} +``` + +To build C++ example, please add MMDeploy package in your CMake project as following: + +```CMake +find_package(MMDeploy REQUIRED) +target_link_libraries(${name} PRIVATE mmdeploy ${OpenCV_LIBS}) +``` + +#### Other languages + +- [C# API Examples](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo/csharp) +- [JAVA API Examples](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo/java) + +## 🚀 Step4. Pipeline Inference + +### Inference for images + +If the user has MMDeploy compiled correctly, you will see the `det_pose` executable under the `mmdeploy/build/bin/`. 
+ +```shell +# go to the mmdeploy folder +cd ${PATH_TO_MMDEPLOY}/build/bin/ + +# inference for an image +./det_pose {det work-dir} {pose work-dir} {your_img.jpg} --device cpu + +required arguments: + det_model Object detection model path [string] + pose_model Pose estimation model path [string] + image Input image path [string] + +optional arguments: + --device Device name, e.g. "cpu", "cuda" [string = "cpu"] + --output Output image path [string = "det_pose_output.jpg"] + --skeleton Path to skeleton data or name of predefined skeletons: + "coco", "coco-wholebody" [string = "coco"] + --det_label Detection label use for pose estimation [int32 = 0] + (0 refers to 'person' in coco) + --det_thr Detection score threshold [double = 0.5] + --det_min_bbox_size Detection minimum bbox size [double = -1] + --pose_thr Pose key-point threshold [double = 0] +``` + +#### API Example + +- [`det_pose.py`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/python/det_pose.py) +- [`det_pose.cxx`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/csrc/cpp/det_pose.cxx) + +### Inference for a video + +If MMDeploy has been compiled correctly, you will see the `pose_tracker` executable under `mmdeploy/build/bin/`. + +```shell +# go to the mmdeploy folder +cd ${PATH_TO_MMDEPLOY}/build/bin/ + +# inference for a video +./pose_tracker {det work-dir} {pose work-dir} {your_video.mp4} --device cpu + +required arguments: + det_model Object detection model path [string] + pose_model Pose estimation model path [string] + input Input video path or camera index [string] + +optional arguments: + --device Device name, e.g. 
"cpu", "cuda" [string = "cpu"] + --output Output video path or format string [string = ""] + --output_size Long-edge of output frames [int32 = 0] + --flip Set to 1 for flipping the input horizontally [int32 = 0] + --show Delay passed to `cv::waitKey` when using `cv::imshow`; + -1: disable [int32 = 1] + --skeleton Path to skeleton data or name of predefined skeletons: + "coco", "coco-wholebody" [string = "coco"] + --background Output background, "default": original image, "black": + black background [string = "default"] + --det_interval Detection interval [int32 = 1] + --det_label Detection label use for pose estimation [int32 = 0] + (0 refers to 'person' in coco) + --det_thr Detection score threshold [double = 0.5] + --det_min_bbox_size Detection minimum bbox size [double = -1] + --det_nms_thr NMS IOU threshold for merging detected bboxes and + bboxes from tracked targets [double = 0.7] + --pose_max_num_bboxes Max number of bboxes used for pose estimation per frame + [int32 = -1] + --pose_kpt_thr Threshold for visible key-points [double = 0.5] + --pose_min_keypoints Min number of key-points for valid poses, -1 indicates + ceil(n_kpts/2) [int32 = -1] + --pose_bbox_scale Scale for expanding key-points to bbox [double = 1.25] + --pose_min_bbox_size Min pose bbox size, tracks with bbox size smaller than + the threshold will be dropped [double = -1] + --pose_nms_thr NMS OKS/IOU threshold for suppressing overlapped poses, + useful when multiple pose estimations collapse to the + same target [double = 0.5] + --track_iou_thr IOU threshold for associating missing tracks + [double = 0.4] + --track_max_missing Max number of missing frames before a missing tracks is + removed [int32 = 10] +``` + +#### API Example + +- [`pose_tracker.py`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/python/pose_tracker.py) +- [`pose_tracker.cxx`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/csrc/cpp/pose_tracker.cxx) + +## 📚 Common Usage [🔝](#-table-of-contents) + +### 🚀 
Inference Speed Test [🔝](#-table-of-contents) + +If you need to test the inference speed of the model under the deployment framework, MMDeploy provides a convenient `tools/profiler.py` script. + +The user needs to prepare a folder for the test images `./test_images`, the profiler will randomly read images from this directory for the model speed test. + +```shell +python tools/profiler.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../test_images \ + --model {WORK_DIR}/end2end.onnx \ + --shape 256x192 \ + --device cpu \ + --warmup 50 \ + --num-iter 200 +``` + +The result is as follows: + +```shell +01/30 15:06:35 - mmengine - INFO - [onnxruntime]-70 times per count: 8.73 ms, 114.50 FPS +01/30 15:06:36 - mmengine - INFO - [onnxruntime]-90 times per count: 9.05 ms, 110.48 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-110 times per count: 9.87 ms, 101.32 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-130 times per count: 9.99 ms, 100.10 FPS +01/30 15:06:38 - mmengine - INFO - [onnxruntime]-150 times per count: 10.39 ms, 96.29 FPS +01/30 15:06:39 - mmengine - INFO - [onnxruntime]-170 times per count: 10.77 ms, 92.86 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-190 times per count: 10.98 ms, 91.05 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-210 times per count: 11.19 ms, 89.33 FPS +01/30 15:06:41 - mmengine - INFO - [onnxruntime]-230 times per count: 11.16 ms, 89.58 FPS +01/30 15:06:42 - mmengine - INFO - [onnxruntime]-250 times per count: 11.06 ms, 90.41 FPS +----- Settings: ++------------+---------+ +| batch size | 1 | +| shape | 256x192 | +| iterations | 200 | +| warmup | 50 | ++------------+---------+ +----- Results: ++--------+------------+---------+ +| Stats | Latency/ms | FPS | ++--------+------------+---------+ +| Mean | 11.060 | 90.412 | +| Median | 11.852 | 84.375 | +| Min | 7.812 | 128.007 | +| Max | 13.690 | 73.044 | 
++--------+------------+---------+ +``` + +If you want to learn more about the profiler, you can refer to the [Profiler Docs](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/useful_tools.html#profiler). + +### 📊 Model Test [🔝](#-table-of-contents) + +If you need to test the inference accuracy of the model on the deployment backend, MMDeploy provides a convenient `tools/test.py` script. + +```shell +python tools/test.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + --model {PATH_TO_MODEL}/rtmpose_m.pth \ + --device cpu +``` + +You can also refer to [MMDeploy Docs](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/en/02-how-to-run/profile_model.md) for more details. + +## 📜 Citation [🔝](#-table-of-contents) + +If you find RTMPose useful in your research, please consider citing: + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +@misc{mmpose2020, + title={OpenMMLab Pose Estimation Toolbox and Benchmark}, + author={MMPose Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmpose}}, + year={2020} +} +``` diff --git a/projects/rtmpose/README_CN.md b/projects/rtmpose/README_CN.md new file mode 100644 index 0000000000..0fee8c9fbe --- /dev/null +++ b/projects/rtmpose/README_CN.md @@ -0,0 +1,799 @@ +
+ +
+ +# RTMPose: Real-Time Multi-Person Pose Estimation toolkit based on MMPose + +
+ +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmpose-real-time-multi-person-pose/2d-human-pose-estimation-on-coco-wholebody-1)](https://paperswithcode.com/sota/2d-human-pose-estimation-on-coco-wholebody-1?p=rtmpose-real-time-multi-person-pose) + +
+ +
+ +[English](README.md) | 简体中文 + +
+ +______________________________________________________________________ + +## Abstract + +近年来,2D 姿态估计的研究在公开数据集上取得了出色的成绩,但是它在工业界的应用仍然受到笨重的模型参数和高推理延迟的影响。为了让前沿姿态估计算法在工业界落地,我们通过实验研究了多人姿态估计算法的五个方面:范式、骨干网络、定位算法、训练策略和部署推理,基于 MMPose 提出了一个高性能的实时多人姿态估计框架 **RTMPose**。我们的 RTMPose-m 模型在 COCO 上取得 **75.8%AP**,在 Intel i7-11700 CPU 上达到 **90+FPS**,在 NVIDIA GTX 1660 Ti GPU 上达到 **430+FPS**,RTMPose-l 在 COCO-WholeBody 上达到 **67.0%AP**,**130+FPS**。我们同样验证了在算力有限的设备上做实时姿态估计,RTMPose-s 在移动端骁龙865芯片上可以达到 **COCO 72.2%AP**,**70+FPS**。在 MMDeploy 的帮助下,我们的项目支持 CPU、GPU、Jetson、移动端等多种部署环境。 + +![rtmpose_intro](https://user-images.githubusercontent.com/13503330/219269619-935499e5-bdd9-49ea-8104-3c7796dbd862.png) + +______________________________________________________________________ + +## 📄 Table of Contents + +- [🥳 🚀 最新进展](#--最新进展-) +- [📖 简介](#-简介-) +- [🙌 社区共建](#-社区共建-) +- [⚡ Pipeline 性能](#-pipeline-性能-) +- [📊 模型库](#-模型库-) +- [👀 可视化](#-可视化-) +- [😎 快速尝试](#-快速尝试-) +- [👨‍🏫 模型训练](#-模型训练-) +- [🏗️ 部署教程](#️-部署教程-) +- [📚 常用功能](#️-常用功能-) + - [🚀 模型测速](#-模型测速-) + - [📊 精度验证](#-精度验证-) +- [📜 引用](#-引用-) + +## 🥳 最新进展 [🔝](#-table-of-contents) + +- 2023 年 3 月:发布 RTMPose。RTMPose-m 取得 COCO 验证集 75.8 mAP,推理速度达到 430+ FPS 。 + +## 📖 简介 [🔝](#-table-of-contents) + +
+ +
+ +
+ +
+
+ +
+ +### ✨ 主要特性 + +- 🚀 **高精度,低延迟** + + | Model | AP(COCO) | CPU-FPS | GPU-FPS | + | :---: | :------: | :-----: | :-----: | + | t | 68.5 | 300+ | 940+ | + | s | 72.2 | 200+ | 710+ | + | m | 75.8 | 90+ | 430+ | + | l | 76.5 | 50+ | 280+ | + +- 🛠️ **易部署** + + - 详细的部署代码教程,手把手教你模型部署 + - MMDeploy 助力 + - 支持多种部署后端 + - ONNX + - TensorRT + - ncnn + - OpenVINO 等 + - 支持多种平台 + - Linux + - Windows + - NVIDIA Jetson + - ARM 等 + +- 🏗️ **为实际业务设计** + + - 提供多种 Pipeline 推理接口和 SDK + - Python + - C++ + - C# + - JAVA 等 + +## 🙌 社区共建 [🔝](#-table-of-contents) + +RTMPose 是一个长期优化迭代的项目,致力于业务场景下的高性能实时姿态估计算法的训练、优化和部署,因此我们十分期待来自社区的力量,欢迎分享不同业务场景中 RTMPose 的训练配置与技巧,助力更多的社区用户! + +✨ ✨ ✨ + +- **如果你是 RTMPose 的新用户,我们热切希望你能参与[这份问卷](https://uua478.fanqier.cn/f/xxmynrki)/[Google Questionnaire](https://docs.google.com/forms/d/e/1FAIpQLSfzwWr3eNlDzhU98qzk2Eph44Zio6hi5r0iSwfO9wSARkHdWg/viewform?usp=sf_link),这对于我们的工作非常重要!** + +✨ ✨ ✨ + +欢迎加入我们的社区交流群获得更多帮助: + +- 微信用户群 + +
+ + +- Discord Group: + - 🙌 https://discord.gg/raweFPmdzG 🙌 + +## ⚡ Pipeline 性能 [🔝](#-table-of-contents) + +**说明** + +- Pipeline 速度测试时开启了隔帧检测策略,默认检测间隔为 5 帧。 +- 环境配置: + - torch >= 1.7.1 + - onnxruntime 1.12.1 + - TensorRT 8.4.3.1 + - cuDNN 8.3.2 + - CUDA 11.3 + +| Detection Config | Pose Config | Input Size
(Det/Pose) | Model AP
(COCO) | Pipeline AP
(COCO) | Params (M)
(Det/Pose) | Flops (G)
(Det/Pose) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Download | +| :------------------------------------------------------------------ | :---------------------------------------------------------------------------- | :---------------------------: | :---------------------: | :------------------------: | :---------------------------: | :--------------------------: | :--------------------------------: | :---------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-t](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
67.1 | 64.4 | 0.99
3.34 | 0.31
0.36 | 12.403 | 2.467 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-s](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
71.1 | 68.5 | 0.99
5.47 | 0.31
0.68 | 16.658 | 2.730 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
75.3 | 73.2 | 0.99
13.59 | 0.31
1.93 | 26.613 | 4.312 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | +| [RTMDet-nano](./rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py) | [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 320x320
256x192 | 40.3
76.3 | 74.2 | 0.99
27.66 | 0.31
4.16 | 36.311 | 4.644 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | +| [RTMDet-m](./rtmdet/person/rtmdet_m_640-8xb32_coco-person.py) | [RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 640x640
256x192 | 62.5
75.3 | 75.7 | 24.66
13.59 | 38.95
1.93 | - | 6.923 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | +| [RTMDet-m](./rtmdet/person/rtmdet_m_640-8xb32_coco-person.py) | [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 640x640
256x192 | 62.5
76.3 | 76.6 | 24.66
27.66 | 38.95
4.16 | - | 7.204 | [det](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth)
[pose](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | + +## 📊 模型库 [🔝](#-table-of-contents) + +**说明** + +- 此处提供的模型采用了多数据集联合训练以提高性能,模型指标不适用于学术比较。 +- 表格中为开启了 Flip Test 的测试结果。 +- RTMPose 在更多公开数据集上的性能指标可以前往 [Model Zoo](https://mmpose.readthedocs.io/en/1.x/model_zoo_papers/algorithms.html) 查看。 +- RTMPose 在更多硬件平台上的推理速度可以前往 [Benchmark](./benchmark/README_CN.md) 查看。 +- 如果你有希望我们支持的数据集,欢迎[联系我们](https://uua478.fanqier.cn/f/xxmynrki)/[Google Questionnaire](https://docs.google.com/forms/d/e/1FAIpQLSfzwWr3eNlDzhU98qzk2Eph44Zio6hi5r0iSwfO9wSARkHdWg/viewform?usp=sf_link)! + +### 人体 2d 关键点 (17 Keypoints) + +| Config | Input Size | AP
(COCO) | Params(M) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | ncnn-FP16-Latency(ms)
(Snapdragon 865) | Logs | Download | +| :---------: | :--------: | :---------------: | :-------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------------------------: | :--------: | :------------: | +| [RTMPose-t](./rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 68.5 | 3.34 | 0.36 | 3.20 | 1.06 | 9.02 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-tiny_simcc-aic-coco_pt-aic-coco_420e-256x192-cfc8f33d_20230126.pth) | +| [RTMPose-s](./rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 72.2 | 5.47 | 0.68 | 4.48 | 1.39 | 13.89 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth) | +| [RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 75.8 | 13.59 | 1.93 | 11.06 | 2.29 | 26.44 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth) | +| [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 76.5 | 27.66 | 4.16 | 18.85 | 3.46 | 45.37 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-256x192-f016ffe0_20230126.pth) | +| 
[RTMPose-m](./rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py) | 384x288 | 77.0 | 13.72 | 4.33 | 24.78 | 3.66 | - | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-384x288-a62a0b32_20230228.pth) | +| [RTMPose-l](./rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py) | 384x288 | 77.3 | 27.79 | 9.35 | - | 6.05 | - | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-aic-coco_pt-aic-coco_420e-384x288-97d6cb0f_20230228.pth) | + +#### 模型剪枝 + +**说明** + +- 模型剪枝由 [MMRazor](https://github.com/open-mmlab/mmrazor) 提供 + +| Config | Input Size | AP
(COCO) | Params(M) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | ncnn-FP16-Latency(ms)
(Snapdragon 865) | Logs | Download | +| :---------: | :--------: | :---------------: | :-------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------------------------: | :--------: | :------------: | +| RTMPose-s-aic-coco-pruned | 256x192 | 69.4 | 3.43 | 0.35 | - | - | - | [log](https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.json) | [model](https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth) | + +更多信息,请参考 [GroupFisher Pruning for RTMPose](./rtmpose/pruning/README.md). + +### 人体全身 2d 关键点 (133 Keypoints) + +| Config | Input Size | Whole AP | Whole AR | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Logs | Download | +| :----------------------------- | :--------: | :------: | :------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------: | :-------------------------------: | +| [RTMPose-m](./rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 60.4 | 66.7 | 2.22 | 13.50 | 4.00 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-coco-wholebody_pt-aic-coco_270e-256x192-cd5e845c_20230123.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 63.2 | 69.4 | 4.52 | 23.41 | 5.67 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-256x192-6f206314_20230124.pth) | +| [RTMPose-l](./rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 67.0 | 72.3 | 10.07 | 44.58 | 7.68 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-l_simcc-coco-wholebody_pt-aic-coco_270e-384x288-eaeb96c8_20230125.pth) | + +### 动物 2d 关键点 (17 Keypoints) + +| Config | Input Size | AP
(AP10K) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Logs | Download | +| :---------------------------: | :--------: | :----------------: | :------: | :--------------------------------: | :---------------------------------------: | :--------------------------: | :------------------------------: | +| [RTMPose-m](./rtmpose/animal_2d_keypoint/rtmpose-m_8xb64-210e_ap10k-256x256.py) | 256x256 | 72.2 | 2.57 | 14.157 | 2.404 | [Log](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.json) | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-m_simcc-ap10k_pt-aic-coco_210e-256x256-7a041aa1_20230206.pth) | + +### 脸部 2d 关键点 + +| Config | Input Size | NME
(COCO-WholeBody-Face) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Logs | Download | +| :--------------------------------------------------: | :--------: | :-------------------------------: | :------: | :--------------------------------: | :---------------------------------------: | :---------: | :---------: | +| [RTMPose-m](./rtmpose/face_2d_keypoint/wflw/rtmpose-m_8xb64-60e_coco-wholebody-face-256x256.py) | 256x256 | 4.57 | - | - | - | Coming soon | Coming soon | + +### 手部 2d 关键点 + +| Config | Input Size | PCK
(COCO-WholeBody-Hand) | FLOPS(G) | ORT-Latency(ms)
(i7-11700) | TRT-FP16-Latency(ms)
(GTX 1660Ti) | Logs | Download | +| :--------------------------------------------------: | :--------: | :-------------------------------: | :------: | :--------------------------------: | :---------------------------------------: | :---------: | :---------: | +| [RTMPose-m](./rtmpose/hand_2d_keypoint/coco_wholebody_hand/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py) | 256x256 | 81.5 | - | - | - | Coming soon | Coming soon | + +### 预训练模型 + +我们提供了 UDP 预训练的 CSPNeXt 模型参数,训练配置请参考 [pretrain_cspnext_udp folder](./rtmpose/pretrain_cspnext_udp/)。 + +| Model | Input Size | Params(M) | Flops(G) | AP
(GT) | AR
(GT) | Download | +| :-------: | :--------: | :-------: | :------: | :-------------: | :-------------: | :-----------------------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-t | 256x192 | 6.03 | 1.43 | 65.5 | 68.9 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth) | +| CSPNeXt-s | 256x192 | 8.58 | 1.78 | 70.0 | 73.3 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth) | +| CSPNeXt-m | 256x192 | 13.05 | 3.06 | 74.8 | 77.7 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth) | +| CSPNeXt-l | 256x192 | 32.44 | 5.33 | 77.2 | 79.9 | [Model](https://download.openmmlab.com/mmpose/v1/projects/rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth) | + +我们提供了 ImageNet 分类训练的 CSPNeXt 模型参数,更多细节请参考 [RTMDet](https://github.com/open-mmlab/mmdetection/blob/dev-3.x/configs/rtmdet/README.md#classification)。 + +| Model | Input Size | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 224x224 | 2.73 | 0.34 | 69.44 | 89.45 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e-3a2dd350.pth) | +| CSPNeXt-s | 224x224 | 4.89 | 0.66 | 74.41 | 92.23 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e-ea671761.pth) | +| CSPNeXt-m | 224x224 | 13.05 | 1.93 | 79.27 | 94.79 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth) | +| CSPNeXt-l | 224x224 | 27.16 | 4.19 
| 81.30 | 95.62 | [Model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth) | + +## 👀 可视化 [🔝](#-table-of-contents) + +
+ + +
+ +## 😎 快速尝试 [🔝](#-table-of-contents) + +我们提供了两种途径来让用户尝试 RTMPose 模型: + +- MMDeploy SDK 预编译包 (推荐) +- MMPose demo 脚本 + +### MMDeploy SDK 预编译包 (推荐) + +MMDeploy 提供了预编译的 SDK,用于对 RTMPose 项目进行 Pipeline 推理,其中推理所用的模型为 SDK 版本。导出 SDK 版模型的教程见 [SDK 推理](#%EF%B8%8F-sdk-推理),推理的详细参数设置见 [Pipeline 推理](#-pipeline-推理)。 + +说明: + +- GCC 版本需大于 7.5 +- cmake 版本需大于 3.14 + +#### ONNX + +```shell +# 下载预编译包 +wget https://github.com/open-mmlab/mmdeploy/releases/download/v1.0.0rc3/mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1.tar.gz + +# 解压文件 +tar -xzvf mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1.tar.gz + +# 切换到 sdk 目录 +cd mmdeploy-1.0.0rc3-linux-x86_64-onnxruntime1.8.1/sdk + +# 设置环境变量 +source env.sh + +# 如果系统中没有安装 opencv 3+,请执行以下命令。如果已安装,可略过 +bash opencv.sh + +# 编译可执行程序 +bash build.sh + +# 图片推理 +./bin/det_pose {det work-dir} {pose work-dir} {your_img.jpg} --device cpu + +# 视频推理 +./bin/pose_tracker {det work-dir} {pose work-dir} {your_video.mp4} --device cpu +``` + +#### TensorRT + +```shell +# 下载预编译包 +wget https://github.com/open-mmlab/mmdeploy/releases/download/v1.0.0rc3/mmdeploy-1.0.0rc3-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz + +# 解压文件 +tar -xzvf mmdeploy-1.0.0rc3-linux-x86_64-cuda11.1-tensorrt8.2.3.0.tar.gz + +# 切换到 sdk 目录 +cd mmdeploy-1.0.0rc3-linux-x86_64-cuda11.1-tensorrt8.2.3.0/sdk + +# 设置环境变量 +source env.sh + +# 如果系统中没有安装 opencv 3+,请执行以下命令。如果已安装,可略过 +bash opencv.sh + +# 编译可执行程序 +bash build.sh + +# 图片推理 +./bin/det_pose {det work-dir} {pose work-dir} {your_img.jpg} --device cuda + +# 视频推理 +./bin/pose_tracker {det work-dir} {pose work-dir} {your_video.mp4} --device cuda +``` + +详细参数设置见 [Pipeline 推理](#-pipeline-推理)。 + +### MMPose demo 脚本 + +通过 MMPose 提供的 demo 脚本可以基于 Pytorch 快速进行[模型推理](https://mmpose.readthedocs.io/en/1.x/user_guides/inference.html)和效果验证。 + +```shell +# 前往 mmpose 目录 +cd ${PATH_TO_MMPOSE} + +# RTMDet 与 RTMPose 联合推理 +python demo/topdown_demo_with_mmdet.py \ + projects/rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + 
{PATH_TO_CHECKPOINT}/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth \ + projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + {PATH_TO_CHECKPOINT}/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth \ + --input {YOUR_TEST_IMG} \ + --show +``` + +效果展示: + +![topdown_inference_with_rtmdet](https://user-images.githubusercontent.com/13503330/220005020-06bdf37f-6817-4681-a2c8-9dd55e4fbf1e.png) + +## 👨‍🏫 模型训练 [🔝](#-table-of-contents) + +请参考 [训练与测试](https://mmpose.readthedocs.io/en/1.x/user_guides/train_and_test.html) 进行 RTMPose 的训练。 + +**提示**: + +- RTMPose 默认开启了 `drop_last=True`,当用户的数据集较小时请根据情况缩小 `batch_size` 和 `base_lr`。 +- 模型选择 + - m:推荐首选使用 + - t/s:适用于极端低算力的移动设备,或对推理速度要求严格的场景 + - l:适用于算力较强、对速度不敏感的场景 + +## 🏗️ 部署教程 [🔝](#-table-of-contents) + +本教程将展示如何通过 [MMDeploy-1.x](https://github.com/open-mmlab/mmdeploy/tree/1.x) 部署 RTMPose 项目。 + +### 🧩 安装 + +在开始部署之前,首先你需要确保正确安装了 MMPose, MMDetection, MMDeploy,相关安装教程如下: + +- [安装 MMPose 与 MMDetection](https://mmpose.readthedocs.io/zh_CN/1.x/installation.html) +- [安装 MMDeploy](https://mmdeploy.readthedocs.io/zh_CN/1.x/04-supported-codebases/mmpose.html) + +根据部署后端的不同,有的后端需要对自定义算子进行编译,请根据需求前往对应的文档确保环境搭建正确: + +- [ONNX](https://mmdeploy.readthedocs.io/zh_CN/1.x/05-supported-backends/onnxruntime.html) +- [TensorRT](https://mmdeploy.readthedocs.io/zh_CN/1.x/05-supported-backends/tensorrt.html) +- [OpenVINO](https://mmdeploy.readthedocs.io/zh_CN/1.x/05-supported-backends/openvino.html) +- [更多](https://github.com/open-mmlab/mmdeploy/tree/1.x/docs/en/05-supported-backends) + +### 🛠️ 模型转换 + +在完成安装之后,你就可以开始模型部署了。通过 MMDeploy 提供的 `tools/deploy.py` 可以方便地将 Pytorch 模型转换到不同的部署后端。 + +我们本节演示将 RTMDet 和 RTMPose 模型导出为 ONNX 和 TensorRT 格式,如果你希望了解更多内容请前往 [MMDeploy 文档](https://mmdeploy.readthedocs.io/zh_CN/1.x/02-how-to-run/convert_model.html)。 + +- ONNX 配置 + + \- RTMDet:[`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmdet/detection/detection_onnxruntime_static.py) 
+ + \- RTMPose:[`pose-detection_simcc_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py) + +- TensorRT 配置 + + \- RTMDet:[`detection_tensorrt_static-320x320.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmdet/detection/detection_tensorrt_static-320x320.py) + + \- RTMPose:[`pose-detection_simcc_tensorrt_dynamic-256x192.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py) + +如果你需要对部署配置进行修改,请参考 [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/zh_CN/1.x/02-how-to-run/write_config.html). + +本教程中使用的文件结构如下: + +```Python +|----mmdeploy +|----mmdetection +|----mmpose +|----rtmdet_nano +| |----rtmdet_nano.pth +|----rtmpose_m + |----rtmpose_m.pth +``` + +#### ONNX + +运行如下命令: + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY} + +# 转换 RTMDet +python tools/deploy.py \ + configs/mmdet/detection/detection_onnxruntime_static.py \ + {RTMPOSE_PROJECT}/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + ../rtmdet_nano/rtmdet_nano.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmdet/ort \ + --device cpu \ + --show + +# 转换 RTMPose +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../rtmpose_m/rtmpose_m.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmpose/ort \ + --device cpu \ + --show +``` + +默认导出模型文件为 `{work-dir}/end2end.onnx` + +#### TensorRT + +运行如下命令: + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY} + +# 转换 RTMDet +python tools/deploy.py \ + configs/mmdet/detection/detection_tensorrt_static-320x320.py \ + {RTMPOSE_PROJECT}/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + ../rtmdet_nano/rtmdet_nano.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmdet/trt \ + --device cuda:0 \ + --show + +# 转换 RTMPose +python 
tools/deploy.py \ + configs/mmpose/pose-detection_simcc_tensorrt_dynamic-256x192.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../rtmpose_m/rtmpose_m.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmpose/trt \ + --device cuda:0 \ + --show +``` + +默认导出模型文件为 `{work-dir}/end2end.engine` + +🎊 如果模型顺利导出,你将会看到样例图片上的检测结果: + +![convert_models](https://user-images.githubusercontent.com/13503330/217726963-7815dd01-561a-4605-b0c6-07b6fe1956c3.png) + +#### 高级设置 + +如果需要使用 TensorRT-FP16,你可以通过修改以下配置开启: + +```Python +# in MMDeploy config +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True # 打开 fp16 + )) +``` + +### 🕹️ SDK 推理 + +要进行 Pipeline 推理,需要先用 MMDeploy 导出 SDK 版本的 det 和 pose 模型,只需要在参数中加上`--dump-info`。 + +此处以 onnxruntime 的 cpu 模型为例,运行如下命令: + +```shell +# RTMDet +python tools/deploy.py \ + configs/mmdet/detection/detection_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \ + ../rtmdet_nano/rtmdet_nano.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmdet/sdk \ + --device cpu \ + --show \ + --dump-info # 导出 sdk info + +# RTMPose +python tools/deploy.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../rtmpose_m/rtmpose_m.pth \ + demo/resources/human-pose.jpg \ + --work-dir mmdeploy_models/mmpose/sdk \ + --device cpu \ + --show \ + --dump-info # 导出 sdk info +``` + +默认会导出三个 json 文件: + +``` +|----sdk + |----end2end.onnx # ONNX model + |----end2end.engine # TensorRT engine file + + |----pipeline.json # + |----deploy.json # json files for the SDK + |----detail.json # +``` + +#### Python API + +```Python +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse + +import cv2 +import numpy as np +from mmdeploy_python import PoseDetector + +def parse_args(): + parser = argparse.ArgumentParser( + description='show how to use sdk python api') + parser.add_argument('device_name', help='name of device, cuda or cpu') + parser.add_argument( + 'model_path', + help='path of mmdeploy SDK model dumped by model converter') + parser.add_argument('image_path', help='path of an image') + parser.add_argument( + '--bbox', + default=None, + nargs='+', + type=int, + help='bounding box of an object in format (x, y, w, h)') + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + img = cv2.imread(args.image_path) + + detector = PoseDetector( + model_path=args.model_path, device_name=args.device_name, device_id=0) + + if args.bbox is None: + result = detector(img) + else: + # converter (x, y, w, h) -> (left, top, right, bottom) + print(args.bbox) + bbox = np.array(args.bbox, dtype=int) + bbox[2:] += bbox[:2] + result = detector(img, bbox) + print(result) + + _, point_num, _ = result.shape + points = result[:, :, :2].reshape(point_num, 2) + for [x, y] in points.astype(int): + cv2.circle(img, (x, y), 1, (0, 255, 0), 2) + + cv2.imwrite('output_pose.png', img) + +if __name__ == '__main__': + main() +``` + +#### C++ API + +```C++ +#include "mmdeploy/detector.hpp" + +#include "opencv2/imgcodecs/imgcodecs.hpp" +#include "utils/argparse.h" +#include "utils/visualize.h" + +DEFINE_ARG_string(model, "Model path"); +DEFINE_ARG_string(image, "Input image path"); +DEFINE_string(device, "cpu", R"(Device name, e.g. 
"cpu", "cuda")"); +DEFINE_string(output, "detector_output.jpg", "Output image path"); + +DEFINE_double(det_thr, .5, "Detection score threshold"); + +int main(int argc, char* argv[]) { + if (!utils::ParseArguments(argc, argv)) { + return -1; + } + + cv::Mat img = cv::imread(ARGS_image); + if (img.empty()) { + fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str()); + return -1; + } + + // construct a detector instance + mmdeploy::Detector detector(mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device}); + + // apply the detector, the result is an array-like class holding references to + // `mmdeploy_detection_t`, will be released automatically on destruction + mmdeploy::Detector::Result dets = detector.Apply(img); + + // visualize + utils::Visualize v; + auto sess = v.get_session(img); + int count = 0; + for (const mmdeploy_detection_t& det : dets) { + if (det.score > FLAGS_det_thr) { // filter bboxes + sess.add_det(det.bbox, det.label_id, det.score, det.mask, count++); + } + } + + if (!FLAGS_output.empty()) { + cv::imwrite(FLAGS_output, sess.get()); + } + + return 0; +} +``` + +对于 C++ API 示例,请将 MMDeploy 加入到 CMake 项目中: + +```CMake +find_package(MMDeploy REQUIRED) +target_link_libraries(${name} PRIVATE mmdeploy ${OpenCV_LIBS}) +``` + +#### 其他语言 + +- [C# API 示例](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo/csharp) +- [JAVA API 示例](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo/java) + +### 🚀 Pipeline 推理 + +#### 图片推理 + +如果用户有跟随 MMDeploy 安装教程进行正确编译,在 `mmdeploy/build/bin/` 路径下会看到 `det_pose` 的可执行文件。 + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY}/build/bin/ + +# 单张图片推理 +./det_pose {det work-dir} {pose work-dir} {your_img.jpg} --device cpu + +required arguments: + det_model Detection 模型路径 [string] + pose_model Pose 模型路径 [string] + image 输入图片路径 [string] + +optional arguments: + --device 推理设备 "cpu", "cuda" [string = "cpu"] + --output 导出图片路径 [string = "det_pose_output.jpg"] + --skeleton 骨架定义文件路径,或使用预定义骨架: + "coco" [string = "coco", 
"coco-wholebody"] + --det_label 用于姿势估计的检测标签 [int32 = 0] + (0 在 coco 中对应 person) + --det_thr 检测分数阈值 [double = 0.5] + --det_min_bbox_size 最小检测框大小 [double = -1] + --pose_thr 关键点置信度阈值 [double = 0] +``` + +**API** **示例** + +\- [`det_pose.py`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/python/det_pose.py) + +\- [`det_pose.cxx`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/csrc/cpp/det_pose.cxx) + +#### 视频推理 + +如果用户有跟随 MMDeploy 安装教程进行正确编译,在 `mmdeploy/build/bin/` 路径下会看到 `pose_tracker` 的可执行文件。 + +```shell +# 前往 mmdeploy 目录 +cd ${PATH_TO_MMDEPLOY}/build/bin/ + +# 视频推理 +./pose_tracker {det work-dir} {pose work-dir} {your_video.mp4} --device cpu + +required arguments: + det_model Detection 模型路径 [string] + pose_model Pose 模型路径 [string] + input 输入视频路径或摄像头序号 [string] + +optional arguments: + --device 推理设备 "cpu", "cuda" [string = "cpu"] + --output 导出视频路径 [string = ""] + --output_size 输出视频帧的长边 [int32 = 0] + --flip 设置为1,用于水平翻转输入 [int32 = 0] + --show 使用`cv::imshow`时,传递给`cv::waitKey`的延迟; + -1: 关闭 [int32 = 1] + --skeleton 骨架数据的路径或预定义骨架的名称: + "coco", "coco-wholebody" [string = "coco"] + --background 导出视频背景颜色, "default": 原图, "black": + 纯黑背景 [string = "default"] + --det_interval 检测间隔 [int32 = 1] + --det_label 用于姿势估计的检测标签 [int32 = 0] + (0 在 coco 中对应 person) + --det_thr 检测分数阈值 [double = 0.5] + --det_min_bbox_size 最小检测框大小 [double = -1] + --det_nms_thr NMS IOU阈值,用于合并检测到的bboxes和 + 追踪到的目标的 bboxes [double = 0.7] + --pose_max_num_bboxes 每一帧用于姿势估计的 bboxes 的最大数量 + [int32 = -1] + --pose_kpt_thr 可见关键点的阈值 [double = 0.5] + --pose_min_keypoints 有效姿势的最小关键点数量,-1表示上限(n_kpts/2) [int32 = -1] + --pose_bbox_scale 将关键点扩展到 bbox 的比例 [double = 1.25] + --pose_min_bbox_size 最小追踪尺寸,尺寸小于阈值的 bbox 将被剔除 [double = -1] + --pose_nms_thr 用于抑制重叠姿势的 NMS OKS/IOU阈值。 + 当多个姿态估计重叠到同一目标时非常有用 [double = 0.5] + --track_iou_thr 追踪 IOU 阈值 [double = 0.4] + --track_max_missing 最大追踪容错 [int32 = 10] +``` + +**API** **示例** + +\- 
[`pose_tracker.py`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/python/pose_tracker.py) + +\- [`pose_tracker.cxx`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/demo/csrc/cpp/pose_tracker.cxx) + +## 📚 常用功能 [🔝](#-table-of-contents) + +### 🚀 模型测速 [🔝](#-table-of-contents) + +如果需要测试模型在部署框架下的推理速度,MMDeploy 提供了方便的 `tools/profiler.py` 脚本。 + +用户需要准备一个存放测试图片的文件夹`./test_images`,profiler 将随机从该目录下抽取图片用于模型测速。 + +```shell +python tools/profiler.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../test_images \ + --model {WORK_DIR}/end2end.onnx \ + --shape 256x192 \ + --device cpu \ + --warmup 50 \ + --num-iter 200 +``` + +测试结果如下: + +```shell +01/30 15:06:35 - mmengine - INFO - [onnxruntime]-70 times per count: 8.73 ms, 114.50 FPS +01/30 15:06:36 - mmengine - INFO - [onnxruntime]-90 times per count: 9.05 ms, 110.48 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-110 times per count: 9.87 ms, 101.32 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-130 times per count: 9.99 ms, 100.10 FPS +01/30 15:06:38 - mmengine - INFO - [onnxruntime]-150 times per count: 10.39 ms, 96.29 FPS +01/30 15:06:39 - mmengine - INFO - [onnxruntime]-170 times per count: 10.77 ms, 92.86 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-190 times per count: 10.98 ms, 91.05 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-210 times per count: 11.19 ms, 89.33 FPS +01/30 15:06:41 - mmengine - INFO - [onnxruntime]-230 times per count: 11.16 ms, 89.58 FPS +01/30 15:06:42 - mmengine - INFO - [onnxruntime]-250 times per count: 11.06 ms, 90.41 FPS +----- Settings: ++------------+---------+ +| batch size | 1 | +| shape | 256x192 | +| iterations | 200 | +| warmup | 50 | ++------------+---------+ +----- Results: ++--------+------------+---------+ +| Stats | Latency/ms | FPS | ++--------+------------+---------+ +| Mean | 11.060 | 90.412 | +| Median | 11.852 | 84.375 | +| Min | 
7.812 | 128.007 | +| Max | 13.690 | 73.044 | ++--------+------------+---------+ +``` + +如果你希望详细了解 profiler 的更多参数设置与功能,可以前往 [Profiler Docs](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/useful_tools.html#profiler) + +### 📊 精度验证 [🔝](#-table-of-contents) + +如果需要测试模型在部署框架下的推理精度,MMDeploy 提供了方便的 `tools/test.py` 脚本。 + +```shell +python tools/test.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + --model {PATH_TO_MODEL}/rtmpose_m.pth \ + --device cpu +``` + +详细内容请参考 [MMDeploy Docs](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/zh_cn/02-how-to-run/profile_model.md) + +## 📜 引用 [🔝](#-table-of-contents) + +如果您觉得 RTMPose 对您的研究工作有所帮助,请考虑引用它: + +```bibtex +@misc{https://doi.org/10.48550/arxiv.2303.07399, + doi = {10.48550/ARXIV.2303.07399}, + url = {https://arxiv.org/abs/2303.07399}, + author = {Jiang, Tao and Lu, Peng and Zhang, Li and Ma, Ningsheng and Han, Rui and Lyu, Chengqi and Li, Yining and Chen, Kai}, + keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose}, + publisher = {arXiv}, + year = {2023}, + copyright = {Creative Commons Attribution 4.0 International} +} + +@misc{mmpose2020, + title={OpenMMLab Pose Estimation Toolbox and Benchmark}, + author={MMPose Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmpose}}, + year={2020} +} +``` diff --git a/projects/rtmpose/benchmark/README.md b/projects/rtmpose/benchmark/README.md new file mode 100644 index 0000000000..13fe9c183f --- /dev/null +++ b/projects/rtmpose/benchmark/README.md @@ -0,0 +1,116 @@ +# RTMPose Benchmarks + +English | [简体中文](./README_CN.md) + +Community users are welcome to contribute to this project directory by performing inference speed tests on different hardware devices. 
+ +Currently tested: + +- CPU + - Intel i7-11700 +- GPU + - NVIDIA GeForce 1660 Ti + - NVIDIA GeForce RTX 3090 +- Nvidia Jetson + - AGX Orin + - Orin NX +- ARM + - Snapdragon 865 + +## Body 2d (17 Keypoints) + +### Model Info + +| Config | Input Size | AP
(COCO) | Params(M) | FLOPS(G) | +| :-------------------------------------------------------------------------------: | :--------: | :---------------: | :-------: | :------: | +| [RTMPose-t](../rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 68.5 | 3.34 | 0.36 | +| [RTMPose-s](../rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 72.2 | 5.47 | 0.68 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 75.8 | 13.59 | 1.93 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 76.5 | 27.66 | 4.16 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py) | 384x288 | 77.0 | 13.72 | 4.33 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py) | 384x288 | 77.3 | 27.79 | 9.35 | + +### Speed Benchmark + +- Numbers displayed in the table are inference latencies in milliseconds (ms). + +| Config | Input Size | ORT
(i7-11700) | TRT-FP16
(GTX 1660Ti) | TRT-FP16
(RTX 3090) | ncnn-FP16
(Snapdragon 865) | TRT-FP16
(Jetson AGX Orin) | TRT-FP16
(Jetson Orin NX) | +| :---------: | :--------: | :--------------------: | :---------------------------: | :-------------------------: | :--------------------------------: | :--------------------------------: | :-------------------------------: | +| [RTMPose-t](../rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py) | 256x192 | 3.20 | 1.06 | 0.98 | 9.02 | 1.63 | 1.97 | +| [RTMPose-s](../rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 4.48 | 1.39 | 1.12 | 13.89 | 1.85 | 2.18 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 11.06 | 2.29 | 1.18 | 26.44 | 2.72 | 3.35 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 18.85 | 3.46 | 1.37 | 45.37 | 3.67 | 4.78 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py) | 384x288 | 24.78 | 3.66 | 1.20 | 26.44 | 3.45 | 5.08 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py) | 384x288 | - | 6.05 | 1.74 | - | 4.93 | 7.23 | + +## WholeBody 2d (133 Keypoints) + +### Model Info + +| Config | Input Size | Whole AP | Whole AR | FLOPS(G) | +| :------------------------------------------------------------------------------------------- | :--------: | :------: | :------: | :------: | +| [RTMPose-m](../rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 60.4 | 66.7 | 2.22 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 63.2 | 69.4 | 4.52 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 67.0 | 72.3 | 10.07 | + +### Speed Benchmark + +- Numbers displayed in the table are inference latencies in milliseconds (ms). +- Data from different community users are separated by `|`. + +| Config | Input Size | ORT
(i7-11700) | TRT-FP16
(GTX 1660Ti) | TRT-FP16
(RTX 3090) | TRT-FP16
(Jetson AGX Orin) | TRT-FP16
(Jetson Orin NX) | +| :-------------------------------------------- | :--------: | :--------------------: | :---------------------------: | :-------------------------: | :--------------------------------: | :-------------------------------: | +| [RTMPose-m](../rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 13.50 | 4.00 | 1.17 \| 1.84 | 2.79 | 3.51 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 23.41 | 5.67 | 1.44 \| 2.61 | 3.80 | 4.95 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 44.58 | 7.68 | 1.75 \| 4.24 | 5.08 | 7.20 | + +## How To Test Speed + +If you need to test the inference speed of the model under the deployment framework, MMDeploy provides a convenient `tools/profiler.py` script. + +The user needs to prepare a folder for the test images `./test_images`, the profiler will randomly read images from this directory for the model speed test. 
+ +```shell +python tools/profiler.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../test_images \ + --model {WORK_DIR}/end2end.onnx \ + --shape 256x192 \ + --device cpu \ + --warmup 50 \ + --num-iter 200 +``` + +The result is as follows: + +```shell +01/30 15:06:35 - mmengine - INFO - [onnxruntime]-70 times per count: 8.73 ms, 114.50 FPS +01/30 15:06:36 - mmengine - INFO - [onnxruntime]-90 times per count: 9.05 ms, 110.48 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-110 times per count: 9.87 ms, 101.32 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-130 times per count: 9.99 ms, 100.10 FPS +01/30 15:06:38 - mmengine - INFO - [onnxruntime]-150 times per count: 10.39 ms, 96.29 FPS +01/30 15:06:39 - mmengine - INFO - [onnxruntime]-170 times per count: 10.77 ms, 92.86 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-190 times per count: 10.98 ms, 91.05 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-210 times per count: 11.19 ms, 89.33 FPS +01/30 15:06:41 - mmengine - INFO - [onnxruntime]-230 times per count: 11.16 ms, 89.58 FPS +01/30 15:06:42 - mmengine - INFO - [onnxruntime]-250 times per count: 11.06 ms, 90.41 FPS +----- Settings: ++------------+---------+ +| batch size | 1 | +| shape | 256x192 | +| iterations | 200 | +| warmup | 50 | ++------------+---------+ +----- Results: ++--------+------------+---------+ +| Stats | Latency/ms | FPS | ++--------+------------+---------+ +| Mean | 11.060 | 90.412 | +| Median | 11.852 | 84.375 | +| Min | 7.812 | 128.007 | +| Max | 13.690 | 73.044 | ++--------+------------+---------+ +``` + +If you want to learn more about the profiler, you can refer to the [Profiler Docs](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/useful_tools.html#profiler). 
diff --git a/projects/rtmpose/benchmark/README_CN.md b/projects/rtmpose/benchmark/README_CN.md new file mode 100644 index 0000000000..08578f44f5 --- /dev/null +++ b/projects/rtmpose/benchmark/README_CN.md @@ -0,0 +1,116 @@ +# RTMPose Benchmarks + +简体中文 | [English](./README.md) + +欢迎社区用户在不同硬件设备上进行推理速度测试,贡献到本项目目录下。 + +当前已测试: + +- CPU + - Intel i7-11700 +- GPU + - NVIDIA GeForce 1660 Ti + - NVIDIA GeForce RTX 3090 +- Nvidia Jetson + - AGX Orin + - Orin NX +- ARM + - Snapdragon 865 + +### 人体 2d 关键点 (17 Keypoints) + +### Model Info + +| Config | Input Size | AP
(COCO) | Params(M) | FLOPS(G) | +| :-------------------------------------------------------------------------------: | :--------: | :---------------: | :-------: | :------: | +| [RTMPose-t](../rtmpose/body_2d_keypoint/rtmpose-tiny_8xb256-420e_coco-256x192.py) | 256x192 | 68.5 | 3.34 | 0.36 | +| [RTMPose-s](../rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 72.2 | 5.47 | 0.68 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 75.8 | 13.59 | 1.93 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 76.5 | 27.66 | 4.16 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py) | 384x288 | 77.0 | 13.72 | 4.33 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py) | 384x288 | 77.3 | 27.79 | 9.35 | + +### Speed Benchmark + +表中所示为模型推理时间,单位毫秒。 + +| Config | Input Size | ORT
(i7-11700) | TRT-FP16
(GTX 1660Ti) | TRT-FP16
(RTX 3090) | ncnn-FP16
(Snapdragon 865) | TRT-FP16
(Jetson AGX Orin) | TRT-FP16
(Jetson Orin NX) | +| :---------: | :--------: | :--------------------: | :---------------------------: | :-------------------------: | :--------------------------------: | :--------------------------------: | :-------------------------------: | +| [RTMPose-t](../rtmpose/body_2d_keypoint/rtmpose-tiny_8xb256-420e_coco-256x192.py) | 256x192 | 3.20 | 1.06 | 0.98 | 9.02 | 1.63 | 1.97 | +| [RTMPose-s](../rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 4.48 | 1.39 | 1.12 | 13.89 | 1.85 | 2.18 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py) | 256x192 | 11.06 | 2.29 | 1.18 | 26.44 | 2.72 | 3.35 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py) | 256x192 | 18.85 | 3.46 | 1.37 | 45.37 | 3.67 | 4.78 | +| [RTMPose-m](../rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py) | 384x288 | 24.78 | 3.66 | 1.20 | 26.44 | 3.45 | 5.08 | +| [RTMPose-l](../rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py) | 384x288 | - | 6.05 | 1.74 | - | 4.93 | 7.23 | + +### 人体全身 2d 关键点 (133 Keypoints) + +### Model Info + +| Config | Input Size | Whole AP | Whole AR | FLOPS(G) | +| :------------------------------------------------------------------------------------------- | :--------: | :------: | :------: | :------: | +| [RTMPose-m](../rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 60.4 | 66.7 | 2.22 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 63.2 | 69.4 | 4.52 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 67.0 | 72.3 | 10.07 | + +### Speed Benchmark + +- 表中所示为模型推理时间,单位毫秒。 +- 来自不同社区用户的测试数据用 `|` 分隔开。 + +| Config | Input Size | ORT
(i7-11700) | TRT-FP16
(GTX 1660Ti) | TRT-FP16
(RTX 3090) | TRT-FP16
(Jetson AGX Orin) | TRT-FP16
(Jetson Orin NX) | +| :-------------------------------------------- | :--------: | :--------------------: | :---------------------------: | :-------------------------: | :--------------------------------: | :-------------------------------: | +| [RTMPose-m](../rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 13.50 | 4.00 | 1.17 \| 1.84 | 2.79 | 3.51 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py) | 256x192 | 23.41 | 5.67 | 1.44 \| 2.61 | 3.80 | 4.95 | +| [RTMPose-l](../rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py) | 384x288 | 44.58 | 7.68 | 1.75 \| 4.24 | 5.08 | 7.20 | + +## 如何测试推理速度 + +我们使用 MMDeploy 提供的 `tools/profiler.py` 脚本进行模型测速。 + +用户需要准备一个存放测试图片的文件夹`./test_images`,profiler 将随机从该目录下抽取图片用于模型测速。 + +```shell +python tools/profiler.py \ + configs/mmpose/pose-detection_simcc_onnxruntime_dynamic.py \ + {RTMPOSE_PROJECT}/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py \ + ../test_images \ + --model {WORK_DIR}/end2end.onnx \ + --shape 256x192 \ + --device cpu \ + --warmup 50 \ + --num-iter 200 +``` + +The result is as follows: + +```shell +01/30 15:06:35 - mmengine - INFO - [onnxruntime]-70 times per count: 8.73 ms, 114.50 FPS +01/30 15:06:36 - mmengine - INFO - [onnxruntime]-90 times per count: 9.05 ms, 110.48 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-110 times per count: 9.87 ms, 101.32 FPS +01/30 15:06:37 - mmengine - INFO - [onnxruntime]-130 times per count: 9.99 ms, 100.10 FPS +01/30 15:06:38 - mmengine - INFO - [onnxruntime]-150 times per count: 10.39 ms, 96.29 FPS +01/30 15:06:39 - mmengine - INFO - [onnxruntime]-170 times per count: 10.77 ms, 92.86 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-190 times per count: 10.98 ms, 91.05 FPS +01/30 15:06:40 - mmengine - INFO - [onnxruntime]-210 times per count: 11.19 ms, 89.33 FPS +01/30 15:06:41 - mmengine - INFO - [onnxruntime]-230 times per count: 11.16 ms, 
89.58 FPS +01/30 15:06:42 - mmengine - INFO - [onnxruntime]-250 times per count: 11.06 ms, 90.41 FPS +----- Settings: ++------------+---------+ +| batch size | 1 | +| shape | 256x192 | +| iterations | 200 | +| warmup | 50 | ++------------+---------+ +----- Results: ++--------+------------+---------+ +| Stats | Latency/ms | FPS | ++--------+------------+---------+ +| Mean | 11.060 | 90.412 | +| Median | 11.852 | 84.375 | +| Min | 7.812 | 128.007 | +| Max | 13.690 | 73.044 | ++--------+------------+---------+ +``` + +If you want to learn more details of profiler, you can refer to the [Profiler Docs](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/useful_tools.html#profiler). diff --git a/projects/rtmpose/rtmdet/person/rtmdet_m_640-8xb32_coco-person.py b/projects/rtmpose/rtmdet/person/rtmdet_m_640-8xb32_coco-person.py new file mode 100644 index 0000000000..620de8dc8f --- /dev/null +++ b/projects/rtmpose/rtmdet/person/rtmdet_m_640-8xb32_coco-person.py @@ -0,0 +1,20 @@ +_base_ = 'mmdet::rtmdet/rtmdet_m_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + bbox_head=dict(num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) + +val_dataloader = dict(dataset=dict(metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader diff --git a/projects/rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py b/projects/rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py new file mode 100644 index 0000000000..b93d651735 --- /dev/null +++ b/projects/rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py @@ -0,0 +1,110 @@ +_base_ = 
'mmdet::rtmdet/rtmdet_l_8xb32-300e_coco.py' + +input_shape = 320 + +model = dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.25, + use_depthwise=True, + ), + neck=dict( + in_channels=[64, 128, 256], + out_channels=64, + num_csp_blocks=1, + use_depthwise=True, + ), + bbox_head=dict( + in_channels=64, + feat_channels=64, + share_conv=False, + exp_on_reg=False, + use_depthwise=True, + num_classes=1), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_pipeline = [ + dict( + type='LoadImageFromFile', + file_client_args={{_base_.file_client_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='CachedMosaic', + img_scale=(input_shape, input_shape), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type='RandomResize', + scale=(input_shape * 2, input_shape * 2), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict( + type='LoadImageFromFile', + file_client_args={{_base_.file_client_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(input_shape, input_shape), + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(input_shape, input_shape)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Pad', + size=(input_shape, input_shape), + pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + file_client_args={{_base_.file_client_args}}), + dict(type='Resize', scale=(input_shape, input_shape), keep_ratio=True), + dict( + type='Pad', + size=(input_shape, input_shape), + 
pad_val=dict(img=(114, 114, 114))), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + dataset=dict(pipeline=train_pipeline, metainfo=dict(classes=('person', )))) + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=('person', )))) +test_dataloader = val_dataloader + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/projects/rtmpose/rtmpose/animal_2d_keypoint/rtmpose-m_8xb64-210e_ap10k-256x256.py b/projects/rtmpose/rtmpose/animal_2d_keypoint/rtmpose-m_8xb64-210e_ap10k-256x256.py new file mode 100644 index 0000000000..0fa5c5d30c --- /dev/null +++ b/projects/rtmpose/rtmpose/animal_2d_keypoint/rtmpose-m_8xb64-210e_ap10k-256x256.py @@ -0,0 +1,246 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + 
normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'AP10KDataset' +data_mode = 'topdown' +data_root = 'data/ap10k/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/', +# f'{data_root}': 's3://openmmlab/datasets/pose/ap10k/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ 
+ dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-train-split1.json', + data_prefix=dict(img='data/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-val-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) 
+test_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/ap10k-test-split1.json', + data_prefix=dict(img='data/'), + test_mode=True, + pipeline=val_pipeline, + )) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-val-split1.json') +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/ap10k-test-split1.json') diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..b44df792a1 --- /dev/null +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + 
begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', 
file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, 
+ )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py new file mode 100644 index 0000000000..2468c40d53 --- /dev/null +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-l_8xb256-420e_coco-384x288.py @@ -0,0 +1,232 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 
210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# 
pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..c7e3061c53 --- /dev/null +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + 
by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# 
f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py new file mode 100644 index 0000000000..16a7b0c493 --- /dev/null +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py @@ -0,0 +1,232 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate 
+param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# 
path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..dca589bef9 --- /dev/null +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-s_8xb256-420e_coco-256x192.py @@ -0,0 +1,232 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + 
norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-s_udp-aic-coco_210e-256x192-92f5a029_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=512, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = 
dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..cd16e0a98a --- /dev/null +++ b/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-t_8xb256-420e_coco-256x192.py @@ -0,0 +1,233 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 420 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + 
optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 210 to 420 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-tiny_udp-aic-coco_210e-256x192-cbed682d_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=384, + out_channels=17, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True)) + +# base dataset settings +dataset_type = 
'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') 
+] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file=f'{data_root}person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # Turn off EMA while training the tiny model + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/face_2d_keypoint/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py b/projects/rtmpose/rtmpose/face_2d_keypoint/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py new file mode 100644 index 0000000000..dba43a7d72 --- /dev/null +++ b/projects/rtmpose/rtmpose/face_2d_keypoint/rtmpose-m_8xb32-60e_coco-wholebody-face-256x256.py @@ -0,0 +1,232 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 60 +stage2_num_epochs = 10 +base_lr = 
4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 30 to 60 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=68, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', +
use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyFaceDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + 
max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='NME', rule='less', max_keep_ckpts=1, interval=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='NME', + norm_mode='keypoint_distance', +) +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/hand_2d_keypoint/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py b/projects/rtmpose/rtmpose/hand_2d_keypoint/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py new file mode 100644 index 0000000000..63049aa4d1 --- /dev/null +++ b/projects/rtmpose/rtmpose/hand_2d_keypoint/rtmpose-m_8xb32-210e_coco-wholebody-hand-256x256.py @@ -0,0 +1,233 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg 
= dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=256) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(256, 256), + sigma=(5.66, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=21, + input_size=codec['input_size'], + in_featuremap_size=(8, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', +
use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyHandDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.5, 1.5], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + # dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=180), + dict(type='RandomFlip', direction='horizontal'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + 
max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='AUC', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = [ + dict(type='PCKAccuracy', thr=0.2), + dict(type='AUC'), + dict(type='EPE') +] +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-l_udp_8xb256-210e_coco-256x192.py b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-l_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..148976f792 --- /dev/null +++ b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-l_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = 
dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth')), + head=dict( + type='HeatmapHead', + in_channels=1024, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 
's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-m_udp_8xb256-210e_coco-256x192.py b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-m_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..b42699fd11 --- /dev/null +++ b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-m_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + 
paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth')), + head=dict( + type='HeatmapHead', + in_channels=768, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', 
file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + 
data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-s_udp_8xb256-210e_coco-256x192.py b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-s_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..0a458406ff --- /dev/null +++ b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-s_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + 
start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-s_imagenet_600e-ea671761.pth')), + head=dict( + type='HeatmapHead', + in_channels=512, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', 
scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', 
shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-tiny_udp_8xb256-210e_coco-256x192.py b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-tiny_udp_8xb256-210e_coco-256x192.py new file mode 100644 index 0000000000..adde3c0af3 --- /dev/null +++ b/projects/rtmpose/rtmpose/pretrain_cspnext_udp/cspnext-tiny_udp_8xb256-210e_coco-256x192.py @@ -0,0 +1,214 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 210 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 105 to 210 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + 
T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=1024) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.167, + widen_factor=0.375, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'rtmdet/cspnext_rsb_pretrain/' + 'cspnext-tiny_imagenet_600e-3a2dd350.pth')), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=False, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +dataset_type = 'CocoDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ 
+ dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=256, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + # 
bbox_file='data/coco/person_detection_results/' + # 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + # dict( + # type='EMAHook', + # ema_type='ExpMomentumEMA', + # momentum=0.0002, + # update_buffers=True, + # priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/pruning/README.md b/projects/rtmpose/rtmpose/pruning/README.md new file mode 100644 index 0000000000..28be530cc1 --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/README.md @@ -0,0 +1,117 @@ +# GroupFisher Pruning for RTMPose + +# Description + +We try to apply a pruning algorithm to RTMPose models. In detail, we prune an RTMPose model down to the same size as a smaller RTMPose model, e.g. pruning RTMPose-S to the size of RTMPose-T. +The experiments show that the pruned model has better performance (AP) than the RTMPose model of similar size and inference speed. + +Concretely, we select RTMPose-S as the base model and prune it to the size of RTMPose-T, using the GroupFisher pruning algorithm, which is able to determine the pruning structure automatically. +Furthermore, we provide two versions of the pruned model: one using only the coco dataset and one using both the coco and ai-challenge datasets. 
+ +# Results and Models + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | Flops | Params | ckpt | log | +| :-------------------------------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :---: | :----: | :---------------------------------------: | :------------: | +| [rtmpose-s-pruned](./group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.691 | 0.885 | 0.765 | 0.745 | 0.925 | 0.34 | 3.42 | [pruned][rp_sc_p] \| [finetuned][rp_sc_f] | [log][rp_sc_l] | +| [rtmpose-s-aic-coco-pruned](./group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.694 | 0.884 | 0.771 | 0.747 | 0.922 | 0.35 | 3.43 | [pruned][rp_sa_p] \| [finetuned][rp_sa_f] | [log][rp_sa_l] | + +## Get Started + +There are three steps to apply GroupFisher to your model: Prune, Finetune, and Deploy. + +Note: please use torch>=1.12, as we need fxtracer to parse the models automatically. + +### Prune + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh \ + {config_folder}/group_fisher_{normalization_type}_prune_{model_name}.py 8 \ + --work-dir $WORK_DIR +``` + +In the pruning config file, you have to fill in some args as below. + +```python +""" +_base_ (str): The path to your pretrain config file. +pretrained_path (str): The path to your pretrained model checkpoint. + +interval (int): Interval between pruning two channels. You should ensure you + can reach your target pruning ratio when the training ends. +normalization_type (str): GroupFisher uses two methods to normalize the channel + importance, including ['flops','act']. The former uses flops, while the + latter uses the memory occupation of activation feature maps. +lr_ratio (float): Ratio to decrease lr rate. As pruning progress is unstable, + you need to decrease the original lr rate until the pruning training works + steadily without getting nan. 
+ +target_flop_ratio (float): The target flop ratio to prune your model. +input_shape (Tuple): input shape to measure the flops. +""" +``` + +After the pruning process, you will get a checkpoint of the pruned model named flops\_{target_flop_ratio}.pth in your workdir. + +### Finetune + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh \ + {config_folder}/group_fisher_{normalization_type}_finetune_{model_name}.py 8 \ + --work-dir $WORK_DIR +``` + +There are also some args for you to fill in the config file as below. + +```python +""" +_base_ (str): The path to your pruning config file. +pruned_path (str): The path to the checkpoint of the pruned model. +finetune_lr (float): The lr rate to finetune. Usually, we directly use the lr + rate of the pretrain. +""" +``` + +After finetuning, besides the checkpoint of the best model, there is also a fix_subnet.json, which records the pruned model structure. It will be used when deploying. + +### Test + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_test.sh \ + {config_folder}/group_fisher_{normalization_type}_finetune_{model_name}.py {checkpoint_path} 8 +``` + +### Deploy + +For a pruned model, you only need to use the pruning deploy config instead of the pretrain config to deploy the pruned version of your model. If you are not familiar with mmdeploy, it's recommended to refer to the [MMDeploy document](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/convert_model.html). + +```bash +python {mmdeploy}/tools/deploy.py \ + {mmdeploy}/{mmdeploy_config}.py \ + {config_folder}/group_fisher_{normalization_type}_deploy_{model_name}.py \ + {path_to_finetuned_checkpoint}.pth \ + {mmdeploy}/tests/data/tiger.jpeg +``` + +The deploy config has some args as below: + +```python +""" +_base_ (str): The path to your pretrain config file. +fix_subnet (Union[dict,str]): The dict storing the pruning structure or the + json file containing it. 
+divisor (int): The divisor to make the channel number divisible. +""" +``` + +The divisor is important for the actual inference speed, and we suggest testing values in \[1,2,4,8,16,32\] to find the fastest divisor. + +## Reference + +[GroupFisher in MMRazor](https://github.com/open-mmlab/mmrazor/tree/dev-1.x/configs/pruning/base/group_fisher) + +[rp_sa_f]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth +[rp_sa_l]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.json +[rp_sa_p]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth +[rp_sc_f]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.pth +[rp_sc_l]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.json +[rp_sc_p]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.pth diff --git a/projects/rtmpose/rtmpose/pruning/README_CN.md b/projects/rtmpose/rtmpose/pruning/README_CN.md new file mode 100644 index 0000000000..945160b246 --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/README_CN.md @@ -0,0 +1,116 @@ +# 使用GroupFisher剪枝RTMPose + +# 概述 + +我们尝试使用 GroupFisher 算法对 RTMPose 模型进行剪枝。具体来说,我们将一个 RTMPose 模型剪枝到与较小的 RTMPose 模型相同的大小,例如将 RTMPose-S 剪枝到 RTMPose-T 的大小。 +实验表明,剪枝后的模型比具有相似大小和推理速度的 RTMPose 模型具有更好的性能(AP)。 + +我们使用能自动确定剪枝结构的 GroupFisher 剪枝算法,将 RTMPose-S 剪枝到 RTMPose-T 的大小。 +此外,我们提供了两个版本的剪枝模型,其中一个只使用 coco 数据集,另一个同时使用 coco 和 ai-challenge 数据集。 + +# 实验结果 + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | Flops | Params | ckpt | log | +| :-------------------------------------------------------------------- | :--------: | 
:---: | :-------------: | :-------------: | :---: | :-------------: | :---: | :----: | :---------------------------------------: | :------------: | +| [rtmpose-s-pruned](./group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.py) | 256x192 | 0.691 | 0.885 | 0.765 | 0.745 | 0.925 | 0.34 | 3.42 | [pruned][rp_sc_p] \| [finetuned][rp_sc_f] | [log][rp_sc_l] | +| [rtmpose-s-aic-coco-pruned](./group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.py) | 256x192 | 0.694 | 0.884 | 0.771 | 0.747 | 0.922 | 0.35 | 3.43 | [pruned][rp_sa_p] \| [finetuned][rp_sa_f] | [log][rp_sa_l] | + +## Get Started + +我们需要三个步骤来将 GroupFisher 应用于你的模型,包括剪枝(Prune),微调(Finetune),部署(Deploy)。 +注意:请使用torch>=1.12,因为我们需要fxtracer来自动解析模型。 + +### Prune + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh \ + {config_folder}/group_fisher_{normalization_type}_prune_{model_name}.py 8 \ + --work-dir $WORK_DIR +``` + +在剪枝配置文件中,你需要填写以下参数。 + +```python +""" +_base_ (str): The path to your pretrain config file. +pretrained_path (str): The path to your pretrained model checkpoint. + +interval (int): Interval between pruning two channels. You should ensure you + can reach your target pruning ratio when the training ends. +normalization_type (str): GroupFisher uses two methods to normalize the channel + importance, including ['flops','act']. The former uses flops, while the + latter uses the memory occupation of activation feature maps. +lr_ratio (float): Ratio to decrease lr rate. As pruning progress is unstable, + you need to decrease the original lr rate until the pruning training works + steadily without getting nan. + +target_flop_ratio (float): The target flop ratio to prune your model. +input_shape (Tuple): input shape to measure the flops. 
+""" +``` + +在剪枝结束后,你将获得一个剪枝模型的 checkpoint,该 checkpoint 的名称为 flops\_{target_flop_ratio}.pth,位于你的 workdir 中。 + +### Finetune + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh \ + {config_folder}/group_fisher_{normalization_type}_finetune_{model_name}.py 8 \ + --work-dir $WORK_DIR +``` + +微调时也有一些参数需要你填写。 + +```python +""" +_base_ (str): The path to your pruning config file. +pruned_path (str): The path to the checkpoint of the pruned model. +finetune_lr (float): The lr rate to finetune. Usually, we directly use the lr + rate of the pretrain. +""" +``` + +在微调结束后,除了最佳模型的 checkpoint 外,还有一个 fix_subnet.json,它记录了剪枝模型的结构。它将在部署时使用。 + +### Test + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_test.sh \ + {config_folder}/group_fisher_{normalization_type}_finetune_{model_name}.py {checkpoint_path} 8 +``` + +### Deploy + +对于剪枝模型,你只需要使用剪枝部署 config 来代替预训练 config 来部署模型的剪枝版本。如果你不熟悉 MMDeploy,请参看[MMDeploy document](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/convert_model.html)。 + +```bash +python {mmdeploy}/tools/deploy.py \ + {mmdeploy}/{mmdeploy_config}.py \ + {config_folder}/group_fisher_{normalization_type}_deploy_{model_name}.py \ + {path_to_finetuned_checkpoint}.pth \ + {mmdeploy}/tests/data/tiger.jpeg +``` + +部署配置文件有如下参数: + +```python +""" +_base_ (str): The path to your pretrain config file. +fix_subnet (Union[dict,str]): The dict storing the pruning structure or the + json file containing it. +divisor (int): The divisor to make the channel number divisible. 
+""" +``` + +divisor 设置十分重要,我们建议你尝试 \[1,2,4,8,16,32\],以找到最佳设置。 + +## Reference + +[GroupFisher in MMRazor](https://github.com/open-mmlab/mmrazor/tree/dev-1.x/configs/pruning/base/group_fisher) + +[rp_sa_f]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth +[rp_sa_l]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.json +[rp_sa_p]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth +[rp_sc_f]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.pth +[rp_sc_l]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.json +[rp_sc_p]: https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.pth diff --git a/projects/rtmpose/rtmpose/pruning/group_fisher_deploy_rtmpose-s_8xb256-420e_aic-coco-256x192.py b/projects/rtmpose/rtmpose/pruning/group_fisher_deploy_rtmpose-s_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..3c720566f0 --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/group_fisher_deploy_rtmpose-s_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,53 @@ +############################################################################# +"""You have to fill these args. + +_base_ (str): The path to your pretrain config file. +fix_subnet (Union[dict,str]): The dict storing the pruning structure or the + json file including it. +divisor (int): The divisor to make the channel number divisible.
+""" + +_base_ = 'mmpose::body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py' # noqa +fix_subnet = { + 'backbone.stem.0.conv_(0, 16)_16': 8, + 'backbone.stem.1.conv_(0, 16)_16': 9, + 'backbone.stem.2.conv_(0, 32)_32': 9, + 'backbone.stage1.0.conv_(0, 64)_64': 32, + 'backbone.stage1.1.short_conv.conv_(0, 32)_32': 30, + 'backbone.stage1.1.main_conv.conv_(0, 32)_32': 29, + 'backbone.stage1.1.blocks.0.conv1.conv_(0, 32)_32': 24, + 'backbone.stage1.1.final_conv.conv_(0, 64)_64': 27, + 'backbone.stage2.0.conv_(0, 128)_128': 62, + 'backbone.stage2.1.short_conv.conv_(0, 64)_64': 63, + 'backbone.stage2.1.main_conv.conv_(0, 64)_64': 64, + 'backbone.stage2.1.blocks.0.conv1.conv_(0, 64)_64': 56, + 'backbone.stage2.1.blocks.1.conv1.conv_(0, 64)_64': 62, + 'backbone.stage2.1.final_conv.conv_(0, 128)_128': 65, + 'backbone.stage3.0.conv_(0, 256)_256': 167, + 'backbone.stage3.1.short_conv.conv_(0, 128)_128': 127, + 'backbone.stage3.1.main_conv.conv_(0, 128)_128': 128, + 'backbone.stage3.1.blocks.0.conv1.conv_(0, 128)_128': 124, + 'backbone.stage3.1.blocks.1.conv1.conv_(0, 128)_128': 123, + 'backbone.stage3.1.final_conv.conv_(0, 256)_256': 172, + 'backbone.stage4.0.conv_(0, 512)_512': 337, + 'backbone.stage4.1.conv1.conv_(0, 256)_256': 256, + 'backbone.stage4.1.conv2.conv_(0, 512)_512': 379, + 'backbone.stage4.2.short_conv.conv_(0, 256)_256': 188, + 'backbone.stage4.2.main_conv.conv_(0, 256)_256': 227, + 'backbone.stage4.2.blocks.0.conv1.conv_(0, 256)_256': 238, + 'backbone.stage4.2.blocks.0.conv2.pointwise_conv.conv_(0, 256)_256': 195, + 'backbone.stage4.2.final_conv.conv_(0, 512)_512': 163 +} +divisor = 8 +############################################################################## + +architecture = _base_.model + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='GroupFisherDeploySubModel', + architecture=architecture, + fix_subnet=fix_subnet, + divisor=divisor, +) diff --git 
a/projects/rtmpose/rtmpose/pruning/group_fisher_deploy_rtmpose-s_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/pruning/group_fisher_deploy_rtmpose-s_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..64fa6c2b6b --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/group_fisher_deploy_rtmpose-s_8xb256-420e_coco-256x192.py @@ -0,0 +1,53 @@ +############################################################################# +"""You have to fill these args. + +_base_(str): The path to your pretrain config file. +fix_subnet (Union[dict,str]): The dict store the pruning structure or the + json file including it. +divisor (int): The divisor the make the channel number divisible. +""" + +_base_ = 'mmpose::body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py' # noqa +fix_subnet = { + 'backbone.stem.0.conv_(0, 16)_16': 8, + 'backbone.stem.1.conv_(0, 16)_16': 10, + 'backbone.stem.2.conv_(0, 32)_32': 11, + 'backbone.stage1.0.conv_(0, 64)_64': 32, + 'backbone.stage1.1.short_conv.conv_(0, 32)_32': 32, + 'backbone.stage1.1.main_conv.conv_(0, 32)_32': 23, + 'backbone.stage1.1.blocks.0.conv1.conv_(0, 32)_32': 25, + 'backbone.stage1.1.final_conv.conv_(0, 64)_64': 25, + 'backbone.stage2.0.conv_(0, 128)_128': 71, + 'backbone.stage2.1.short_conv.conv_(0, 64)_64': 61, + 'backbone.stage2.1.main_conv.conv_(0, 64)_64': 62, + 'backbone.stage2.1.blocks.0.conv1.conv_(0, 64)_64': 57, + 'backbone.stage2.1.blocks.1.conv1.conv_(0, 64)_64': 59, + 'backbone.stage2.1.final_conv.conv_(0, 128)_128': 69, + 'backbone.stage3.0.conv_(0, 256)_256': 177, + 'backbone.stage3.1.short_conv.conv_(0, 128)_128': 122, + 'backbone.stage3.1.main_conv.conv_(0, 128)_128': 123, + 'backbone.stage3.1.blocks.0.conv1.conv_(0, 128)_128': 125, + 'backbone.stage3.1.blocks.1.conv1.conv_(0, 128)_128': 123, + 'backbone.stage3.1.final_conv.conv_(0, 256)_256': 171, + 'backbone.stage4.0.conv_(0, 512)_512': 351, + 'backbone.stage4.1.conv1.conv_(0, 256)_256': 256, + 'backbone.stage4.1.conv2.conv_(0, 
512)_512': 367, + 'backbone.stage4.2.short_conv.conv_(0, 256)_256': 183, + 'backbone.stage4.2.main_conv.conv_(0, 256)_256': 216, + 'backbone.stage4.2.blocks.0.conv1.conv_(0, 256)_256': 238, + 'backbone.stage4.2.blocks.0.conv2.pointwise_conv.conv_(0, 256)_256': 195, + 'backbone.stage4.2.final_conv.conv_(0, 512)_512': 187 +} +divisor = 16 +############################################################################## + +architecture = _base_.model + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='GroupFisherDeploySubModel', + architecture=architecture, + fix_subnet=fix_subnet, + divisor=divisor, +) diff --git a/projects/rtmpose/rtmpose/pruning/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.py b/projects/rtmpose/rtmpose/pruning/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..b4fb4f827c --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/group_fisher_finetune_rtmpose-s_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,32 @@ +############################################################################# +"""# You have to fill these args. + +_base_(str): The path to your pruning config file. +pruned_path (str): The path to the checkpoint of the pruned model. +finetune_lr (float): The lr rate to finetune. Usually, we directly use the lr + rate of the pretrain. 
+""" + +_base_ = './group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.py' # noqa +pruned_path = 'https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.pth' # noqa +finetune_lr = 4e-3 +############################################################################## + +algorithm = _base_.model +algorithm.init_cfg = dict(type='Pretrained', checkpoint=pruned_path) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='GroupFisherSubModel', + algorithm=algorithm, +) + +# restore lr +optim_wrapper = dict(optimizer=dict(lr=finetune_lr)) + +# remove pruning related hooks +custom_hooks = _base_.custom_hooks[:-2] + +# delete ddp +model_wrapper_cfg = None diff --git a/projects/rtmpose/rtmpose/pruning/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/pruning/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..5cc6db15e4 --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/group_fisher_finetune_rtmpose-s_8xb256-420e_coco-256x192.py @@ -0,0 +1,33 @@ +############################################################################# +"""# You have to fill these args. + +_base_(str): The path to your pruning config file. +pruned_path (str): The path to the checkpoint of the pruned model. +finetune_lr (float): The lr rate to finetune. Usually, we directly use the lr + rate of the pretrain. 
+""" + +_base_ = './group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.py' +pruned_path = 'https://download.openmmlab.com/mmrazor/v1/pruning/group_fisher/rtmpose-s/group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.pth' # noqa +finetune_lr = 4e-3 +############################################################################## + +algorithm = _base_.model +algorithm.init_cfg = dict(type='Pretrained', checkpoint=pruned_path) +# algorithm.update(dict(architecture=dict(test_cfg=dict(flip_test=False), ))) # disable flip test # noqa + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='GroupFisherSubModel', + algorithm=algorithm, +) + +# restore lr +optim_wrapper = dict(optimizer=dict(lr=finetune_lr)) + +# remove pruning related hooks +custom_hooks = _base_.custom_hooks[:-2] + +# delete ddp +model_wrapper_cfg = None diff --git a/projects/rtmpose/rtmpose/pruning/group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.py b/projects/rtmpose/rtmpose/pruning/group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.py new file mode 100644 index 0000000000..14bdc96f5e --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/group_fisher_prune_rtmpose-s_8xb256-420e_aic-coco-256x192.py @@ -0,0 +1,75 @@ +############################################################################# +"""You have to fill these args. + +_base_ (str): The path to your pretrained model checkpoint. +pretrained_path (str): The path to your pretrained model checkpoint. + +interval (int): Interval between pruning two channels. You should ensure you + can reach your target pruning ratio when the training ends. +normalization_type (str): GroupFisher uses two methods to normlized the channel + importance, including ['flops','act']. The former uses flops, while the + latter uses the memory occupation of activation feature maps. +lr_ratio (float): Ratio to decrease lr rate. 
As pruning progress is unstable, + you need to decrease the original lr rate until the pruning training work + steadly without getting nan. + +target_flop_ratio (float): The target flop ratio to prune your model. +input_shape (Tuple): input shape to measure the flops. +""" + +_base_ = 'mmpose::body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_aic-coco-256x192.py' # noqa +pretrained_path = 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-aic-coco_pt-aic-coco_420e-256x192-fcb2599b_20230126.pth' # noqa + +interval = 10 +normalization_type = 'act' +lr_ratio = 0.1 + +target_flop_ratio = 0.51 +input_shape = (1, 3, 256, 192) +############################################################################## + +architecture = _base_.model + +if hasattr(_base_, 'data_preprocessor'): + architecture.update({'data_preprocessor': _base_.data_preprocessor}) + data_preprocessor = None + +architecture.init_cfg = dict(type='Pretrained', checkpoint=pretrained_path) +architecture['_scope_'] = _base_.default_scope + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='GroupFisherAlgorithm', + architecture=architecture, + interval=interval, + mutator=dict( + type='GroupFisherChannelMutator', + parse_cfg=dict(type='ChannelAnalyzer', tracer_type='FxTracer'), + channel_unit_cfg=dict( + type='GroupFisherChannelUnit', + default_args=dict(normalization_type=normalization_type, ), + ), + ), +) + +model_wrapper_cfg = dict( + type='mmrazor.GroupFisherDDP', + broadcast_buffers=False, +) + +optim_wrapper = dict( + optimizer=dict(lr=_base_.optim_wrapper.optimizer.lr * lr_ratio)) + +custom_hooks = getattr(_base_, 'custom_hooks', []) + [ + dict(type='mmrazor.PruningStructureHook'), + dict( + type='mmrazor.ResourceInfoHook', + interval=interval, + demo_input=dict( + type='mmrazor.DefaultDemoInput', + input_shape=input_shape, + ), + save_ckpt_thr=[target_flop_ratio], + ), +] diff --git 
a/projects/rtmpose/rtmpose/pruning/group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.py b/projects/rtmpose/rtmpose/pruning/group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000..5a998e5934 --- /dev/null +++ b/projects/rtmpose/rtmpose/pruning/group_fisher_prune_rtmpose-s_8xb256-420e_coco-256x192.py @@ -0,0 +1,75 @@ +############################################################################# +"""You have to fill these args. + +_base_ (str): The path to your pretrained model checkpoint. +pretrained_path (str): The path to your pretrained model checkpoint. + +interval (int): Interval between pruning two channels. You should ensure you + can reach your target pruning ratio when the training ends. +normalization_type (str): GroupFisher uses two methods to normlized the channel + importance, including ['flops','act']. The former uses flops, while the + latter uses the memory occupation of activation feature maps. +lr_ratio (float): Ratio to decrease lr rate. As pruning progress is unstable, + you need to decrease the original lr rate until the pruning training work + steadly without getting nan. + +target_flop_ratio (float): The target flop ratio to prune your model. +input_shape (Tuple): input shape to measure the flops. 
+""" + +_base_ = 'mmpose::body_2d_keypoint/rtmpose/coco/rtmpose-s_8xb256-420e_coco-256x192.py' # noqa +pretrained_path = 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmpose-s_simcc-coco_pt-aic-coco_420e-256x192-8edcf0d7_20230127.pth' # noqa + +interval = 10 +normalization_type = 'act' +lr_ratio = 0.1 + +target_flop_ratio = 0.51 +input_shape = (1, 3, 256, 192) +############################################################################## + +architecture = _base_.model + +if hasattr(_base_, 'data_preprocessor'): + architecture.update({'data_preprocessor': _base_.data_preprocessor}) + data_preprocessor = None + +architecture.init_cfg = dict(type='Pretrained', checkpoint=pretrained_path) +architecture['_scope_'] = _base_.default_scope + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='GroupFisherAlgorithm', + architecture=architecture, + interval=interval, + mutator=dict( + type='GroupFisherChannelMutator', + parse_cfg=dict(type='ChannelAnalyzer', tracer_type='FxTracer'), + channel_unit_cfg=dict( + type='GroupFisherChannelUnit', + default_args=dict(normalization_type=normalization_type, ), + ), + ), +) + +model_wrapper_cfg = dict( + type='mmrazor.GroupFisherDDP', + broadcast_buffers=False, +) + +optim_wrapper = dict( + optimizer=dict(lr=_base_.optim_wrapper.optimizer.lr * lr_ratio)) + +custom_hooks = getattr(_base_, 'custom_hooks', []) + [ + dict(type='mmrazor.PruningStructureHook'), + dict( + type='mmrazor.ResourceInfoHook', + interval=interval, + demo_input=dict( + type='mmrazor.DefaultDemoInput', + input_shape=input_shape, + ), + save_ckpt_thr=[target_flop_ratio], + ), +] diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py new file mode 100644 index 0000000000..83f1bdce00 --- /dev/null +++ b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb32-270e_coco-wholebody-384x288.py @@ 
-0,0 +1,231 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(288, 384), + sigma=(6., 6.93), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(9, 12), + simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + 
dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + 
transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 0000000000..a060d59a40 --- /dev/null +++ 
b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-l_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,231 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1., + widen_factor=1., + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=1024, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + simcc_split_ratio=codec['simcc_split_ratio'], + 
final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), + dict(type='TopdownAffine', 
input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py new file mode 100644 index 
0000000000..f1f86f24b7 --- /dev/null +++ b/projects/rtmpose/rtmpose/wholebody_2d_keypoint/rtmpose-m_8xb64-270e_coco-wholebody-256x192.py @@ -0,0 +1,231 @@ +_base_ = ['mmpose::_base_/default_runtime.py'] + +# runtime +max_epochs = 270 +stage2_num_epochs = 30 +base_lr = 4e-3 + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +randomness = dict(seed=21) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# codec settings +codec = dict( + type='SimCCLabel', + input_size=(192, 256), + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False, + use_dark=False) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + _scope_='mmdet', + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=0.67, + widen_factor=0.75, + out_indices=(4, ), + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU'), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint='https://download.openmmlab.com/mmpose/v1/projects/' + 'rtmpose/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth' # noqa + )), + head=dict( + type='RTMCCHead', + in_channels=768, + out_channels=133, + input_size=codec['input_size'], + in_featuremap_size=(6, 8), + 
simcc_split_ratio=codec['simcc_split_ratio'], + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True), + decoder=codec), + test_cfg=dict(flip_test=True, )) + +# base dataset settings +dataset_type = 'CocoWholeBodyDataset' +data_mode = 'topdown' +data_root = 'data/coco/' + +file_client_args = dict(backend='disk') +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/', +# f'{data_root}': 's3://openmmlab/datasets/detection/coco/' +# })) + +# pipelines +train_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=1.0), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='PackPoseInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImage', file_client_args=file_client_args), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict( + type='RandomBBoxTransform', + shift_factor=0., + scale_factor=[0.75, 1.25], + rotate_factor=60), 
+ dict(type='TopdownAffine', input_size=codec['input_size']), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='Albumentation', + transforms=[ + dict(type='Blur', p=0.1), + dict(type='MedianBlur', p=0.1), + dict( + type='CoarseDropout', + max_holes=1, + max_height=0.4, + max_width=0.4, + min_holes=1, + min_height=0.2, + min_width=0.2, + p=0.5), + ]), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=10, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_train_v1.0.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=10, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/coco_wholebody_val_v1.0.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# hooks +default_hooks = dict( + checkpoint=dict( + save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +# evaluators +val_evaluator = dict( + type='CocoWholeBodyMetric', + ann_file=data_root + 'annotations/coco_wholebody_val_v1.0.json') +test_evaluator = val_evaluator diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt index 580af80e5e..7b85a2f3f6 100644 --- a/requirements/mminstall.txt +++ b/requirements/mminstall.txt @@ -1,2 +1,2 @@ mmcv>=2.0.0rc1 
-mmengine +mmengine>=0.4.0,<1.0.0 diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt index ba5d0bfd79..6615950400 100644 --- a/requirements/readthedocs.txt +++ b/requirements/readthedocs.txt @@ -1,5 +1,5 @@ mmcv>=2.0.0rc1 -mmengine +mmengine>=0.4.0,<1.0.0 munkres regex scipy diff --git a/requirements/runtime.txt b/requirements/runtime.txt index ed0648440c..ab5c0172e4 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,5 +1,4 @@ chumpy -dataclasses; python_version == '3.6' json_tricks matplotlib munkres diff --git a/setup.py b/setup.py index b436a7431d..7222188e2f 100644 --- a/setup.py +++ b/setup.py @@ -176,14 +176,13 @@ def add_mim_extension(): 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', ], url='https://github.com/open-mmlab/mmpose', license='Apache License 2.0', + python_requires='>=3.7', install_requires=parse_requirements('requirements/runtime.txt'), extras_require={ 'all': parse_requirements('requirements.txt'), diff --git a/tests/data/coco/test_keypoint_partition_metric.json b/tests/data/coco/test_keypoint_partition_metric.json new file mode 100644 index 0000000000..9d04f5e87b --- /dev/null +++ b/tests/data/coco/test_keypoint_partition_metric.json @@ -0,0 +1,7647 @@ +{ + "info": { + "description": "COCO-WholeBody sample", + "url": "https://github.com/jin-s13/COCO-WholeBody", + "version": "1.0", + "year": "2020", + "date_created": "2020/09/18" + }, + "licenses": [ + { + "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/", + "id": 1, + "name": "Attribution-NonCommercial-ShareAlike License" + }, + { + "url": "http://creativecommons.org/licenses/by-nc/2.0/", + "id": 2, + "name": "Attribution-NonCommercial License" + }, + 
{ + "url": "http://creativecommons.org/licenses/by-nc-nd/2.0/", + "id": 3, + "name": "Attribution-NonCommercial-NoDerivs License" + }, + { + "url": "http://creativecommons.org/licenses/by/2.0/", + "id": 4, + "name": "Attribution License" + }, + { + "url": "http://creativecommons.org/licenses/by-sa/2.0/", + "id": 5, + "name": "Attribution-ShareAlike License" + }, + { + "url": "http://creativecommons.org/licenses/by-nd/2.0/", + "id": 6, + "name": "Attribution-NoDerivs License" + }, + { + "url": "http://flickr.com/commons/usage/", + "id": 7, + "name": "No known copyright restrictions" + }, + { + "url": "http://www.usa.gov/copyright.shtml", + "id": 8, + "name": "United States Government Work" + } + ], + "categories": [ + { + "supercategory": "person", + "id": 1, + "name": "person", + "keypoints": [ + "nose", + "left_eye", + "right_eye", + "left_ear", + "right_ear", + "left_shoulder", + "right_shoulder", + "left_elbow", + "right_elbow", + "left_wrist", + "right_wrist", + "left_hip", + "right_hip", + "left_knee", + "right_knee", + "left_ankle", + "right_ankle" + ], + "skeleton": [ + [ + 16, + 14 + ], + [ + 14, + 12 + ], + [ + 17, + 15 + ], + [ + 15, + 13 + ], + [ + 12, + 13 + ], + [ + 6, + 12 + ], + [ + 7, + 13 + ], + [ + 6, + 7 + ], + [ + 6, + 8 + ], + [ + 7, + 9 + ], + [ + 8, + 10 + ], + [ + 9, + 11 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ], + [ + 1, + 3 + ], + [ + 2, + 4 + ], + [ + 3, + 5 + ], + [ + 4, + 6 + ], + [ + 5, + 7 + ] + ] + } + ], + "images": [ + { + "license": 4, + "file_name": "000000000785.jpg", + "coco_url": "http://images.cocodataset.org/val2017/000000000785.jpg", + "height": 425, + "width": 640, + "date_captured": "2013-11-19 21:22:42", + "flickr_url": "http://farm8.staticflickr.com/7015/6795644157_f019453ae7_z.jpg", + "id": 785 + }, + { + "license": 3, + "file_name": "000000040083.jpg", + "coco_url": "http://images.cocodataset.org/val2017/000000040083.jpg", + "height": 333, + "width": 500, + "date_captured": "2013-11-18 03:30:24", + "flickr_url": 
"http://farm1.staticflickr.com/116/254881838_e21c6d17b8_z.jpg", + "id": 40083 + }, + { + "license": 1, + "file_name": "000000196141.jpg", + "coco_url": "http://images.cocodataset.org/val2017/000000196141.jpg", + "height": 429, + "width": 640, + "date_captured": "2013-11-22 22:37:15", + "flickr_url": "http://farm4.staticflickr.com/3310/3611902235_57d4ae496d_z.jpg", + "id": 196141 + }, + { + "license": 3, + "file_name": "000000197388.jpg", + "coco_url": "http://images.cocodataset.org/val2017/000000197388.jpg", + "height": 392, + "width": 640, + "date_captured": "2013-11-19 20:10:37", + "flickr_url": "http://farm9.staticflickr.com/8375/8507321836_5b8b13188f_z.jpg", + "id": 197388 + } + ], + "annotations": [ + { + "segmentation": [ + [ + 353.37, + 67.65, + 358.15, + 52.37, + 362.92, + 47.59, + 374.38, + 44.73, + 389.66, + 52.37, + 389.66, + 67.65, + 389.66, + 76.25, + 393.48, + 83.89, + 396.35, + 88.66, + 397.3, + 91.53, + 406.85, + 99.17, + 413.54, + 104.9, + 451.74, + 148.83, + 458.43, + 153.6, + 462.25, + 166.02, + 467.02, + 173.66, + 463.2, + 181.3, + 449.83, + 183.21, + 448.88, + 191.81, + 455.56, + 226.19, + 448.88, + 254.84, + 453.65, + 286.36, + 475.62, + 323.6, + 491.85, + 361.81, + 494.72, + 382.82, + 494.72, + 382.82, + 499.49, + 391.41, + 416.4, + 391.41, + 424.04, + 383.77, + 439.33, + 374.22, + 445.06, + 360.85, + 436.46, + 334.11, + 421.18, + 303.55, + 416.4, + 289.22, + 409.72, + 268.21, + 396.35, + 280.63, + 405.9, + 298.77, + 417.36, + 324.56, + 425, + 349.39, + 425, + 357.99, + 419.27, + 360.85, + 394.44, + 367.54, + 362.92, + 370.4, + 346.69, + 367.54, + 360.06, + 362.76, + 369.61, + 360.85, + 382.98, + 340.8, + 355.28, + 271.08, + 360.06, + 266.3, + 386.8, + 219.5, + 368.65, + 162.2, + 348.6, + 175.57, + 309.44, + 187.03, + 301.8, + 192.76, + 288.43, + 193.72, + 282.7, + 193.72, + 280.79, + 187.03, + 280.79, + 174.62, + 287.47, + 171.75, + 291.29, + 171.75, + 295.11, + 171.75, + 306.57, + 166.98, + 312.3, + 165.07, + 345.73, + 142.14, + 350.51, + 
117.31, + 350.51, + 102.03, + 350.51, + 90.57, + 353.37, + 65.74 + ] + ], + "num_keypoints": 112, + "area": 27789.11055, + "iscrowd": 0, + "keypoints": [ + 367, + 81, + 2, + 374, + 73, + 2, + 360, + 75, + 2, + 386, + 78, + 2, + 356, + 81, + 2, + 399, + 108, + 2, + 358, + 129, + 2, + 433, + 142, + 2, + 341, + 159, + 2, + 449, + 165, + 2, + 309, + 178, + 2, + 424, + 203, + 2, + 393, + 214, + 2, + 429, + 294, + 2, + 367, + 273, + 2, + 466, + 362, + 2, + 396, + 341, + 2, + 439, + 378, + 2, + 446, + 380, + 2, + 479, + 370, + 2, + 377, + 359, + 2, + 376, + 358, + 2, + 413, + 353, + 2, + 355.823, + 75.36, + 1.0, + 356.354, + 79.0837, + 1.0, + 357.244, + 82.7374, + 1.0, + 358.518, + 86.2722, + 1.0, + 360.146, + 89.6578, + 1.0, + 362.266, + 92.7538, + 1.0, + 365.004, + 95.3223, + 1.0, + 368.487, + 96.6454, + 1.0, + 372.191, + 96.1419, + 1.0, + 375.644, + 94.6832, + 1.0, + 378.601, + 92.3665, + 1.0, + 381.101, + 89.5662, + 1.0, + 382.903, + 86.2741, + 1.0, + 383.896, + 82.6509, + 1.0, + 384.075, + 78.9011, + 1.0, + 384.1, + 75.1408, + 1.0, + 383.903, + 71.3861, + 1.0, + 357.084, + 72.9743, + 1.0, + 358.602, + 71.7848, + 1.0, + 360.42, + 71.3443, + 1.0, + 362.377, + 71.1566, + 1.0, + 364.36, + 71.1889, + 1.0, + 368.971, + 70.4992, + 1.0, + 370.945, + 69.8179, + 1.0, + 373.001, + 69.3543, + 1.0, + 375.14, + 69.2666, + 1.0, + 377.358, + 69.8865, + 1.0, + 366.57, + 73.9588, + 1.0, + 366.734, + 76.1499, + 1.0, + 366.88, + 78.3018, + 1.0, + 366.99, + 80.4957, + 1.0, + 365.104, + 82.5589, + 1.0, + 366.308, + 82.8331, + 1.0, + 367.645, + 82.8037, + 1.0, + 369.172, + 82.2061, + 1.0, + 370.693, + 81.6521, + 1.0, + 358.705, + 75.4542, + 1.0, + 360.294, + 74.0903, + 1.0, + 362.376, + 73.8423, + 1.0, + 364.302, + 74.6834, + 1.0, + 362.543, + 75.568, + 1.0, + 360.612, + 75.8883, + 1.0, + 369.771, + 73.7734, + 1.0, + 371.409, + 72.2638, + 1.0, + 373.615, + 71.9502, + 1.0, + 375.722, + 72.7144, + 1.0, + 373.888, + 73.699, + 1.0, + 371.835, + 74.0238, + 1.0, + 363.184, + 86.9317, + 1.0, + 
364.788, + 85.4484, + 1.0, + 367.021, + 84.7474, + 1.0, + 368.048, + 84.5364, + 1.0, + 369.083, + 84.3709, + 1.0, + 372.183, + 84.0529, + 1.0, + 375.083, + 84.8901, + 1.0, + 373.687, + 87.0735, + 1.0, + 371.644, + 88.8121, + 1.0, + 369.024, + 89.6982, + 1.0, + 366.67, + 89.6039, + 1.0, + 364.721, + 88.606, + 1.0, + 363.588, + 86.903, + 1.0, + 365.723, + 85.8496, + 1.0, + 368.184, + 85.2863, + 1.0, + 371.444, + 84.8294, + 1.0, + 374.647, + 85.0454, + 1.0, + 372.166, + 87.2914, + 1.0, + 368.81, + 88.3791, + 1.0, + 365.965, + 88.3238, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 304.10366, + 181.75134, + 1, + 300.70183, + 182.77567, + 1, + 297.3, + 183.8, + 1, + 294.7, + 186.5, + 1, + 290.1, + 187.8, + 1, + 290.9, + 176.6, + 1, + 287.5, + 176.0, + 1, + 285.5, + 178.4, + 1, + 286.4, + 182.4, + 1, + 288.8, + 179.4, + 1, + 285.0, + 181.0, + 1, + 287.3, + 186.1, + 1, + 291.8, + 189.5, + 1, + 287.7, + 182.7, + 1, + 283.8, + 184.1, + 1, + 286.5, + 189.1, + 1, + 290.0, + 192.0, + 1, + 286.7, + 185.3, + 1, + 282.8, + 187.4, + 1, + 284.8, + 191.6, + 1, + 288.4, + 194.5, + 1 + ], + "image_id": 785, + "bbox": [ + 280.79, + 44.73, + 218.7, + 346.68 + ], + "category_id": 1, + "id": 442619, + "face_box": [ + 358.2, + 69.86, + 26.360000000000014, + 25.849999999999994 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 280.43, + 173.12, + 27.860000000000014, + 24.849999999999994 + ], + "face_valid": true, + "lefthand_valid": false, + "righthand_valid": true, + "foot_valid": true + }, + { + "segmentation": [ + [ + 98.56, + 273.72, + 132.9, + 267, + 140.37, + 
281.93, + 165.75, + 285.66, + 156.79, + 264.01, + 170.23, + 261.02, + 177.7, + 272.97, + 182.18, + 279.69, + 200.85, + 268.49, + 212.79, + 255.05, + 188.9, + 256.54, + 164.26, + 240.12, + 139.62, + 212.49, + 109.01, + 221.45, + 103.04, + 220.71, + 122.45, + 202.04, + 113.49, + 196.07, + 96.32, + 168.44, + 97.06, + 162.47, + 110.5, + 136.34, + 112, + 124.39, + 91.09, + 110.95, + 80.64, + 114.68, + 71.68, + 131.86, + 62.72, + 147.54, + 57.49, + 156.5, + 48.53, + 168.44, + 41.07, + 180.39, + 38.08, + 193.08, + 40.32, + 205.03, + 47.04, + 213.24, + 54.5, + 216.23, + 82.13, + 252.06, + 91.09, + 271.48 + ] + ], + "num_keypoints": 106, + "area": 11025.219, + "iscrowd": 0, + "keypoints": [ + 99, + 144, + 2, + 104, + 141, + 2, + 96, + 137, + 2, + 0, + 0, + 0, + 78, + 133, + 2, + 56, + 161, + 2, + 81, + 162, + 2, + 0, + 0, + 0, + 103, + 208, + 2, + 116, + 204, + 2, + 0, + 0, + 0, + 57, + 246, + 1, + 82, + 259, + 1, + 137, + 219, + 2, + 138, + 247, + 2, + 177, + 256, + 2, + 158, + 296, + 1, + 208.16049, + 257.42419, + 2.0, + 205.8824, + 259.13276, + 2.0, + 183.38626, + 275.93367, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 82.9654, + 131.144, + 1.0, + 81.8046, + 134.328, + 1.0, + 80.7007, + 137.531, + 1.0, + 79.8836, + 140.818, + 1.0, + 79.734, + 144.196, + 1.0, + 80.4763, + 147.486, + 1.0, + 82.0188, + 150.498, + 1.0, + 84.2352, + 153.057, + 1.0, + 86.8081, + 155.258, + 1.0, + 89.652, + 157.095, + 1.0, + 92.9128, + 157.812, + 1.0, + 95.962, + 156.474, + 1.0, + 98.5377, + 154.281, + 1.0, + 100.557, + 151.568, + 1.0, + 102.508, + 148.799, + 1.0, + 103.987, + 145.756, + 1.0, + 105.345, + 142.655, + 1.0, + 93.6074, + 132.13, + 1.0, + 95.8108, + 132.112, + 1.0, + 97.7956, + 132.618, + 1.0, + 99.6897, + 133.398, + 1.0, + 101.364, + 134.432, + 1.0, + 105.0, + 136.896, + 1.0, + 105.708, + 137.334, + 1.0, + 106.267, + 137.852, + 1.0, + 106.759, + 138.404, + 1.0, + 107.013, + 139.401, + 1.0, + 100.904, + 139.994, + 1.0, + 100.551, + 142.0, + 1.0, + 100.202, 
+ 143.956, + 1.0, + 99.8116, + 145.919, + 1.0, + 94.7941, + 146.187, + 1.0, + 95.9823, + 147.027, + 1.0, + 97.3054, + 147.849, + 1.0, + 98.2362, + 148.403, + 1.0, + 99.2812, + 148.491, + 1.0, + 93.151, + 135.98, + 1.0, + 94.9184, + 136.187, + 1.0, + 96.5441, + 136.903, + 1.0, + 97.6034, + 138.308, + 1.0, + 95.8998, + 138.017, + 1.0, + 94.3941, + 137.178, + 1.0, + 102.085, + 141.003, + 1.0, + 103.379, + 141.05, + 1.0, + 104.485, + 141.71, + 1.0, + 104.899, + 142.915, + 1.0, + 103.704, + 142.739, + 1.0, + 102.729, + 142.026, + 1.0, + 89.8433, + 148.685, + 1.0, + 92.6494, + 149.006, + 1.0, + 95.2801, + 149.78, + 1.0, + 96.1096, + 150.259, + 1.0, + 96.7411, + 150.719, + 1.0, + 97.3853, + 151.82, + 1.0, + 97.337, + 153.217, + 1.0, + 96.5124, + 153.108, + 1.0, + 95.6091, + 152.796, + 1.0, + 94.7518, + 152.399, + 1.0, + 93.0313, + 151.317, + 1.0, + 91.3461, + 150.149, + 1.0, + 90.24, + 148.802, + 1.0, + 92.9121, + 149.883, + 1.0, + 95.4213, + 151.204, + 1.0, + 96.3082, + 152.03, + 1.0, + 97.1377, + 152.997, + 1.0, + 96.3098, + 152.035, + 1.0, + 95.406, + 151.234, + 1.0, + 92.8725, + 149.984, + 1.0, + 109.88978, + 204.46047, + 1, + 113.101195, + 201.939065, + 1, + 116.31261, + 199.41766, + 1, + 113.19977, + 199.3139, + 1, + 109.8794, + 200.24775, + 1, + 117.86903, + 199.10638, + 2, + 113.9261, + 199.00262, + 2, + 109.56812, + 198.48381, + 2, + 106.6628, + 198.38004999999998, + 1, + 117.1427, + 202.32298, + 2, + 111.2283, + 201.80417, + 2, + 107.07784000000001, + 201.38913, + 2, + 103.65371999999999, + 201.18161, + 1, + 116.52013, + 205.95463, + 2, + 112.5772, + 205.53958, + 2, + 107.59665, + 204.39821, + 2, + 104.27629, + 203.77564, + 2, + 116.41637, + 209.69004, + 2, + 112.16215, + 209.48252, + 2, + 108.73803000000001, + 208.34114, + 2, + 105.72895, + 206.68096, + 2, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 40083, + "bbox": [ + 38.08, + 110.95, + 174.71, + 174.71 + ], + "category_id": 1, + "id": 198196, + "face_box": [ + 79.19, + 131.64, + 29.290000000000006, + 28.480000000000018 + ], + "lefthand_box": [ + 104.83, + 196.48, + 16.400000000000006, + 15.810000000000002 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": true, + "lefthand_valid": true, + "righthand_valid": false, + "foot_valid": true + }, + { + "segmentation": [ + [ + 257.76, + 288.05, + 273.4, + 258.26, + 325.55, + 253.79, + 335.23, + 232.93, + 326.3, + 186.74, + 333.74, + 177.05, + 327.79, + 153.21, + 333.74, + 142.04, + 344.17, + 139.06, + 353.11, + 139.06, + 359.07, + 145.02, + 360.56, + 148.74, + 362.05, + 168.86, + 388.87, + 197.17, + 397.81, + 276.88, + 372.48, + 293.27 + ] + ], + "num_keypoints": 83, + "area": 10171.9544, + "iscrowd": 0, + "keypoints": [ + 343, + 164, + 2, + 348, + 160, + 2, + 340, + 160, + 2, + 359, + 163, + 2, + 332, + 164, + 2, + 370, + 189, + 2, + 334, + 190, + 2, + 358, + 236, + 2, + 348, + 234, + 2, + 339, + 270, + 2, + 330, + 262, + 2, + 378, + 262, + 2, + 343, + 254, + 2, + 338, + 280, + 2, + 283, + 272, + 2, + 0, + 0, + 0, + 0, + 0, + 0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 333.383, + 160.62, + 1.0, + 333.607, + 163.811, + 1.0, + 334.137, + 166.965, + 1.0, + 334.934, + 170.062, + 1.0, + 336.036, + 173.062, + 1.0, + 337.69, + 175.794, + 1.0, + 340.01, + 177.986, + 1.0, + 342.889, + 179.347, + 1.0, + 346.063, + 179.445, + 1.0, + 349.16, + 178.674, + 1.0, + 351.892, + 177.033, + 1.0, + 354.132, + 174.761, + 1.0, + 355.652, + 171.957, + 1.0, + 356.482, + 168.871, + 1.0, + 356.751, + 165.691, + 1.0, + 356.914, + 
162.496, + 1.0, + 356.913, + 159.299, + 1.0, + 335.435, + 157.491, + 1.0, + 336.759, + 156.383, + 1.0, + 338.264, + 155.821, + 1.0, + 339.903, + 155.445, + 1.0, + 341.565, + 155.312, + 1.0, + 345.805, + 155.039, + 1.0, + 347.424, + 154.896, + 1.0, + 349.044, + 154.957, + 1.0, + 350.677, + 155.266, + 1.0, + 352.333, + 156.08, + 1.0, + 343.65, + 159.186, + 1.0, + 343.687, + 161.041, + 1.0, + 343.68, + 162.886, + 1.0, + 343.657, + 164.752, + 1.0, + 341.61, + 167.049, + 1.0, + 342.69, + 167.145, + 1.0, + 343.906, + 167.123, + 1.0, + 345.179, + 166.907, + 1.0, + 346.456, + 166.707, + 1.0, + 336.707, + 159.932, + 1.0, + 338.078, + 158.999, + 1.0, + 339.726, + 158.864, + 1.0, + 341.204, + 159.605, + 1.0, + 339.755, + 160.185, + 1.0, + 338.21, + 160.321, + 1.0, + 346.612, + 159.27, + 1.0, + 348.028, + 158.307, + 1.0, + 349.739, + 158.245, + 1.0, + 351.302, + 158.965, + 1.0, + 349.802, + 159.575, + 1.0, + 348.188, + 159.642, + 1.0, + 340.049, + 171.873, + 1.0, + 341.307, + 170.304, + 1.0, + 343.097, + 169.499, + 1.0, + 343.987, + 169.41, + 1.0, + 344.876, + 169.314, + 1.0, + 346.909, + 169.61, + 1.0, + 348.603, + 170.874, + 1.0, + 347.548, + 172.219, + 1.0, + 346.133, + 173.242, + 1.0, + 344.378, + 173.742, + 1.0, + 342.683, + 173.666, + 1.0, + 341.218, + 173.038, + 1.0, + 340.398, + 171.815, + 1.0, + 342.1, + 170.752, + 1.0, + 344.043, + 170.287, + 1.0, + 346.21, + 170.271, + 1.0, + 348.214, + 170.913, + 1.0, + 346.462, + 171.947, + 1.0, + 344.283, + 172.468, + 1.0, + 342.246, + 172.507, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 40083, + "bbox": [ + 257.76, + 139.06, + 140.05, + 154.21 + ], + "category_id": 1, + "id": 230195, + "face_box": [ + 333.96, + 154.32, + 23.28000000000003, + 26.79000000000002 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": true, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": false + }, + { + "segmentation": [ + [ + 285.37, + 126.5, + 281.97, + 127.72, + 280.76, + 132.33, + 280.76, + 136.46, + 275.17, + 143.26, + 275.9, + 158.08, + 277.6, + 164.4, + 278.33, + 173.87, + 278.33, + 183.83, + 279.79, + 191.11, + 281.97, + 194.76, + 284.89, + 192.09, + 284.89, + 186.99, + 284.89, + 181.16, + 284.64, + 177.51, + 285.86, + 173.87 + ] + ], + "num_keypoints": 0, + "area": 491.2669, + "iscrowd": 0, + "keypointsimage_id": 40083, + "bbox": [ + 275.17, + 126.5, + 10.69, + 68.26 + ], + "category_id": 1, + "id": 1202706, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": false + }, + { + "segmentation": [ + [ + 339.34, + 107.97, + 338.38, + 102.19, + 339.34, + 91.58, + 335.49, + 84.84, + 326.81, + 74.23, + 312.35, + 74.23, + 301.75, + 74.23, + 295, + 86.76, + 295, + 93.51, + 292.11, + 99.3, + 287.29, + 102.19, + 291.14, + 107.01, + 295, + 107.01, + 295.96, + 112.79, + 301.75, + 115.69, + 305.6, + 119.54, + 307.53, + 123.4, + 317.17, + 123.4, + 311.39, + 129.18, + 286.32, + 139.79, 
+ 274.75, + 139.79, + 264.15, + 138.82, + 262.22, + 144.61, + 261.26, + 147.5, + 253.54, + 147.5, + 247.76, + 150.39, + 249.69, + 159.07, + 256.44, + 161, + 262.22, + 161, + 268, + 161, + 276.68, + 161.96, + 284.39, + 168.71, + 293.07, + 174.49, + 301.75, + 174.49, + 308.49, + 169.67, + 308.49, + 188.95, + 311.39, + 194.74, + 312.35, + 208.23, + 307.53, + 221.73, + 297.89, + 229.44, + 281.5, + 250.65, + 269.93, + 262.22, + 278.61, + 320.06, + 281.5, + 331.63, + 276.68, + 338.38, + 270.9, + 349.95, + 262.22, + 356.7, + 253.54, + 359.59, + 253.54, + 365.37, + 274.75, + 365.37, + 291.14, + 365.37, + 306.57, + 359.59, + 303.67, + 352.84, + 297.89, + 340.31, + 293.07, + 318.13, + 295, + 294.03, + 293.07, + 278.61, + 294.03, + 270.9, + 305.6, + 259.33, + 313.31, + 299.82, + 319.1, + 309.46, + 341.27, + 317.17, + 384.65, + 330.67, + 387.55, + 335.49, + 383.69, + 341.27, + 397.19, + 350.91, + 398.15, + 363.44, + 398.15, + 375.01, + 405.86, + 374.05, + 409.72, + 357.66, + 411.65, + 342.24, + 416.47, + 328.74, + 417.43, + 321.03, + 410.68, + 319.1, + 401.04, + 318.13, + 392.37, + 318.13, + 382.73, + 314.28, + 348.98, + 300.78, + 339.34, + 293.07, + 334.52, + 285.36, + 340.31, + 259.33, + 340.31, + 246.8, + 340.31, + 242.94, + 350.91, + 228.48, + 358.62, + 214.98, + 355.22, + 204.32, + 357.05, + 196.11, + 361.61, + 188.82, + 361.61, + 181.97, + 365.26, + 165.63, + 367.54, + 139.18, + 366.17, + 123.68, + 361.15, + 112.73, + 353.86, + 107.72, + 351.58, + 105.89, + 344.74, + 105.89, + 340.18, + 109.08 + ] + ], + "num_keypoints": 63, + "area": 17123.92955, + "iscrowd": 0, + "keypoints": [ + 297, + 111, + 2, + 299, + 106, + 2, + 0, + 0, + 0, + 314, + 108, + 2, + 0, + 0, + 0, + 329, + 141, + 2, + 346, + 125, + 2, + 295, + 164, + 2, + 323, + 130, + 2, + 266, + 155, + 2, + 279, + 143, + 2, + 329, + 225, + 2, + 331, + 221, + 2, + 327, + 298, + 2, + 283, + 269, + 2, + 398, + 327, + 2, + 288, + 349, + 2, + 401.79499, + 364.28207, + 2.0, + 407.21854, + 361.57029, + 2.0, + 407.21854, + 
325.86523, + 2.0, + 257.16687, + 361.57029, + 2.0, + 258.52276, + 361.11833, + 2.0, + 297.84353, + 355.69477, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 265.1, + 155.9, + 1, + 260.05, + 152.25, + 1, + 255.0, + 148.6, + 1, + 250.6, + 148.6, + 1, + 249.1, + 151.0, + 1, + 253.4, + 158.9, + 1, + 251.9, + 155.1, + 1, + 252.0, + 151.9, + 1, + 252.9, + 150.0, + 1, + 257.4, + 157.9, + 1, + 256.7, + 154.2, + 1, + 256.3, + 151.6, + 1, + 256.9, + 149.3, + 1, + 260.2, + 156.5, + 1, + 260.1, + 153.0, + 1, + 259.9, + 150.7, + 1, + 260.2, + 148.7, + 1, + 262.8, + 154.8, + 1, + 262.7, + 152.5, + 1, + 262.7, + 
150.9, + 1, + 262.6, + 148.8, + 1, + 280.8, + 146.5, + 1, + 275.4, + 149.15, + 1, + 270.0, + 151.8, + 1, + 266.2, + 152.2, + 1, + 263.5, + 151.9, + 1, + 266.6, + 142.5, + 1, + 263.6, + 147.0, + 1, + 264.9, + 151.0, + 1, + 268.5, + 152.9, + 1, + 270.6, + 142.0, + 1, + 267.9, + 146.0, + 1, + 269.4, + 149.6, + 1, + 272.5, + 151.5, + 1, + 273.8, + 142.1, + 1, + 272.2, + 146.0, + 1, + 274.2, + 149.1, + 1, + 276.5, + 149.6, + 1, + 277.4, + 142.3, + 1, + 276.6, + 145.2, + 1, + 277.6, + 148.3, + 1, + 279.4, + 148.6, + 1 + ], + "image_id": 196141, + "bbox": [ + 247.76, + 74.23, + 169.67, + 300.78 + ], + "category_id": 1, + "id": 460541, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 249.12, + 146.31, + 19.920000000000016, + 15.819999999999993 + ], + "righthand_box": [ + 262.82, + 139.96, + 18.930000000000007, + 14.679999999999978 + ], + "face_valid": false, + "lefthand_valid": true, + "righthand_valid": true, + "foot_valid": true + }, + { + "segmentation": [ + [ + 578.76, + 112.4, + 589.39, + 100.81, + 589.39, + 99.84, + 596.16, + 116.27, + 603.89, + 122.07, + 603.89, + 138.49, + 598.09, + 159.75, + 597.12, + 181, + 594.22, + 191.63, + 589.39, + 212.89, + 583.59, + 208.06, + 583.59, + 206.13, + 582.63, + 200.33, + 582.63, + 193.57, + 582.63, + 182.94, + 575.86, + 181, + 567.17, + 197.43, + 571.03, + 203.23, + 567.17, + 207.09, + 555.57, + 208.06, + 562.34, + 200.33, + 565.24, + 190.67, + 565.24, + 173.27, + 566.2, + 163.61, + 568.14, + 156.85, + 570.07, + 148.15, + 566.2, + 143.32, + 565.24, + 133.66, + 575.86, + 118.2 + ] + ], + "num_keypoints": 36, + "area": 2789.0208, + "iscrowd": 0, + "keypoints": [ + 589, + 113, + 2, + 0, + 0, + 0, + 0, + 0, + 0, + 595, + 112, + 1, + 584, + 110, + 2, + 598, + 123, + 2, + 579, + 119, + 2, + 594, + 141, + 2, + 570, + 137, + 2, + 576, + 135, + 2, + 585, + 139, + 2, + 590, + 157, + 2, + 574, + 156, + 2, + 589, + 192, + 2, + 565, + 189, + 1, + 587, + 222, + 1, + 557, + 219, + 1, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 578.8, + 135.7, + 2, + 577.55, + 134.35, + 2, + 576.3, + 133.0, + 1, + 574.6, + 134.1, + 1, + 574.0, + 135.5, + 1, + 574.3, + 132.9, + 2, + 572.0, + 132.4, + 2, + 570.3, + 131.8, + 2, + 568.9, + 130.7, + 2, + 573.3, + 134.4, + 2, + 570.9, + 134.0, + 2, + 569.5, + 133.9, + 2, + 568.2, + 133.8, + 2, + 572.8, + 135.7, + 2, + 572.6, + 138.3, + 2, + 574.1, + 139.4, + 2, + 576.2, + 139.4, + 1, + 574.4, + 138.0, + 2, + 575.4, + 139.5, + 2, + 576.3, + 140.2, + 2, + 577.6, + 140.8, + 
2, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 196141, + "bbox": [ + 555.57, + 99.84, + 48.32, + 113.05 + ], + "category_id": 1, + "id": 488308, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 568.2, + 130.89, + 10.75, + 11.130000000000024 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": true, + "righthand_valid": false, + "foot_valid": false + }, + { + "segmentation": [ + [ + 446.96, + 73.13, + 445.81, + 77.71, + 443.33, + 78.29, + 441.61, + 81.72, + 441.23, + 84.58, + 440.85, + 90.5, + 442.19, + 94.32, + 443.52, + 97.18, + 443.52, + 102.33, + 442.57, + 105.58, + 446.58, + 105.19, + 447.15, + 99.85, + 447.53, + 94.89, + 446, + 93.55, + 446.38, + 92.03, + 453.64, + 92.41, + 454.02, + 94.51, + 457.64, + 94.51, + 455.74, + 88.4, + 455.35, + 82.29, + 453.64, + 78.48, + 451.92, + 77.71, + 452.87, + 74.47, + 450.58, + 73.13 + ] + ], + "num_keypoints": 0, + "area": 285.7906, + "iscrowd": 0, + "keypointsimage_id": 196141, + "bbox": [ + 440.85, + 73.13, + 16.79, + 32.45 + ], + "category_id": 1, + "id": 508900, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": false + }, + { + "segmentation": [ + [ + 497.15, + 413.95, + 531.55, + 417.68, + 548.74, + 411.7, + 551.74, + 403.48, + 546.5, + 394.5, + 543.51, + 386.28, + 571.93, + 390.76, + 574.92, + 391.51, + 579.4, + 409.46, + 605.58, + 409.46, + 615.3, + 
408.71, + 607.07, + 389.27, + 598.1, + 381.79, + 607.82, + 366.83, + 607.82, + 352.63, + 610.06, + 338.42, + 619.04, + 345.15, + 631, + 344.4, + 630.25, + 336.92, + 626.51, + 318.98, + 616.05, + 286.07, + 598.85, + 263.64, + 585.39, + 257.66, + 593.61, + 244.2, + 601.09, + 235.97, + 596.6, + 219.52, + 587.63, + 211.29, + 577.91, + 208.3, + 563.7, + 206.81, + 556.22, + 214.29, + 548, + 217.28, + 539.77, + 229.99, + 539.77, + 241.95, + 539.02, + 247.19, + 523.32, + 247.19, + 503.88, + 254.67, + 485.93, + 254.67, + 479.95, + 248.68, + 473.22, + 241.21, + 485.93, + 227, + 477.7, + 215.78, + 457.51, + 215.78, + 453.77, + 235.22, + 463.5, + 246.44, + 465.74, + 261.4, + 490.42, + 274.11, + 501.63, + 275.6, + 504.62, + 286.07, + 519.58, + 286.07, + 522.57, + 292.06, + 512.85, + 310, + 515.09, + 330.94, + 530.05, + 343.65, + 505.37, + 341.41, + 479.95, + 339.91, + 465.74, + 346.64, + 463.5, + 358.61, + 473.97, + 381.04, + 485.18, + 390.02, + 501.63, + 398.99, + 504.62, + 404.22, + 491.16, + 412.45, + 495.65, + 417.68 + ] + ], + "num_keypoints": 15, + "area": 21608.94075, + "iscrowd": 0, + "keypoints": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 552, + 234, + 2, + 0, + 0, + 0, + 531, + 262, + 2, + 600, + 283, + 2, + 480, + 260, + 2, + 622, + 336, + 2, + 466, + 242, + 2, + 0, + 0, + 0, + 546, + 365, + 2, + 592, + 371, + 2, + 470, + 351, + 2, + 551, + 330, + 2, + 519, + 394, + 2, + 589, + 391, + 2, + 0.0, + 0.0, + 0.0, + 498.08009, + 412.23863, + 2.0, + 541.66626, + 400.39384, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 602.22109, + 403.58794, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 196141, + "bbox": [ + 453.77, + 206.81, + 177.23, + 210.87 + ], + "category_id": 1, + "id": 
1717641, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": true + }, + { + "segmentation": [ + [ + 58.93, + 163.67, + 47.18, + 161.59, + 36.12, + 93.86, + 41.65, + 82.8, + 40.27, + 69.66, + 50.64, + 67.59, + 55.48, + 73.81, + 63.08, + 92.47, + 66.53, + 99.38, + 65.15, + 109.06, + 61, + 127.03, + 59.62, + 162.97 + ] + ], + "num_keypoints": 20, + "area": 1870.14015, + "iscrowd": 0, + "keypoints": [ + 48, + 79, + 2, + 50, + 77, + 2, + 46, + 77, + 2, + 54, + 78, + 2, + 45, + 78, + 2, + 57, + 90, + 2, + 42, + 90, + 2, + 63, + 103, + 2, + 42, + 105, + 2, + 56, + 113, + 2, + 49, + 112, + 2, + 55, + 117, + 2, + 44, + 117, + 2, + 55, + 140, + 2, + 47, + 140, + 2, + 56, + 160, + 2, + 49, + 159, + 2, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 44.4, + 162.6, + 2.0, + 43.4, + 161.5, + 2.0, + 51.7, + 160.7, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 196141, + "bbox": [ + 36.12, + 67.59, + 30.41, + 96.08 + ], + "category_id": 1, + "id": 1724673, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": true + }, + { + "segmentation": [ + [ + 139.41, + 321.58, + 144.78, + 326.56, + 196.92, + 314.68, + 196.16, + 309.31, + 207.28, + 292.05, + 213.03, + 284, + 228.75, + 270.2, + 233.35, + 261.38, + 244.47, + 252.56, + 254.44, + 237.61, + 267.86, + 215.37, + 272.08, + 212.68, + 285.5, + 232.62, + 
294.7, + 250.64, + 295.08, + 264.06, + 290.87, + 277.87, + 290.87, + 286.3, + 289.71, + 298.19, + 281.66, + 318.89, + 282.05, + 334.23, + 295.08, + 340.37, + 315.02, + 343.82, + 314.25, + 336.53, + 310.42, + 330.4, + 301.98, + 322.34, + 304.29, + 310.84, + 304.67, + 302.79, + 306.2, + 292.05, + 311.19, + 275.56, + 313.87, + 251.79, + 311.19, + 234.54, + 312.72, + 224.57, + 310.42, + 212.3, + 307.74, + 201.56, + 306.2, + 193.51, + 306.59, + 183.16, + 310.04, + 177.41, + 314.64, + 173.19, + 316.94, + 171.65, + 328.06, + 163.99, + 337.64, + 157.85, + 343.4, + 159.77, + 346.46, + 166.67, + 346.85, + 170.5, + 346.46, + 179.71, + 346.85, + 188.53, + 346.85, + 191.98, + 344.55, + 198.11, + 342.25, + 203.48, + 338.41, + 208.46, + 335.34, + 212.68, + 335.34, + 217.67, + 343.01, + 222.65, + 354.9, + 210.76, + 359.12, + 196.19, + 361.8, + 173.19, + 361.42, + 161.69, + 356.43, + 150.18, + 344.93, + 135.61, + 343.01, + 132.93, + 345.31, + 126.41, + 345.7, + 124.88, + 343.4, + 115.29, + 340.33, + 104.17, + 337.26, + 102.25, + 330.36, + 103.4, + 326.14, + 106.09, + 320.01, + 111.07, + 314.64, + 119.89, + 310.42, + 121.04, + 292.02, + 121.81, + 279.75, + 127.94, + 244.09, + 138.68, + 240.25, + 142.51, + 238.72, + 154.4, + 239.1, + 163.6, + 239.87, + 173.96, + 241.79, + 181.24, + 248.3, + 192.36, + 240.25, + 206.55, + 236.42, + 219.2, + 229.9, + 236.45, + 225.3, + 247.57, + 218.4, + 254.48, + 208.81, + 265.6, + 202.29, + 278.25, + 195.39, + 285.92, + 188.49, + 292.05, + 183.5, + 295.89, + 176.6, + 302.41, + 172, + 308.54, + 167.78, + 313.14, + 146.31, + 318.89 + ] + ], + "num_keypoints": 132, + "area": 14250.29385, + "iscrowd": 0, + "keypoints": [ + 334, + 135, + 2, + 340, + 129, + 2, + 331, + 129, + 2, + 0, + 0, + 0, + 319, + 123, + 2, + 340, + 146, + 2, + 292, + 133, + 2, + 353, + 164, + 2, + 246, + 144, + 2, + 354, + 197, + 2, + 250, + 185, + 2, + 293, + 197, + 2, + 265, + 187, + 2, + 305, + 252, + 2, + 231, + 254, + 2, + 293, + 321, + 2, + 193, + 297, + 2, + 300.24175, + 
336.83838, + 2.0, + 306.59015, + 335.34464, + 2.0, + 290.07408, + 326.47826, + 2.0, + 182.60972, + 314.05885, + 2.0, + 175.88789, + 305.84328, + 2.0, + 189.70499, + 302.48236, + 2.0, + 319.681, + 126.613, + 1.0, + 319.155, + 129.261, + 1.0, + 318.92, + 131.954, + 1.0, + 319.187, + 134.631, + 1.0, + 319.707, + 137.271, + 1.0, + 320.991, + 139.649, + 1.0, + 322.846, + 141.606, + 1.0, + 325.009, + 143.216, + 1.0, + 327.359, + 144.544, + 1.0, + 329.907, + 145.384, + 1.0, + 332.347, + 144.347, + 1.0, + 334.268, + 142.449, + 1.0, + 335.767, + 140.222, + 1.0, + 336.675, + 137.69, + 1.0, + 337.019, + 135.009, + 1.0, + 336.982, + 132.311, + 1.0, + 337.13, + 129.618, + 1.0, + 328.503, + 125.823, + 1.0, + 329.531, + 125.489, + 1.0, + 330.619, + 125.626, + 1.0, + 331.573, + 125.909, + 1.0, + 332.529, + 126.431, + 1.0, + 334.479, + 127.459, + 1.0, + 334.815, + 127.43, + 1.0, + 335.157, + 127.316, + 1.0, + 335.52, + 127.327, + 1.0, + 335.949, + 127.701, + 1.0, + 332.762, + 129.334, + 1.0, + 333.168, + 130.389, + 1.0, + 333.603, + 131.342, + 1.0, + 333.928, + 132.331, + 1.0, + 331.671, + 134.291, + 1.0, + 332.232, + 134.389, + 1.0, + 332.931, + 134.487, + 1.0, + 333.332, + 134.463, + 1.0, + 333.645, + 134.212, + 1.0, + 329.271, + 128.208, + 1.0, + 329.963, + 128.464, + 1.0, + 330.676, + 128.659, + 1.0, + 331.392, + 128.839, + 1.0, + 330.672, + 128.659, + 1.0, + 330.003, + 128.334, + 1.0, + 333.792, + 129.611, + 1.0, + 334.158, + 129.741, + 1.0, + 334.546, + 129.765, + 1.0, + 334.878, + 129.954, + 1.0, + 334.523, + 129.822, + 1.0, + 334.161, + 129.704, + 1.0, + 327.38, + 138.818, + 1.0, + 329.757, + 138.136, + 1.0, + 332.086, + 137.874, + 1.0, + 332.75, + 138.208, + 1.0, + 333.221, + 138.515, + 1.0, + 334.495, + 139.634, + 1.0, + 335.213, + 141.054, + 1.0, + 334.12, + 140.754, + 1.0, + 333.208, + 140.234, + 1.0, + 332.2, + 139.888, + 1.0, + 330.765, + 139.414, + 1.0, + 329.069, + 139.351, + 1.0, + 327.561, + 138.814, + 1.0, + 329.88, + 138.346, + 1.0, + 332.517, + 138.668, + 1.0, 
+ 334.031, + 139.589, + 1.0, + 335.123, + 140.862, + 1.0, + 333.726, + 140.572, + 1.0, + 332.203, + 140.032, + 1.0, + 329.731, + 139.403, + 1.0, + 353.87482, + 196.49984999999998, + 1, + 349.01957500000003, + 201.76511, + 1, + 344.16433, + 207.03037, + 1, + 340.81534, + 210.64729, + 1, + 337.46165, + 216.59183000000002, + 1, + 346.65868, + 216.02586, + 1, + 342.27241, + 219.28019999999998, + 1, + 337.88613, + 219.70467, + 1, + 334.4903, + 218.57273, + 1, + 345.5, + 215.0, + 1, + 342.27241, + 217.72377, + 1, + 338.73509, + 218.00675999999999, + 1, + 334.77329, + 216.30885, + 1, + 343.7, + 213.8, + 1, + 341.42345, + 215.74288, + 1, + 338.73509, + 215.60138, + 1, + 335.62225, + 213.76198, + 1, + 342.4139, + 212.63003, + 1, + 340.85748, + 213.76198, + 1, + 338.87658, + 214.04496, + 1, + 337.17867, + 213.76198, + 1, + 249.4, + 180.4, + 1, + 254.3, + 184.9, + 1, + 259.2, + 189.4, + 1, + 259.3, + 192.1, + 1, + 258.2, + 194.9, + 1, + 254.9, + 193.2, + 1, + 255.9, + 192.3, + 1, + 255.9, + 190.5, + 1, + 255.4, + 188.5, + 1, + 252.2, + 194.0, + 1, + 253.2, + 193.6, + 1, + 253.2, + 191.1, + 1, + 252.9, + 188.8, + 1, + 249.4, + 193.6, + 1, + 250.4, + 193.6, + 1, + 250.4, + 191.3, + 1, + 249.9, + 188.7, + 1, + 247.1, + 192.2, + 1, + 248.0, + 192.2, + 1, + 247.9, + 190.3, + 1, + 247.5, + 188.3, + 1 + ], + "image_id": 197388, + "bbox": [ + 139.41, + 102.25, + 222.39, + 241.57 + ], + "category_id": 1, + "id": 437295, + "face_box": [ + 320.23, + 123.84, + 21.049999999999955, + 23.5 + ], + "lefthand_box": [ + 333.65, + 198.45, + 23.150000000000034, + 23.57000000000002 + ], + "righthand_box": [ + 247.5, + 184.92, + 23.30000000000001, + 22.360000000000014 + ], + "face_valid": true, + "lefthand_valid": true, + "righthand_valid": true, + "foot_valid": true + }, + { + "segmentation": [ + [ + 287.17, + 121.42, + 294.22, + 106.44, + 302.15, + 116.13, + 303.03, + 121.42 + ], + [ + 297.74, + 99.39, + 310.08, + 76.49, + 326.81, + 76.49, + 329.46, + 67.68, + 337.38, + 61.52, + 346.19, + 62.4, + 
353.24, + 65.92, + 353.24, + 76.49, + 355.88, + 84.42, + 359.41, + 87.94, + 362.05, + 96.75, + 354.12, + 139.04, + 349.72, + 142.56, + 345.31, + 139.92, + 349.72, + 117.89, + 348.84, + 108.2, + 345.31, + 113.49, + 336.5, + 101.16, + 325.93, + 110.85, + 311.84, + 123.18 + ], + [ + 324.17, + 176.91, + 332.1, + 191.89, + 328.58, + 198.94, + 327.69, + 205.98, + 333.86, + 213.03, + 337.38, + 227.13, + 332.98, + 227.13, + 319.77, + 219.2, + 313.6, + 211.27 + ], + [ + 332.98, + 165.46, + 341.79, + 161.06, + 336.5, + 174.27, + 333.86, + 186.6, + 326.81, + 176.03 + ] + ], + "num_keypoints": 19, + "area": 3404.869, + "iscrowd": 0, + "keypoints": [ + 345, + 92, + 2, + 350, + 87, + 2, + 341, + 87, + 2, + 0, + 0, + 0, + 330, + 83, + 2, + 357, + 94, + 2, + 316, + 92, + 2, + 357, + 104, + 2, + 291, + 123, + 1, + 351, + 133, + 2, + 281, + 136, + 1, + 326, + 131, + 1, + 305, + 128, + 1, + 336, + 152, + 1, + 303, + 171, + 1, + 318, + 206, + 2, + 294, + 211, + 1, + 322.595, + 216.245, + 2.0, + 327.23077, + 215.42692, + 2.0, + 316.81553, + 207.67155, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, 
+ 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 197388, + "bbox": [ + 287.17, + 61.52, + 74.88, + 165.61 + ], + "category_id": 1, + "id": 467657, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": true + }, + { + "segmentation": [ + [ + 547.95, + 201.57, + 546.73, + 190.62, + 547.95, + 181.49, + 547.95, + 169.31, + 547.95, + 156.53, + 546.73, + 144.36, + 544.3, + 139.49, + 540.04, + 132.19, + 540.04, + 
121.84, + 542.47, + 107.24, + 544.3, + 99.33, + 548.56, + 88.98, + 561.95, + 78.03, + 572.29, + 71.33, + 572.29, + 71.33, + 572.29, + 65.25, + 574.12, + 51.86, + 583.86, + 48.81, + 592.99, + 48.81, + 597.86, + 57.33, + 599.07, + 64.64, + 608.2, + 76.81, + 614.9, + 82.89, + 620.98, + 89.59, + 628.89, + 93.24, + 636.81, + 101.76, + 640, + 109.67, + 640, + 115.76, + 640, + 127.93, + 620.37, + 111.5, + 619.16, + 111.5, + 618.55, + 112.11, + 608.2, + 105.41, + 600.9, + 119.41, + 592.99, + 131.58, + 596.03, + 148.01, + 605.16, + 162.01, + 612.46, + 190.01, + 614.9, + 204.61, + 606.98, + 216.78, + 603.94, + 226.52, + 606.38, + 239.91, + 605.16, + 256.95, + 604.55, + 264.26, + 602.12, + 271.56, + 586.29, + 272.17, + 584.47, + 255.13, + 588.73, + 237.48, + 592.99, + 221.65, + 596.64, + 207.05, + 596.64, + 197.31, + 594.2, + 186.96, + 584.47, + 172.36, + 577.77, + 166.27, + 570.47, + 170.53, + 558.91, + 179.66, + 555.86, + 192.44, + 548.56, + 198.53, + 547.95, + 198.53 + ] + ], + "num_keypoints": 39, + "area": 8913.98475, + "iscrowd": 0, + "keypoints": [ + 591, + 78, + 2, + 594, + 74, + 2, + 586, + 74, + 2, + 0, + 0, + 0, + 573, + 70, + 2, + 598, + 86, + 2, + 566, + 93, + 2, + 626, + 105, + 2, + 546, + 126, + 2, + 0, + 0, + 0, + 561, + 150, + 2, + 582, + 150, + 2, + 557, + 154, + 2, + 606, + 194, + 2, + 558, + 209, + 1, + 591, + 252, + 2, + 539, + 262, + 1, + 599.72032, + 264.75714, + 2.0, + 603.91172, + 265.80499, + 2.0, + 585.74897, + 265.10642, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 565.0, + 153.0, + 0.08773341029882431, + 568.0, + 156.0, + 0.04602484405040741, + 571.0, + 159.0, + 0.04602484405040741, + 573.0, + 161.0, + 0.06972061097621918, + 575.0, + 164.0, + 0.06297813355922699, + 569.0, + 158.0, + 0.294232040643692, + 570.0, + 162.0, + 0.26472434401512146, + 570.0, + 166.0, + 0.2826344072818756, + 571.0, + 171.0, + 0.374575674533844, + 565.0, + 159.0, + 0.2154899388551712, + 566.0, + 162.0, + 0.21613340079784393, + 566.0, + 164.0, + 0.2544613480567932, + 567.0, + 168.0, + 0.31771761178970337, + 562.0, + 160.0, + 0.23286579549312592, + 563.0, + 166.0, + 
0.1579097956418991, + 564.0, + 166.0, + 0.17961391806602478, + 564.0, + 166.0, + 0.17504136264324188, + 559.0, + 160.0, + 0.3428754508495331, + 559.0, + 162.0, + 0.2897874116897583, + 561.0, + 165.0, + 0.24125981330871582, + 562.0, + 166.0, + 0.20118576288223267 + ], + "image_id": 197388, + "bbox": [ + 540.04, + 48.81, + 99.96, + 223.36 + ], + "category_id": 1, + "id": 531914, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 557.05, + 149.73, + 19.879999999999995, + 21.76000000000002 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": true, + "foot_valid": true + }, + { + "segmentation": [ + [ + 561.51, + 385.38, + 572.11, + 352.71, + 570.34, + 317.4, + 559.75, + 282.08, + 552.68, + 267.07, + 565.93, + 236.17, + 583.59, + 236.17, + 602.13, + 260.01, + 614.49, + 286.5, + 628.61, + 302.39, + 639.21, + 281.2, + 614.49, + 251.18, + 588, + 218.51, + 595.95, + 202.62, + 594.18, + 185.85, + 580.05, + 170.84, + 562.4, + 179.67, + 557.98, + 198.21, + 554.45, + 202.62, + 532.38, + 199.97, + 525.32, + 202.62, + 511.19, + 229.11, + 493.53, + 256.48, + 484.7, + 276.78, + 451.15, + 323.58, + 423.78, + 338.59, + 388.47, + 373.9, + 372.58, + 387.14, + 396.41, + 388.03, + 418.49, + 367.72, + 450.27, + 345.65, + 501.48, + 306.8, + 520.02, + 301.5, + 552.68, + 340.35, + 543.86, + 369.49 + ] + ], + "num_keypoints": 60, + "area": 14267.20475, + "iscrowd": 0, + "keypoints": [ + 580, + 211, + 2, + 586, + 206, + 2, + 574, + 204, + 2, + 0, + 0, + 0, + 562, + 198, + 2, + 584, + 220, + 2, + 529, + 215, + 2, + 599, + 242, + 2, + 512, + 260, + 2, + 619, + 274, + 2, + 538, + 285, + 2, + 537, + 288, + 2, + 506, + 277, + 2, + 562, + 332, + 2, + 452, + 332, + 2, + 550, + 387, + 1, + 402, + 371, + 2, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 375.00826, + 386.35839, + 2.0, + 399.52454, + 375.91627, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, 
+ 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 620.284, + 274.54006, + 1, + 621.65135, + 282.30908999999997, + 1, + 623.0187, + 290.07812, + 1, + 625.38048, + 294.55308, + 1, + 628.86101, + 298.90373999999997, + 1, + 630.22836, + 289.20799, + 1, + 634.57901, + 292.43991, + 1, + 633.08736, + 295.54752, + 1, + 628.6124, + 295.42321, + 1, + 632.46584, + 286.5976, + 1, + 631.3, + 291.9, + 1, + 627.7, + 291.6, + 1, + 625.6, + 288.9, + 1, + 633.7, + 284.2, + 1, + 632.3, + 288.0, + 1, + 629.1, + 288.0, + 1, + 627.0, + 285.9, + 1, + 633.2, + 280.4, + 1, + 632.8, + 283.6, + 1, + 630.8, + 284.4, + 1, + 629.1, + 283.2, + 1, + 544.0, + 291.0, + 0.09089653939008713, + 551.0, + 
291.0, + 0.041192591190338135, + 558.0, + 291.0, + 0.041192591190338135, + 559.0, + 294.0, + 0.056781601160764694, + 563.0, + 298.0, + 0.2960541546344757, + 559.0, + 296.0, + 0.18105527758598328, + 562.0, + 301.0, + 0.12244582921266556, + 559.0, + 308.0, + 0.05529222637414932, + 564.0, + 306.0, + 0.05997529253363609, + 555.0, + 299.0, + 0.18805834650993347, + 556.0, + 302.0, + 0.1534559577703476, + 555.0, + 306.0, + 0.20564205944538116, + 556.0, + 309.0, + 0.06228385493159294, + 550.0, + 300.0, + 0.1409723311662674, + 550.0, + 301.0, + 0.2223101258277893, + 551.0, + 305.0, + 0.2001882642507553, + 553.0, + 308.0, + 0.1712668538093567, + 545.0, + 302.0, + 0.1908813714981079, + 546.0, + 304.0, + 0.13619276881217957, + 547.0, + 306.0, + 0.19773860275745392, + 549.0, + 308.0, + 0.1341865360736847 + ], + "image_id": 197388, + "bbox": [ + 372.58, + 170.84, + 266.63, + 217.19 + ], + "category_id": 1, + "id": 533949, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 615.22, + 271.56, + 22.139999999999986, + 28.839999999999975 + ], + "righthand_box": [ + 538.83, + 283.74, + 25.639999999999986, + 30.659999999999968 + ], + "face_valid": false, + "lefthand_valid": true, + "righthand_valid": true, + "foot_valid": true + }, + { + "segmentation": [ + [ + 2.03, + 75.18, + 10.85, + 70.58, + 16.99, + 65.59, + 17.75, + 55.24, + 20.05, + 50.25, + 29.64, + 43.74, + 37.31, + 47.57, + 41.52, + 53.7, + 43.83, + 64.82, + 53.03, + 70.19, + 61.85, + 77.09, + 72.58, + 87.06, + 74.88, + 79.01, + 78.72, + 73.64, + 86.39, + 77.86, + 90.6, + 90.13, + 86, + 93.2, + 82.17, + 102.4, + 75.27, + 106.24, + 68.75, + 104.7, + 50.34, + 90.9, + 43.06, + 112.37, + 40.76, + 123.11, + 42.29, + 130.78, + 48.04, + 161.83, + 52.26, + 190.59, + 50.73, + 210.15, + 44.21, + 245.04, + 50.34, + 256.16, + 53.03, + 261.53, + 47.28, + 263.83, + 40.37, + 263.83, + 31.56, + 260.76, + 28.1, + 256.16, + 26.95, + 244.65, + 29.25, + 233.54, + 32.71, + 223.95, + 33.09, + 213.98, + 32.32, + 206.31, + 32.71, + 
194.81, + 33.09, + 185.61, + 24.65, + 177.17, + 16.99, + 161.45, + 13.53, + 176.02, + 10.85, + 206.31, + 1.65, + 231.62, + 1.65, + 235.84, + 0.5, + 146.88, + 0.88, + 122.34, + 1.65, + 75.56 + ] + ], + "num_keypoints": 16, + "area": 8260.75085, + "iscrowd": 0, + "keypoints": [ + 36, + 79, + 2, + 40, + 74, + 2, + 31, + 75, + 2, + 0, + 0, + 0, + 19, + 69, + 2, + 45, + 77, + 2, + 2, + 89, + 2, + 74, + 99, + 2, + 0, + 0, + 0, + 78, + 92, + 2, + 0, + 0, + 0, + 33, + 149, + 2, + 7, + 153, + 2, + 44, + 196, + 2, + 2, + 205, + 2, + 35, + 245, + 2, + 0, + 0, + 0, + 43.80826, + 259.40011, + 2.0, + 48.63752, + 257.67537, + 2.0, + 32.08007, + 256.29558, + 2.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, 
+ 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "image_id": 197388, + "bbox": [ + 0.5, + 43.74, + 90.1, + 220.09 + ], + "category_id": 1, + "id": 543117, + "face_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "lefthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "righthand_box": [ + 0.0, + 0.0, + 0.0, + 0.0 + ], + "face_valid": false, + "lefthand_valid": false, + "righthand_valid": false, + "foot_valid": true + } + ] +} \ No newline at end of file diff --git a/tests/test_apis/test_inference.py b/tests/test_apis/test_inference.py index fa08970f73..c38c619a9e 100644 --- a/tests/test_apis/test_inference.py +++ b/tests/test_apis/test_inference.py @@ -6,13 +6,13 @@ import numpy as np import torch -from mmcv.image import imwrite +from mmcv.image import imread, imwrite from mmengine.utils import is_list_of from parameterized import parameterized -from mmpose.apis import inference_topdown, init_model +from mmpose.apis import inference_bottomup, inference_topdown, init_model from mmpose.structures import 
PoseDataSample -from mmpose.testing._utils import _rand_bboxes +from mmpose.testing._utils import _rand_bboxes, get_config_file, get_repo_dir from mmpose.utils import register_all_modules @@ -25,9 +25,7 @@ def setUp(self) -> None: 'td-hm_hrnet-w32_8xb64-210e_coco-256x192.py'), ('cpu', 'cuda'))]) def test_init_model(self, config, devices): - project_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) - project_dir = osp.join(project_dir, '..') - config_file = osp.join(project_dir, config) + config_file = get_config_file(config) for device in devices: if device == 'cuda' and not torch.cuda.is_available(): @@ -89,3 +87,31 @@ def test_inference_topdown(self, config, devices): self.assertEqual(len(results), 1) self.assertTrue(results[0].pred_instances.keypoints.shape, (1, 17, 2)) + + @parameterized.expand([(('configs/body_2d_keypoint/' + 'associative_embedding/coco/' + 'ae_hrnet-w32_8xb24-300e_coco-512x512.py'), + ('cpu', 'cuda'))]) + def test_inference_bottomup(self, config, devices): + config_file = get_config_file(config) + img = osp.join(get_repo_dir(), 'tests/data/coco/000000000785.jpg') + + for device in devices: + if device == 'cuda' and not torch.cuda.is_available(): + # Skip the test if cuda is required but unavailable + continue + model = init_model(config_file, device=device) + + # test inference from image + results = inference_bottomup(model, img=imread(img)) + self.assertTrue(is_list_of(results, PoseDataSample)) + self.assertEqual(len(results), 1) + self.assertTrue(results[0].pred_instances.keypoints.shape, + (1, 17, 2)) + + # test inference from file + results = inference_bottomup(model, img=img) + self.assertTrue(is_list_of(results, PoseDataSample)) + self.assertEqual(len(results), 1) + self.assertTrue(results[0].pred_instances.keypoints.shape, + (1, 17, 2)) diff --git a/tests/test_apis/test_inferencers/test_mmpose_inferencer.py b/tests/test_apis/test_inferencers/test_mmpose_inferencer.py new file mode 100644 index 0000000000..3df85fc46e --- /dev/null 
+++ b/tests/test_apis/test_inferencers/test_mmpose_inferencer.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +from collections import defaultdict +from tempfile import TemporaryDirectory +from unittest import TestCase + +import mmcv + +from mmpose.apis.inferencers import MMPoseInferencer +from mmpose.structures import PoseDataSample + + +class TestMMPoseInferencer(TestCase): + + def test_call(self): + + # top-down model + inferencer = MMPoseInferencer('human') + + img_path = 'tests/data/coco/000000197388.jpg' + img = mmcv.imread(img_path) + + # `inputs` is path to an image + inputs = img_path + results1 = next(inferencer(inputs, return_vis=True)) + self.assertIn('visualization', results1) + self.assertSequenceEqual(results1['visualization'][0].shape, img.shape) + self.assertIn('predictions', results1) + self.assertIn('keypoints', results1['predictions'][0][0]) + self.assertEqual(len(results1['predictions'][0][0]['keypoints']), 17) + + # `inputs` is an image array + inputs = img + results2 = next(inferencer(inputs)) + self.assertEqual( + len(results1['predictions'][0]), len(results2['predictions'][0])) + self.assertSequenceEqual(results1['predictions'][0][0]['keypoints'], + results2['predictions'][0][0]['keypoints']) + results2 = next(inferencer(inputs, return_datasample=True)) + self.assertIsInstance(results2['predictions'][0], PoseDataSample) + + # `inputs` is path to a directory + inputs = osp.dirname(img_path) + with TemporaryDirectory() as tmp_dir: + # only save visualizations + for res in inferencer(inputs, vis_out_dir=tmp_dir): + pass + self.assertEqual(len(os.listdir(tmp_dir)), 4) + # save both visualizations and predictions + results3 = defaultdict(list) + for res in inferencer(inputs, out_dir=tmp_dir): + for key in res: + results3[key].extend(res[key]) + self.assertEqual(len(os.listdir(f'{tmp_dir}/visualizations')), 4) + self.assertEqual(len(os.listdir(f'{tmp_dir}/predictions')), 4) + 
self.assertEqual(len(results3['predictions']), 4) + self.assertSequenceEqual(results1['predictions'][0][0]['keypoints'], + results3['predictions'][3][0]['keypoints']) + + # `inputs` is path to a video + inputs = 'tests/data/posetrack18/videos/000001_mpiinew_test/' \ + '000001_mpiinew_test.mp4' + with TemporaryDirectory() as tmp_dir: + results = defaultdict(list) + for res in inferencer(inputs, out_dir=tmp_dir): + for key in res: + results[key].extend(res[key]) + self.assertIn('000001_mpiinew_test.mp4', + os.listdir(f'{tmp_dir}/visualizations')) + self.assertIn('000001_mpiinew_test.json', + os.listdir(f'{tmp_dir}/predictions')) + self.assertTrue(inferencer._video_input) + self.assertIn(len(results['predictions']), (4, 5)) diff --git a/tests/test_apis/test_inferencers/test_pose2d_inferencer.py b/tests/test_apis/test_inferencers/test_pose2d_inferencer.py new file mode 100644 index 0000000000..f402d05d19 --- /dev/null +++ b/tests/test_apis/test_inferencers/test_pose2d_inferencer.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +from collections import defaultdict +from tempfile import TemporaryDirectory +from unittest import TestCase + +import mmcv +import torch +from mmengine.infer.infer import BaseInferencer + +from mmpose.apis.inferencers import Pose2DInferencer +from mmpose.structures import PoseDataSample + + +class TestPose2DInferencer(TestCase): + + def _test_init(self): + + # 1. init with config path and checkpoint + inferencer = Pose2DInferencer( + model='configs/body_2d_keypoint/simcc/coco/' + 'simcc_res50_8xb64-210e_coco-256x192.py', + weights='https://download.openmmlab.com/mmpose/' + 'v1/body_2d_keypoint/simcc/coco/' + 'simcc_res50_8xb64-210e_coco-256x192-8e0f5b59_20220919.pth', + ) + self.assertIsInstance(inferencer.model, torch.nn.Module) + self.assertIsInstance(inferencer.detector, BaseInferencer) + self.assertSequenceEqual(inferencer.det_cat_ids, (0, )) + + # 2. 
init with config name + inferencer = Pose2DInferencer( + model='td-hm_res50_8xb32-210e_onehand10k-256x256') + self.assertIsInstance(inferencer.model, torch.nn.Module) + self.assertIsInstance(inferencer.detector, BaseInferencer) + self.assertSequenceEqual(inferencer.det_cat_ids, (0, )) + + # 3. init with alias + with self.assertWarnsRegex( + Warning, 'dataset_meta are not saved in ' + 'the checkpoint\'s meta data, load via config.'): + inferencer = Pose2DInferencer(model='animal') + self.assertIsInstance(inferencer.model, torch.nn.Module) + self.assertIsInstance(inferencer.detector, BaseInferencer) + self.assertSequenceEqual(inferencer.det_cat_ids, + (15, 16, 17, 18, 19, 20, 21, 22, 23)) + + # 4. init with bottom-up model + inferencer = Pose2DInferencer( + model='configs/body_2d_keypoint/dekr/coco/' + 'dekr_hrnet-w32_8xb10-140e_coco-512x512.py', + weights='https://download.openmmlab.com/mmpose/v1/' + 'body_2d_keypoint/dekr/coco/' + 'dekr_hrnet-w32_8xb10-140e_coco-512x512_ac7c17bf-20221228.pth', + ) + self.assertIsInstance(inferencer.model, torch.nn.Module) + self.assertFalse(hasattr(inferencer, 'detector')) + + def test_call(self): + + # top-down model + inferencer = Pose2DInferencer('human') + + img_path = 'tests/data/coco/000000197388.jpg' + img = mmcv.imread(img_path) + + # `inputs` is path to an image + inputs = img_path + results1 = next(inferencer(inputs, return_vis=True)) + self.assertIn('visualization', results1) + self.assertSequenceEqual(results1['visualization'][0].shape, img.shape) + self.assertIn('predictions', results1) + self.assertIn('keypoints', results1['predictions'][0][0]) + self.assertEqual(len(results1['predictions'][0][0]['keypoints']), 17) + + # `inputs` is an image array + inputs = img + results2 = next(inferencer(inputs)) + self.assertEqual( + len(results1['predictions'][0]), len(results2['predictions'][0])) + self.assertSequenceEqual(results1['predictions'][0][0]['keypoints'], + results2['predictions'][0][0]['keypoints']) + results2 = 
next(inferencer(inputs, return_datasample=True)) + self.assertIsInstance(results2['predictions'][0], PoseDataSample) + + # `inputs` is path to a directory + inputs = osp.dirname(img_path) + + with TemporaryDirectory() as tmp_dir: + # only save visualizations + for res in inferencer(inputs, vis_out_dir=tmp_dir): + pass + self.assertEqual(len(os.listdir(tmp_dir)), 4) + # save both visualizations and predictions + results3 = defaultdict(list) + for res in inferencer(inputs, out_dir=tmp_dir): + for key in res: + results3[key].extend(res[key]) + self.assertEqual(len(os.listdir(f'{tmp_dir}/visualizations')), 4) + self.assertEqual(len(os.listdir(f'{tmp_dir}/predictions')), 4) + self.assertEqual(len(results3['predictions']), 4) + self.assertSequenceEqual(results1['predictions'][0][0]['keypoints'], + results3['predictions'][3][0]['keypoints']) + + # `inputs` is path to a video + inputs = 'tests/data/posetrack18/videos/000001_mpiinew_test/' \ + '000001_mpiinew_test.mp4' + with TemporaryDirectory() as tmp_dir: + results = defaultdict(list) + for res in inferencer(inputs, out_dir=tmp_dir): + for key in res: + results[key].extend(res[key]) + self.assertIn('000001_mpiinew_test.mp4', + os.listdir(f'{tmp_dir}/visualizations')) + self.assertIn('000001_mpiinew_test.json', + os.listdir(f'{tmp_dir}/predictions')) + self.assertTrue(inferencer._video_input) + self.assertIn(len(results['predictions']), (4, 5)) diff --git a/tests/test_apis/test_webcam/test_utils/test_misc.py b/tests/test_apis/test_webcam/test_utils/test_misc.py index 371aaf3f76..d60fdaa002 100644 --- a/tests/test_apis/test_webcam/test_utils/test_misc.py +++ b/tests/test_apis/test_webcam/test_utils/test_misc.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os +import tempfile import unittest import mmcv @@ -16,11 +17,13 @@ class TestMISC(unittest.TestCase): def test_get_cached_file_path(self): url = 'https://user-images.githubusercontent.com/15977946/' \ '170850839-acc59e26-c6b3-48c9-a9ec-87556edb99ed.jpg' - cached_file = get_cached_file_path(url, file_name='sunglasses.jpg') - self.assertTrue(os.path.exists(cached_file)) - # check if image is successfully cached - img = mmcv.imread(cached_file) - self.assertIsNotNone(img) + with tempfile.TemporaryDirectory() as tmpdir: + cached_file = get_cached_file_path( + url, save_dir=tmpdir, file_name='sunglasses.jpg') + self.assertTrue(os.path.exists(cached_file)) + # check if image is successfully cached + img = mmcv.imread(cached_file) + self.assertIsNotNone(img) def test_get_config_path(self): cfg_path = 'configs/_base_/datasets/coco.py' diff --git a/tests/test_codecs/test_associative_embedding.py b/tests/test_codecs/test_associative_embedding.py index 6f5266d6d6..983fc93fb1 100644 --- a/tests/test_codecs/test_associative_embedding.py +++ b/tests/test_codecs/test_associative_embedding.py @@ -39,8 +39,11 @@ def test_encode(self): use_udp=False, decode_keypoint_order=self.decode_keypoint_order) - heatmaps, keypoint_indices, keypoint_weights = codec.encode( - data['keypoints'], data['keypoints_visible']) + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] + keypoint_weights = encoded['keypoint_weights'] self.assertEqual(heatmaps.shape, (17, 64, 64)) self.assertEqual(keypoint_indices.shape, (1, 17, 2)) @@ -58,8 +61,11 @@ def test_encode(self): use_udp=True, decode_keypoint_order=self.decode_keypoint_order) - heatmaps, keypoint_indices, keypoint_weights = codec.encode( - data['keypoints'], data['keypoints_visible']) + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] + 
keypoint_weights = encoded['keypoint_weights'] self.assertEqual(heatmaps.shape, (17, 64, 64)) self.assertEqual(keypoint_indices.shape, (1, 17, 2)) @@ -70,22 +76,26 @@ def test_encode(self): index_encoded = keypoint_indices[0, k, 0] self.assertEqual(index_expected, index_encoded) - def _get_tags(self, heatmaps, keypoint_indices, tag_per_keypoint: bool): + def _get_tags(self, + heatmaps, + keypoint_indices, + tag_per_keypoint: bool, + tag_dim: int = 1): K, H, W = heatmaps.shape N = keypoint_indices.shape[0] if tag_per_keypoint: - tags = np.zeros((K, H, W), dtype=np.float32) + tags = np.zeros((K * tag_dim, H, W), dtype=np.float32) else: - tags = np.zeros((1, H, W), dtype=np.float32) + tags = np.zeros((tag_dim, H, W), dtype=np.float32) for n, k in product(range(N), range(K)): y, x = np.unravel_index(keypoint_indices[n, k, 0], (H, W)) if tag_per_keypoint: - tags[k, y, x] = n + tags[k::K, y, x] = n else: - tags[0, y, x] = n + tags[:, y, x] = n return tags @@ -117,16 +127,17 @@ def test_decode(self): data = get_coco_sample( img_shape=(256, 256), num_instances=2, non_occlusion=True) - # w/o UDP, tag_per_keypoint==True + # w/o UDP codec = AssociativeEmbedding( input_size=(256, 256), heatmap_size=(64, 64), use_udp=False, - decode_keypoint_order=self.decode_keypoint_order, - tag_per_keypoint=True) + decode_keypoint_order=self.decode_keypoint_order) - heatmaps, keypoint_indices, _ = codec.encode(data['keypoints'], - data['keypoints_visible']) + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] tags = self._get_tags( heatmaps, keypoint_indices, tag_per_keypoint=True) @@ -154,56 +165,20 @@ def test_decode(self): self.assertTrue(np.allclose(keypoints, data['keypoints'], atol=4.0)) - # w/o UDP, tag_per_keypoint==False + # w/o UDP, tag_dim=2 codec = AssociativeEmbedding( input_size=(256, 256), heatmap_size=(64, 64), use_udp=False, -
decode_keypoint_order=self.decode_keypoint_order, - tag_per_keypoint=False) - - heatmaps, keypoint_indices, _ = codec.encode(data['keypoints'], - data['keypoints_visible']) - - tags = self._get_tags( - heatmaps, keypoint_indices, tag_per_keypoint=False) - - # to Tensor - batch_heatmaps = torch.from_numpy(heatmaps[None]) - batch_tags = torch.from_numpy(tags[None]) - - batch_keypoints, batch_keypoint_scores = codec.batch_decode( - batch_heatmaps, batch_tags) - - self.assertIsInstance(batch_keypoints, list) - self.assertIsInstance(batch_keypoint_scores, list) - self.assertEqual(len(batch_keypoints), 1) - self.assertEqual(len(batch_keypoint_scores), 1) - - keypoints, scores = self._sort_preds(batch_keypoints[0], - batch_keypoint_scores[0], - data['keypoints']) - - self.assertIsInstance(keypoints, np.ndarray) - self.assertIsInstance(scores, np.ndarray) - self.assertEqual(keypoints.shape, (2, 17, 2)) - self.assertEqual(scores.shape, (2, 17)) + decode_keypoint_order=self.decode_keypoint_order) - self.assertTrue(np.allclose(keypoints, data['keypoints'], atol=4.0)) + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) - # w/ UDP, tag_per_keypoint==True - codec = AssociativeEmbedding( - input_size=(256, 256), - heatmap_size=(64, 64), - use_udp=True, - decode_keypoint_order=self.decode_keypoint_order, - tag_per_keypoint=True) - - heatmaps, keypoint_indices, _ = codec.encode(data['keypoints'], - data['keypoints_visible']) + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] tags = self._get_tags( - heatmaps, keypoint_indices, tag_per_keypoint=True) + heatmaps, keypoint_indices, tag_per_keypoint=True, tag_dim=2) # to Tensor batch_heatmaps = torch.from_numpy(heatmaps[None]) @@ -228,53 +203,17 @@ def test_decode(self): self.assertTrue(np.allclose(keypoints, data['keypoints'], atol=4.0)) - # w/ UDP, tag_per_keypoint==False + # w/ UDP codec = AssociativeEmbedding( input_size=(256, 256), heatmap_size=(64, 64), use_udp=True, - 
decode_keypoint_order=self.decode_keypoint_order, - tag_per_keypoint=False) - - heatmaps, keypoint_indices, _ = codec.encode(data['keypoints'], - data['keypoints_visible']) - - tags = self._get_tags( - heatmaps, keypoint_indices, tag_per_keypoint=False) - - # to Tensor - batch_heatmaps = torch.from_numpy(heatmaps[None]) - batch_tags = torch.from_numpy(tags[None]) - - batch_keypoints, batch_keypoint_scores = codec.batch_decode( - batch_heatmaps, batch_tags) - - self.assertIsInstance(batch_keypoints, list) - self.assertIsInstance(batch_keypoint_scores, list) - self.assertEqual(len(batch_keypoints), 1) - self.assertEqual(len(batch_keypoint_scores), 1) - - keypoints, scores = self._sort_preds(batch_keypoints[0], - batch_keypoint_scores[0], - data['keypoints']) - - self.assertIsInstance(keypoints, np.ndarray) - self.assertIsInstance(scores, np.ndarray) - self.assertEqual(keypoints.shape, (2, 17, 2)) - self.assertEqual(scores.shape, (2, 17)) + decode_keypoint_order=self.decode_keypoint_order) - self.assertTrue(np.allclose(keypoints, data['keypoints'], atol=4.0)) + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) - # Dynamic input sizes in decoder - codec = AssociativeEmbedding( - input_size=(256, 256), - heatmap_size=(64, 64), - use_udp=False, - decode_keypoint_order=self.decode_keypoint_order, - tag_per_keypoint=True) - - heatmaps, keypoint_indices, _ = codec.encode(data['keypoints'], - data['keypoints_visible']) + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] tags = self._get_tags( heatmaps, keypoint_indices, tag_per_keypoint=True) @@ -284,7 +223,7 @@ def test_decode(self): batch_tags = torch.from_numpy(tags[None]) batch_keypoints, batch_keypoint_scores = codec.batch_decode( - batch_heatmaps, batch_tags, input_sizes=[(256, 256)]) + batch_heatmaps, batch_tags) self.assertIsInstance(batch_keypoints, list) self.assertIsInstance(batch_keypoint_scores, list) diff --git a/tests/test_codecs/test_decoupled_heatmap.py 
b/tests/test_codecs/test_decoupled_heatmap.py new file mode 100644 index 0000000000..747491c185 --- /dev/null +++ b/tests/test_codecs/test_decoupled_heatmap.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np + +from mmpose.codecs import DecoupledHeatmap +from mmpose.registry import KEYPOINT_CODECS +from mmpose.testing import get_coco_sample + + +class TestDecoupledHeatmap(TestCase): + + def setUp(self) -> None: + pass + + def _make_multi_instance_data(self, data): + bbox = data['bbox'].reshape(-1, 2, 2) + keypoints = data['keypoints'] + keypoints_visible = data['keypoints_visible'] + + keypoints_visible[..., 0] = 0 + + offset = keypoints.max(axis=1, keepdims=True) + bbox_outside = bbox - offset + keypoints_outside = keypoints - offset + keypoints_outside_visible = np.zeros(keypoints_visible.shape) + + bbox_overlap = bbox.mean( + axis=1, keepdims=True) + 0.8 * ( + bbox - bbox.mean(axis=1, keepdims=True)) + keypoint_overlap = keypoints.mean( + axis=1, keepdims=True) + 0.8 * ( + keypoints - keypoints.mean(axis=1, keepdims=True)) + keypoint_overlap_visible = keypoints_visible + + data['bbox'] = np.concatenate((bbox, bbox_outside, bbox_overlap), + axis=0) + data['keypoints'] = np.concatenate( + (keypoints, keypoints_outside, keypoint_overlap), axis=0) + data['keypoints_visible'] = np.concatenate( + (keypoints_visible, keypoints_outside_visible, + keypoint_overlap_visible), + axis=0) + + return data + + def test_build(self): + cfg = dict( + type='DecoupledHeatmap', + input_size=(512, 512), + heatmap_size=(128, 128), + ) + codec = KEYPOINT_CODECS.build(cfg) + self.assertIsInstance(codec, DecoupledHeatmap) + + def test_encode(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + data['bbox'] = np.tile(data['bbox'], 2).reshape(-1, 4, 2) + data['bbox'][:, 1:3, 0] = data['bbox'][:, 0:2, 0] + data = self._make_multi_instance_data(data) + + codec = DecoupledHeatmap( + input_size=(512, 
512), + heatmap_size=(128, 128), + ) + + print(data['bbox'].shape) + encoded = codec.encode( + data['keypoints'], data['keypoints_visible'], bbox=data['bbox']) + + heatmaps = encoded['heatmaps'] + instance_heatmaps = encoded['instance_heatmaps'] + keypoint_weights = encoded['keypoint_weights'] + instance_coords = encoded['instance_coords'] + + self.assertEqual(heatmaps.shape, (18, 128, 128)) + self.assertEqual(keypoint_weights.shape, (2, 17)) + self.assertEqual(instance_heatmaps.shape, (34, 128, 128)) + self.assertEqual(instance_coords.shape, (2, 2)) + + # without bbox + encoded = codec.encode( + data['keypoints'], data['keypoints_visible'], bbox=None) + + heatmaps = encoded['heatmaps'] + instance_heatmaps = encoded['instance_heatmaps'] + keypoint_weights = encoded['keypoint_weights'] + instance_coords = encoded['instance_coords'] + + self.assertEqual(heatmaps.shape, (18, 128, 128)) + self.assertEqual(keypoint_weights.shape, (2, 17)) + self.assertEqual(instance_heatmaps.shape, (34, 128, 128)) + self.assertEqual(instance_coords.shape, (2, 2)) + + # root_type + with self.assertRaises(ValueError): + codec = DecoupledHeatmap( + input_size=(512, 512), + heatmap_size=(128, 128), + root_type='box_center', + ) + encoded = codec.encode( + data['keypoints'], + data['keypoints_visible'], + bbox=data['bbox']) + + codec = DecoupledHeatmap( + input_size=(512, 512), + heatmap_size=(128, 128), + root_type='bbox_center', + ) + + encoded = codec.encode( + data['keypoints'], data['keypoints_visible'], bbox=data['bbox']) + + heatmaps = encoded['heatmaps'] + instance_heatmaps = encoded['instance_heatmaps'] + keypoint_weights = encoded['keypoint_weights'] + instance_coords = encoded['instance_coords'] + + self.assertEqual(heatmaps.shape, (18, 128, 128)) + self.assertEqual(keypoint_weights.shape, (2, 17)) + self.assertEqual(instance_heatmaps.shape, (34, 128, 128)) + self.assertEqual(instance_coords.shape, (2, 2)) + + def test_decode(self): + data = get_coco_sample(img_shape=(512, 512), 
num_instances=2) + data['bbox'] = np.tile(data['bbox'], 2).reshape(-1, 4, 2) + data['bbox'][:, 1:3, 0] = data['bbox'][:, 0:2, 0] + + codec = DecoupledHeatmap( + input_size=(512, 512), + heatmap_size=(128, 128), + ) + + encoded = codec.encode( + data['keypoints'], data['keypoints_visible'], bbox=data['bbox']) + instance_heatmaps = encoded['instance_heatmaps'].reshape( + encoded['instance_coords'].shape[0], -1, + *encoded['instance_heatmaps'].shape[-2:]) + instance_scores = np.ones(encoded['instance_coords'].shape[0]) + decoded = codec.decode(instance_heatmaps, instance_scores[:, None]) + keypoints, keypoint_scores = decoded + + self.assertEqual(keypoints.shape, (2, 17, 2)) + self.assertEqual(keypoint_scores.shape, (2, 17)) + + def test_cicular_verification(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + data['bbox'] = np.tile(data['bbox'], 2).reshape(-1, 4, 2) + data['bbox'][:, 1:3, 0] = data['bbox'][:, 0:2, 0] + + codec = DecoupledHeatmap( + input_size=(512, 512), + heatmap_size=(128, 128), + ) + + encoded = codec.encode( + data['keypoints'], data['keypoints_visible'], bbox=data['bbox']) + instance_heatmaps = encoded['instance_heatmaps'].reshape( + encoded['instance_coords'].shape[0], -1, + *encoded['instance_heatmaps'].shape[-2:]) + instance_scores = np.ones(encoded['instance_coords'].shape[0]) + decoded = codec.decode(instance_heatmaps, instance_scores[:, None]) + keypoints, _ = decoded + keypoints += 1.5 + + self.assertTrue(np.allclose(keypoints, data['keypoints'], atol=5.)) diff --git a/tests/test_codecs/test_integral_regression_label.py b/tests/test_codecs/test_integral_regression_label.py index b94e596189..8f53a0b21f 100644 --- a/tests/test_codecs/test_integral_regression_label.py +++ b/tests/test_codecs/test_integral_regression_label.py @@ -42,12 +42,14 @@ def test_encode(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, reg_label, keypoint_weights = codec.encode( - keypoints, keypoints_visible) + 
encoded = codec.encode(keypoints, keypoints_visible) + heatmaps = encoded['heatmaps'] + keypoint_labels = encoded['keypoint_labels'] + keypoint_weights = encoded['keypoint_weights'] self.assertEqual(heatmaps.shape, (17, 64, 48), f'Failed case: "{name}"') - self.assertEqual(reg_label.shape, (1, 17, 2), + self.assertEqual(keypoint_labels.shape, (1, 17, 2), f'Failed case: "{name}"') self.assertEqual(keypoint_weights.shape, (1, 17), f'Failed case: "{name}"') @@ -71,9 +73,10 @@ def test_cicular_verification(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - _, reg_label, _ = codec.encode(keypoints, keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) + keypoint_labels = encoded['keypoint_labels'] - _keypoints, _ = codec.decode(reg_label) + _keypoints, _ = codec.decode(keypoint_labels) self.assertTrue( np.allclose(keypoints, _keypoints, atol=5.), diff --git a/tests/test_codecs/test_megvii_heatmap.py b/tests/test_codecs/test_megvii_heatmap.py index b6277f9d1a..31a5a965c9 100644 --- a/tests/test_codecs/test_megvii_heatmap.py +++ b/tests/test_codecs/test_megvii_heatmap.py @@ -40,12 +40,11 @@ def test_encode(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, keypoint_weights = codec.encode(keypoints, - keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) - self.assertEqual(heatmaps.shape, (17, 64, 48), + self.assertEqual(encoded['heatmaps'].shape, (17, 64, 48), f'Failed case: "{name}"') - self.assertEqual(keypoint_weights.shape, + self.assertEqual(encoded['keypoint_weights'].shape, (1, 17)), f'Failed case: "{name}"' def test_decode(self): @@ -67,8 +66,8 @@ def test_cicular_verification(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, _ = codec.encode(keypoints, keypoints_visible) - _keypoints, _ = codec.decode(heatmaps) + encoded = codec.encode(keypoints, keypoints_visible) + _keypoints, _ = codec.decode(encoded['heatmaps']) self.assertTrue( 
np.allclose(keypoints, _keypoints, atol=5.), diff --git a/tests/test_codecs/test_msra_heatmap.py b/tests/test_codecs/test_msra_heatmap.py index c6f4a876d9..5897d01461 100644 --- a/tests/test_codecs/test_msra_heatmap.py +++ b/tests/test_codecs/test_msra_heatmap.py @@ -49,12 +49,11 @@ def test_encode(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, keypoint_weights = codec.encode(keypoints, - keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) - self.assertEqual(heatmaps.shape, (17, 64, 48), + self.assertEqual(encoded['heatmaps'].shape, (17, 64, 48), f'Failed case: "{name}"') - self.assertEqual(keypoint_weights.shape, + self.assertEqual(encoded['keypoint_weights'].shape, (1, 17)), f'Failed case: "{name}"' def test_decode(self): @@ -76,8 +75,8 @@ def test_cicular_verification(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, _ = codec.encode(keypoints, keypoints_visible) - _keypoints, _ = codec.decode(heatmaps) + encoded = codec.encode(keypoints, keypoints_visible) + _keypoints, _ = codec.decode(encoded['heatmaps']) self.assertTrue( np.allclose(keypoints, _keypoints, atol=5.), diff --git a/tests/test_codecs/test_regression_label.py b/tests/test_codecs/test_regression_label.py index 482bcf1ae9..e83a3aab18 100644 --- a/tests/test_codecs/test_regression_label.py +++ b/tests/test_codecs/test_regression_label.py @@ -41,12 +41,11 @@ def test_encode(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - reg_label, keypoint_weights = codec.encode(keypoints, - keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) - self.assertEqual(reg_label.shape, (1, 17, 2), + self.assertEqual(encoded['keypoint_labels'].shape, (1, 17, 2), f'Failed case: "{name}"') - self.assertEqual(keypoint_weights.shape, (1, 17), + self.assertEqual(encoded['keypoint_weights'].shape, (1, 17), f'Failed case: "{name}"') def test_decode(self): @@ -73,9 +72,9 @@ def 
test_cicular_verification(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - reg_label, _ = codec.encode(keypoints, keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) - _keypoints, _ = codec.decode(reg_label) + _keypoints, _ = codec.decode(encoded['keypoint_labels']) self.assertTrue( np.allclose(keypoints, _keypoints, atol=5.), diff --git a/tests/test_codecs/test_simcc_label.py b/tests/test_codecs/test_simcc_label.py index 98f02cc102..b4c242ef4e 100644 --- a/tests/test_codecs/test_simcc_label.py +++ b/tests/test_codecs/test_simcc_label.py @@ -40,6 +40,25 @@ def setUp(self) -> None: sigma=5.0, simcc_split_ratio=3.0), ), + ( + 'simcc dark', + dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=6.0, + simcc_split_ratio=2.0, + use_dark=True), + ), + ( + 'simcc separated sigmas', + dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0), + ), ] # The bbox is usually padded so the keypoint will not be near the @@ -57,16 +76,15 @@ def test_encode(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - target_x, target_y, keypoint_weights = codec.encode( - keypoints, keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) - self.assertEqual(target_x.shape, + self.assertEqual(encoded['keypoint_x_labels'].shape, (1, 17, int(192 * codec.simcc_split_ratio)), f'Failed case: "{name}"') - self.assertEqual(target_y.shape, + self.assertEqual(encoded['keypoint_y_labels'].shape, (1, 17, int(256 * codec.simcc_split_ratio)), f'Failed case: "{name}"') - self.assertEqual(keypoint_weights.shape, (1, 17), + self.assertEqual(encoded['keypoint_weights'].shape, (1, 17), f'Failed case: "{name}"') def test_decode(self): @@ -75,9 +93,8 @@ def test_decode(self): simcc_x = np.random.rand(1, 17, int(192 * codec.simcc_split_ratio)) simcc_y = np.random.rand(1, 17, int(256 * codec.simcc_split_ratio)) - encoded = 
(simcc_x, simcc_y) - keypoints, scores = codec.decode(encoded) + keypoints, scores = codec.decode(simcc_x, simcc_y) self.assertEqual(keypoints.shape, (1, 17, 2), f'Failed case: "{name}"') @@ -90,11 +107,11 @@ def test_cicular_verification(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - target_x, target_y, _ = codec.encode(keypoints, keypoints_visible) - - encoded = (target_x, target_y) + encoded = codec.encode(keypoints, keypoints_visible) + keypoint_x_labels = encoded['keypoint_x_labels'] + keypoint_y_labels = encoded['keypoint_y_labels'] - _keypoints, _ = codec.decode(encoded) + _keypoints, _ = codec.decode(keypoint_x_labels, keypoint_y_labels) self.assertTrue( np.allclose(keypoints, _keypoints, atol=5.), diff --git a/tests/test_codecs/test_spr.py b/tests/test_codecs/test_spr.py new file mode 100644 index 0000000000..58eeeb7d17 --- /dev/null +++ b/tests/test_codecs/test_spr.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np + +from mmpose.codecs import SPR +from mmpose.registry import KEYPOINT_CODECS +from mmpose.testing import get_coco_sample +from mmpose.utils.tensor_utils import to_numpy, to_tensor + + +class TestSPR(TestCase): + + def setUp(self) -> None: + pass + + def _make_multi_instance_data(self, data): + keypoints = data['keypoints'] + keypoints_visible = data['keypoints_visible'] + + keypoints_visible[..., 0] = 0 + + keypoints_outside = keypoints - keypoints.max(axis=-1, keepdims=True) + keypoints_outside_visible = np.zeros(keypoints_visible.shape) + + keypoint_overlap = keypoints.mean( + axis=-1, keepdims=True) + 0.8 * ( + keypoints - keypoints.mean(axis=-1, keepdims=True)) + keypoint_overlap_visible = keypoints_visible + + data['keypoints'] = np.concatenate( + (keypoints, keypoints_outside, keypoint_overlap), axis=0) + data['keypoints_visible'] = np.concatenate( + (keypoints_visible, keypoints_outside_visible, + keypoint_overlap_visible), + axis=0) + + 
return data + + def test_build(self): + cfg = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=4, + ) + codec = KEYPOINT_CODECS.build(cfg) + self.assertIsInstance(codec, SPR) + + def test_encode(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + data = self._make_multi_instance_data(data) + + # w/o keypoint heatmaps + codec = SPR( + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=4, + ) + + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + + heatmaps = encoded['heatmaps'] + displacements = encoded['displacements'] + heatmap_weights = encoded['heatmap_weights'] + displacement_weights = encoded['displacement_weights'] + + self.assertEqual(heatmaps.shape, (1, 128, 128)) + self.assertEqual(heatmap_weights.shape, (1, 128, 128)) + self.assertEqual(displacements.shape, (34, 128, 128)) + self.assertEqual(displacement_weights.shape, (34, 128, 128)) + + # w/ keypoint heatmaps + with self.assertRaises(AssertionError): + codec = SPR( + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=4, + generate_keypoint_heatmaps=True, + ) + + codec = SPR( + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + generate_keypoint_heatmaps=True, + ) + + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + + heatmaps = encoded['heatmaps'] + displacements = encoded['displacements'] + heatmap_weights = encoded['heatmap_weights'] + displacement_weights = encoded['displacement_weights'] + + self.assertEqual(heatmaps.shape, (18, 128, 128)) + self.assertEqual(heatmap_weights.shape, (18, 128, 128)) + self.assertEqual(displacements.shape, (34, 128, 128)) + self.assertEqual(displacement_weights.shape, (34, 128, 128)) + + # root_type + with self.assertRaises(ValueError): + codec = SPR( + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, ), + root_type='box_center', + ) + encoded = codec.encode(data['keypoints'], + data['keypoints_visible']) + + codec = SPR( + 
input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, ), + root_type='bbox_center', + ) + + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + + heatmaps = encoded['heatmaps'] + displacements = encoded['displacements'] + heatmap_weights = encoded['heatmap_weights'] + displacement_weights = encoded['displacement_weights'] + + self.assertEqual(heatmaps.shape, (1, 128, 128)) + self.assertEqual(heatmap_weights.shape, (1, 128, 128)) + self.assertEqual(displacements.shape, (34, 128, 128)) + self.assertEqual(displacement_weights.shape, (34, 128, 128)) + + def test_decode(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + + # decode w/o keypoint heatmaps + codec = SPR( + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, ), + generate_keypoint_heatmaps=False, + ) + + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + decoded = codec.decode( + to_tensor(encoded['heatmaps']), + to_tensor(encoded['displacements'])) + + keypoints, (root_scores, keypoint_scores) = decoded + self.assertIsNone(keypoint_scores) + self.assertEqual(keypoints.shape, data['keypoints'].shape) + self.assertEqual(root_scores.shape, data['keypoints'].shape[:1]) + + # decode w/ keypoint heatmaps + codec = SPR( + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + generate_keypoint_heatmaps=True, + ) + + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + decoded = codec.decode( + to_tensor(encoded['heatmaps']), + to_tensor(encoded['displacements'])) + + keypoints, (root_scores, keypoint_scores) = decoded + self.assertIsNotNone(keypoint_scores) + self.assertEqual(keypoints.shape, data['keypoints'].shape) + self.assertEqual(root_scores.shape, data['keypoints'].shape[:1]) + self.assertEqual(keypoint_scores.shape, data['keypoints'].shape[:2]) + + def test_cicular_verification(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + + codec = SPR( + input_size=(512, 512), + 
heatmap_size=(128, 128), + sigma=(4, ), + generate_keypoint_heatmaps=False, + ) + + encoded = codec.encode(data['keypoints'], data['keypoints_visible']) + decoded = codec.decode( + to_tensor(encoded['heatmaps']), + to_tensor(encoded['displacements'])) + + keypoints, _ = decoded + self.assertTrue( + np.allclose(to_numpy(keypoints), data['keypoints'], atol=5.)) diff --git a/tests/test_codecs/test_udp_heatmap.py b/tests/test_codecs/test_udp_heatmap.py index 0e7d6990eb..81913ddee4 100644 --- a/tests/test_codecs/test_udp_heatmap.py +++ b/tests/test_codecs/test_udp_heatmap.py @@ -46,17 +46,17 @@ def test_encode(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, keypoint_weights = codec.encode(keypoints, - keypoints_visible) + encoded = codec.encode(keypoints, keypoints_visible) if codec.heatmap_type == 'combined': channel_per_kpt = 3 else: channel_per_kpt = 1 - self.assertEqual(heatmaps.shape, (channel_per_kpt * 17, 64, 48), + self.assertEqual(encoded['heatmaps'].shape, + (channel_per_kpt * 17, 64, 48), f'Failed case: "{name}"') - self.assertEqual(keypoint_weights.shape, + self.assertEqual(encoded['keypoint_weights'].shape, (1, 17)), f'Failed case: "{name}"' def test_decode(self): @@ -84,8 +84,8 @@ def test_cicular_verification(self): for name, cfg in self.configs: codec = KEYPOINT_CODECS.build(cfg) - heatmaps, k = codec.encode(keypoints, keypoints_visible) - _keypoints, _ = codec.decode(heatmaps) + encoded = codec.encode(keypoints, keypoints_visible) + _keypoints, _ = codec.decode(encoded['heatmaps']) self.assertTrue( np.allclose(keypoints, _keypoints, atol=10.), diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_animalpose_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_animalpose_dataset.py index 18ae409cae..9bb9725252 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_animalpose_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_animal_datasets/test_animalpose_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_animalpose_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -107,7 +107,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_animalpose_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_ap10k_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_ap10k_dataset.py index 1e3496b0b8..74ae89e960 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_ap10k_dataset.py +++ b/tests/test_datasets/test_datasets/test_animal_datasets/test_ap10k_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_ap10k_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -107,7 +107,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_ap10k_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_atrw_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_atrw_dataset.py index e93a76afdf..e1554b55c4 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_atrw_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_animal_datasets/test_atrw_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_atrw_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -106,7 +106,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_atrw_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_fly_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_fly_dataset.py index 71984377e4..9765e318db 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_fly_dataset.py +++ b/tests/test_datasets/test_datasets/test_animal_datasets/test_fly_dataset.py @@ -89,7 +89,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_fly_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -104,7 +104,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_fly_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_horse10_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_horse10_dataset.py index 38a8b985ee..39e32c1a7b 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_horse10_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_animal_datasets/test_horse10_dataset.py @@ -89,7 +89,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_horse10_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -105,7 +105,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 3) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_horse10_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 3) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_locust_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_locust_dataset.py index a3224c5959..3f48696a4b 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_locust_dataset.py +++ b/tests/test_datasets/test_datasets/test_animal_datasets/test_locust_dataset.py @@ -89,7 +89,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_locust_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -105,7 +105,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_locust_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_macaque_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_macaque_dataset.py index 92e54e8e00..1dee242812 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_macaque_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_animal_datasets/test_macaque_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_macaque_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -107,7 +107,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_macaque_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_animal_datasets/test_zebra_dataset.py b/tests/test_datasets/test_datasets/test_animal_datasets/test_zebra_dataset.py index 7abe7d53a5..c0a2db9a2a 100644 --- a/tests/test_datasets/test_datasets/test_animal_datasets/test_zebra_dataset.py +++ b/tests/test_datasets/test_datasets/test_animal_datasets/test_zebra_dataset.py @@ -89,7 +89,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_zebra_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -104,7 +104,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_zebra_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py index 1eaaa7cbf2..ae00a64393 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_body_datasets/test_aic_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_aic_dataset(data_mode='topdown') self.assertEqual(dataset.bbox_file, None) @@ -104,7 +104,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 9) self.check_data_info_keys(dataset[0], data_mode='topdown') - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_aic_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 3) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py index bd1b575735..de78264dae 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_coco_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_coco_dataset(data_mode='topdown') self.assertEqual(len(dataset), 12) @@ -118,7 +118,7 @@ def test_top_down(self): filter_cfg=dict(bbox_score_thr=0.3)) self.assertEqual(len(dataset), 33) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_coco_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 4) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py index b9543df1f1..8d63925257 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_body_datasets/test_crowdpose_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_crowdpose_dataset(data_mode='topdown') # filter an invalid instance due to num_keypoints = 0 @@ -121,7 +121,7 @@ def test_top_down(self): filter_cfg=dict(bbox_score_thr=0.97)) self.assertEqual(len(dataset), 5) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_crowdpose_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py index 280f3c3172..d7aa46b067 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_jhmdb_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_jhmdb_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -104,7 +104,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 3) self.check_data_info_keys(dataset[0], data_mode='topdown') - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_jhmdb_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 3) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py index e32b9af1f3..e93a524611 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_body_datasets/test_mhp_dataset.py @@ -94,7 +94,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_mhp_dataset(data_mode='topdown') self.assertEqual(dataset.bbox_file, None) @@ -107,7 +107,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0], data_mode='topdown') - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_mhp_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py index b46d816694..f6431af429 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_dataset.py @@ -104,7 +104,7 @@ def test_topdown(self): self.assertEqual(len(dataset), 5) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_mpii_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 5) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_trb_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_trb_dataset.py index 57a829bf94..bd64662ce3 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_trb_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_mpii_trb_dataset.py @@ -103,7 +103,7 @@ def test_topdown(self): self.assertEqual(len(dataset), 5) self.check_data_info_keys(dataset[0], data_mode='topdown') - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_mpii_trb_dataset(data_mode='bottomup') 
self.assertEqual(len(dataset), 5) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_ochuman_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_ochuman_dataset.py index 2712c74352..8e9f3ad532 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_ochuman_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_ochuman_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_ochuman_dataset(data_mode='topdown') self.assertEqual(len(dataset), 5) @@ -103,7 +103,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 5) self.check_data_info_keys(dataset[0], data_mode='topdown') - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_ochuman_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 3) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py index 438856ef03..ef3cd82dfb 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_posetrack18_dataset(data_mode='topdown') self.assertEqual(len(dataset), 14) @@ -119,7 +119,7 @@ def test_top_down(self): filter_cfg=dict(bbox_score_thr=0.3)) self.assertEqual(len(dataset), 119) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_posetrack18_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 
3) diff --git a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_video_dataset.py b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_video_dataset.py index b51a4df755..88b58e486d 100644 --- a/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_video_dataset.py +++ b/tests/test_datasets/test_datasets/test_body_datasets/test_posetrack18_video_dataset.py @@ -101,7 +101,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training, frame_sampler_mode = 'random' dataset = self.build_posetrack18_video_dataset( data_mode='topdown', frame_sampler_mode='random') @@ -164,7 +164,7 @@ def test_top_down(self): filter_cfg=dict(bbox_score_thr=0.3)) self.assertEqual(len(dataset), 119) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_posetrack18_video_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 3) diff --git a/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py b/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py new file mode 100644 index 0000000000..698f1f060d --- /dev/null +++ b/tests/test_datasets/test_datasets/test_dataset_wrappers/test_combined_dataset.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np + +from mmpose.datasets.dataset_wrappers import CombinedDataset + + +class TestCombinedDataset(TestCase): + + def build_combined_dataset(self, **kwargs): + + coco_cfg = dict( + type='CocoDataset', + ann_file='test_coco.json', + bbox_file=None, + data_mode='topdown', + data_root='tests/data/coco', + pipeline=[], + test_mode=False) + + aic_cfg = dict( + type='AicDataset', + ann_file='test_aic.json', + bbox_file=None, + data_mode='topdown', + data_root='tests/data/aic', + pipeline=[], + test_mode=False) + + cfg = dict( + metainfo=dict(from_file='configs/_base_/datasets/coco.py'), + datasets=[coco_cfg, aic_cfg], + pipeline=[]) + cfg.update(kwargs) + return CombinedDataset(**cfg) + + def check_data_info_keys(self, + data_info: dict, + data_mode: str = 'topdown'): + if data_mode == 'topdown': + expected_keys = dict( + img_id=int, + img_path=str, + bbox=np.ndarray, + bbox_score=np.ndarray, + keypoints=np.ndarray, + keypoints_visible=np.ndarray, + id=int) + elif data_mode == 'bottomup': + expected_keys = dict( + img_id=int, + img_path=str, + bbox=np.ndarray, + bbox_score=np.ndarray, + keypoints=np.ndarray, + keypoints_visible=np.ndarray, + invalid_segs=list, + id=list) + else: + raise ValueError(f'Invalid data_mode {data_mode}') + + for key, type_ in expected_keys.items(): + self.assertIn(key, data_info) + self.assertIsInstance(data_info[key], type_, key) + + def test_get_subset_index(self): + dataset = self.build_combined_dataset() + lens = dataset._lens + + with self.assertRaises(ValueError): + subset_idx, sample_idx = dataset._get_subset_index(sum(lens)) + + index = lens[0] + subset_idx, sample_idx = dataset._get_subset_index(index) + self.assertEqual(subset_idx, 1) + self.assertEqual(sample_idx, 0) + + index = -lens[1] - 1 + subset_idx, sample_idx = dataset._get_subset_index(index) + self.assertEqual(subset_idx, 0) + self.assertEqual(sample_idx, lens[0] - 1) + + def test_prepare_data(self): + dataset = 
self.build_combined_dataset() + lens = dataset._lens + + data_info = dataset[lens[0]] + self.check_data_info_keys(data_info) diff --git a/tests/test_datasets/test_datasets/test_face_datasets/test_aflw_dataset.py b/tests/test_datasets/test_datasets/test_face_datasets/test_aflw_dataset.py index 795a1812c7..dfc3a5ccce 100644 --- a/tests/test_datasets/test_datasets/test_face_datasets/test_aflw_dataset.py +++ b/tests/test_datasets/test_datasets/test_face_datasets/test_aflw_dataset.py @@ -86,7 +86,7 @@ def test_metainfo(self): self.assertEqual( len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_aflw_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -101,7 +101,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_aflw_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_face_datasets/test_coco_wholebody_face_dataset.py b/tests/test_datasets/test_datasets/test_face_datasets/test_coco_wholebody_face_dataset.py index d5943dc04d..7c296c64ad 100644 --- a/tests/test_datasets/test_datasets/test_face_datasets/test_coco_wholebody_face_dataset.py +++ b/tests/test_datasets/test_datasets/test_face_datasets/test_coco_wholebody_face_dataset.py @@ -87,7 +87,7 @@ def test_metainfo(self): # note that len(sigmas) may be zero if dataset.metainfo['sigmas'] = [] self.assertEqual(len(dataset.metainfo['sigmas']), num_keypoints) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_coco_wholebody_face_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -104,18 +104,18 @@ def test_top_down(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0]) - def 
test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_coco_wholebody_face_dataset(data_mode='bottomup') - # filter one invalid insances due to face_valid = false + # filter one invalid instance due to face_valid = false self.assertEqual(len(dataset), 3) self.check_data_info_keys(dataset[0], data_mode='bottomup') # test bottomup testing dataset = self.build_coco_wholebody_face_dataset( data_mode='bottomup', test_mode=True) - # filter invalid insances due to face_valid = false - self.assertEqual(len(dataset), 3) + # all images are used for evaluation + self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0], data_mode='bottomup') def test_exceptions_and_warnings(self): diff --git a/tests/test_datasets/test_datasets/test_face_datasets/test_cofw_dataset.py b/tests/test_datasets/test_datasets/test_face_datasets/test_cofw_dataset.py index a6ace1f07d..c8801a677a 100644 --- a/tests/test_datasets/test_datasets/test_face_datasets/test_cofw_dataset.py +++ b/tests/test_datasets/test_datasets/test_face_datasets/test_cofw_dataset.py @@ -84,7 +84,7 @@ def test_metainfo(self): self.assertEqual( len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_cofw_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -99,7 +99,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_cofw_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_face_datasets/test_face_300w_dataset.py b/tests/test_datasets/test_datasets/test_face_datasets/test_face_300w_dataset.py index 5cb8ff0d05..c45330536f 100644 --- a/tests/test_datasets/test_datasets/test_face_datasets/test_face_300w_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_face_datasets/test_face_300w_dataset.py @@ -86,7 +86,7 @@ def test_metainfo(self): self.assertEqual( len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_face_300w_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -102,7 +102,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_face_300w_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_face_datasets/test_wflw_dataset.py b/tests/test_datasets/test_datasets/test_face_datasets/test_wflw_dataset.py index 5970de1f73..aab0fd1813 100644 --- a/tests/test_datasets/test_datasets/test_face_datasets/test_wflw_dataset.py +++ b/tests/test_datasets/test_datasets/test_face_datasets/test_wflw_dataset.py @@ -86,7 +86,7 @@ def test_metainfo(self): self.assertEqual( len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_wflw_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -101,7 +101,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_wflw_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_fashion_datasets/test_deepfashion_dataset.py b/tests/test_datasets/test_datasets/test_fashion_datasets/test_deepfashion_dataset.py index 102e2c2480..2140a23467 100644 --- a/tests/test_datasets/test_datasets/test_fashion_datasets/test_deepfashion_dataset.py +++ 
b/tests/test_datasets/test_datasets/test_fashion_datasets/test_deepfashion_dataset.py @@ -111,7 +111,7 @@ def test_metainfo(self): self.assertEqual( len(dataset.metainfo['dataset_keypoint_weights']), num_keypoints) - def test_top_down(self): + def test_topdown(self): # test subset = 'full' topdown training dataset = self.build_deepfashion_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -127,7 +127,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 2) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test subset = 'full' bottomup training dataset = self.build_deepfashion_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 2) diff --git a/tests/test_datasets/test_datasets/test_hand_datasets/test_coco_wholebody_hand_dataset.py b/tests/test_datasets/test_datasets/test_hand_datasets/test_coco_wholebody_hand_dataset.py index 0cccef320b..6bb425bf81 100644 --- a/tests/test_datasets/test_datasets/test_hand_datasets/test_coco_wholebody_hand_dataset.py +++ b/tests/test_datasets/test_datasets/test_hand_datasets/test_coco_wholebody_hand_dataset.py @@ -92,7 +92,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_coco_wholebody_hand_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -110,7 +110,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 10) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_coco_wholebody_hand_dataset(data_mode='bottomup') # filter repeated images diff --git a/tests/test_datasets/test_datasets/test_hand_datasets/test_freihand_dataset.py b/tests/test_datasets/test_datasets/test_hand_datasets/test_freihand_dataset.py index 974f130f9c..046a2dd602 100644 --- 
a/tests/test_datasets/test_datasets/test_hand_datasets/test_freihand_dataset.py +++ b/tests/test_datasets/test_datasets/test_hand_datasets/test_freihand_dataset.py @@ -89,7 +89,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_freihand_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -105,7 +105,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 8) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_freihand_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 8) diff --git a/tests/test_datasets/test_datasets/test_hand_datasets/test_onehand10k_dataset.py b/tests/test_datasets/test_datasets/test_hand_datasets/test_onehand10k_dataset.py index b1e5526229..38ff6c4083 100644 --- a/tests/test_datasets/test_datasets/test_hand_datasets/test_onehand10k_dataset.py +++ b/tests/test_datasets/test_datasets/test_hand_datasets/test_onehand10k_dataset.py @@ -89,7 +89,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_onehand10k_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -105,7 +105,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_onehand10k_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 4) diff --git a/tests/test_datasets/test_datasets/test_hand_datasets/test_panoptic_hand2d_dataset.py b/tests/test_datasets/test_datasets/test_hand_datasets/test_panoptic_hand2d_dataset.py index c8496cab8f..665795c985 100644 --- 
a/tests/test_datasets/test_datasets/test_hand_datasets/test_panoptic_hand2d_dataset.py +++ b/tests/test_datasets/test_datasets/test_hand_datasets/test_panoptic_hand2d_dataset.py @@ -94,7 +94,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_panoptic_hand2d_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -110,7 +110,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 4) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_panoptic_hand2d_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 4) diff --git a/tests/test_datasets/test_datasets/test_hand_datasets/test_rhd2d_dataset.py b/tests/test_datasets/test_datasets/test_hand_datasets/test_rhd2d_dataset.py index 62cb435add..852966c8fd 100644 --- a/tests/test_datasets/test_datasets/test_hand_datasets/test_rhd2d_dataset.py +++ b/tests/test_datasets/test_datasets/test_hand_datasets/test_rhd2d_dataset.py @@ -88,7 +88,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_rhd2d_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -103,7 +103,7 @@ def test_top_down(self): self.assertEqual(len(dataset), 3) self.check_data_info_keys(dataset[0]) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_rhd2d_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 3) diff --git a/tests/test_datasets/test_datasets/test_wholebody_datasets/test_coco_wholebody_dataset.py b/tests/test_datasets/test_datasets/test_wholebody_datasets/test_coco_wholebody_dataset.py index 5f527ca027..a6ae20e534 100644 --- 
a/tests/test_datasets/test_datasets/test_wholebody_datasets/test_coco_wholebody_dataset.py +++ b/tests/test_datasets/test_datasets/test_wholebody_datasets/test_coco_wholebody_dataset.py @@ -91,7 +91,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_coco_wholebody_dataset(data_mode='topdown') # filter two invalid instances due to num_keypoints = 0 @@ -120,7 +120,7 @@ def test_top_down(self): filter_cfg=dict(bbox_score_thr=0.3)) self.assertEqual(len(dataset), 33) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_coco_wholebody_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 4) diff --git a/tests/test_datasets/test_datasets/test_wholebody_datasets/test_halpe_dataset.py b/tests/test_datasets/test_datasets/test_wholebody_datasets/test_halpe_dataset.py index 482624400a..9ea79fc2cc 100644 --- a/tests/test_datasets/test_datasets/test_wholebody_datasets/test_halpe_dataset.py +++ b/tests/test_datasets/test_datasets/test_wholebody_datasets/test_halpe_dataset.py @@ -90,7 +90,7 @@ def test_metainfo(self): len(dataset.metainfo['skeleton_links']), len(dataset.metainfo['skeleton_link_colors'])) - def test_top_down(self): + def test_topdown(self): # test topdown training dataset = self.build_halpe_dataset(data_mode='topdown') self.assertEqual(dataset.data_mode, 'topdown') @@ -121,7 +121,7 @@ def test_top_down(self): self.assertEqual(dataset.data_mode, 'topdown') self.assertEqual(len(dataset), 33) - def test_bottom_up(self): + def test_bottomup(self): # test bottomup training dataset = self.build_halpe_dataset(data_mode='bottomup') self.assertEqual(len(dataset), 4) diff --git a/tests/test_datasets/test_transforms/test_bottomup_transforms.py b/tests/test_datasets/test_transforms/test_bottomup_transforms.py index 988b1863ec..cded7a6efb 100644 --- 
a/tests/test_datasets/test_transforms/test_bottomup_transforms.py +++ b/tests/test_datasets/test_transforms/test_bottomup_transforms.py @@ -110,20 +110,20 @@ def test_transform(self): # single-scale, fit transform = BottomupResize(input_size=(256, 256), resize_mode='fit') results = transform(deepcopy(self.data_info)) - scale = 256 / 480 - expected_warp_mat = np.array([[scale, 0, 0], [0, scale, 0]]) - # the upper-half is the resized image content, and the lower-half is - # the padded zeros + # the middle section of the image is the resized content, while the + # top and bottom are padded with zeros self.assertEqual(results['img'].shape, (256, 256, 3)) - self.assertTrue(np.allclose(results['warp_mat'], expected_warp_mat)) - self.assertTrue(np.all(results['img'][:128] > 0)) - self.assertTrue(np.all(results['img'][128:] == 0)) + self.assertTrue( + np.allclose(results['input_scale'], np.array([480., 480.]))) + self.assertTrue( + np.allclose(results['input_center'], np.array([240., 120.]))) + self.assertTrue(np.all(results['img'][64:192] > 0)) + self.assertTrue(np.all(results['img'][:64] == 0)) + self.assertTrue(np.all(results['img'][192:] == 0)) # single-scale, expand transform = BottomupResize(input_size=(256, 256), resize_mode='expand') results = transform(deepcopy(self.data_info)) - scale = 256 / 240 - expected_warp_mat = np.array([[scale, 0, 0], [0, scale, 0]]) # the actual input size is expanded to (512, 256) according to the # original image shape self.assertEqual(results['img'].shape, (256, 512, 3)) @@ -138,9 +138,10 @@ def test_transform(self): # multi-scale transform = BottomupResize( - input_size=(256, 256), aux_scales=[1.5], resize_mode='fit') + input_size=(256, 256), aug_scales=[1.5], resize_mode='fit') results = transform(deepcopy(self.data_info)) self.assertIsInstance(results['img'], list) - self.assertIsInstance(results['warp_mat'], np.ndarray) + self.assertIsInstance(results['input_center'], np.ndarray) + self.assertIsInstance(results['input_scale'], 
np.ndarray) self.assertEqual(results['img'][0].shape, (256, 256, 3)) self.assertEqual(results['img'][1].shape, (384, 384, 3)) diff --git a/tests/test_datasets/test_transforms/test_common_transforms.py b/tests/test_datasets/test_transforms/test_common_transforms.py index 3412393678..2818081dca 100644 --- a/tests/test_datasets/test_transforms/test_common_transforms.py +++ b/tests/test_datasets/test_transforms/test_common_transforms.py @@ -465,7 +465,7 @@ def setUp(self): with_bbox_cs=True, with_img_mask=True) - def test_generate_heatmap(self): + def test_generate_single_target(self): encoder = dict( type='MSRAHeatmap', input_size=(192, 256), @@ -475,7 +475,7 @@ def test_generate_heatmap(self): # generate heatmap pipeline = Compose([ TopdownAffine(input_size=(192, 256)), - GenerateTarget(target_type='heatmap', encoder=encoder) + GenerateTarget(encoder=encoder) ]) results = pipeline(deepcopy(self.data_info)) @@ -487,7 +487,6 @@ def test_generate_heatmap(self): pipeline = Compose([ TopdownAffine(input_size=(192, 256)), GenerateTarget( - target_type='heatmap', encoder=encoder, use_dataset_keypoint_weights=True, ) @@ -500,7 +499,7 @@ def test_generate_heatmap(self): np.allclose(results['keypoint_weights'], self.data_info['dataset_keypoint_weights'][None])) - def test_generate_multilevel_heatmap(self): + def test_generate_multilevel_target(self): encoder_0 = dict( type='MSRAHeatmap', input_size=(192, 256), @@ -512,61 +511,92 @@ def test_generate_multilevel_heatmap(self): pipeline = Compose([ TopdownAffine(input_size=(192, 256)), GenerateTarget( - target_type='multilevel_heatmap', - encoder=[encoder_0, encoder_1]) + encoder=[encoder_0, encoder_1], + multilevel=True, + use_dataset_keypoint_weights=True) ]) results = pipeline(deepcopy(self.data_info)) self.assertTrue(is_list_of(results['heatmaps'], np.ndarray)) + self.assertTrue(is_list_of(results['keypoint_weights'], np.ndarray)) self.assertEqual(results['heatmaps'][0].shape, (17, 64, 48)) 
self.assertEqual(results['heatmaps'][1].shape, (17, 32, 24)) - self.assertEqual(results['keypoint_weights'].shape, (1, 2, 17)) - - def test_generate_keypoint_label(self): - encoder = dict(type='RegressionLabel', input_size=(192, 256)) + self.assertEqual(results['keypoint_weights'][0].shape, (1, 17)) - # generate keypoint label + def test_generate_combined_target(self): + encoder_0 = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2.0) + encoder_1 = dict(type='RegressionLabel', input_size=(192, 256)) + # generate multilevel heatmap pipeline = Compose([ TopdownAffine(input_size=(192, 256)), - GenerateTarget(target_type='keypoint_label', encoder=encoder) + GenerateTarget( + encoder=[encoder_0, encoder_1], + multilevel=False, + use_dataset_keypoint_weights=True) ]) results = pipeline(deepcopy(self.data_info)) + + self.assertEqual(results['heatmaps'].shape, (17, 64, 48)) self.assertEqual(results['keypoint_labels'].shape, (1, 17, 2)) - self.assertTrue( - np.allclose(results['keypoint_weights'], np.ones((1, 17)))) + self.assertIsInstance(results['keypoint_weights'], list) + self.assertEqual(results['keypoint_weights'][0].shape, (1, 17)) + + def test_errors(self): - # generate keypoint label and use meta keypoint weights + # single encoder with `multilevel=True` + encoder = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2.0) + + with self.assertRaisesRegex(AssertionError, + 'Need multiple encoder configs'): + _ = GenerateTarget(encoder=encoder, multilevel=True) + + # diverse keys in multilevel encoding + encoder_0 = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2.0) + + encoder_1 = dict(type='RegressionLabel', input_size=(192, 256)) pipeline = Compose([ TopdownAffine(input_size=(192, 256)), - GenerateTarget( - target_type='keypoint_label', - encoder=encoder, - use_dataset_keypoint_weights=True) + GenerateTarget(encoder=[encoder_0, encoder_1], 
multilevel=True) ]) - results = pipeline(deepcopy(self.data_info)) - self.assertEqual(results['keypoint_labels'].shape, (1, 17, 2)) - self.assertEqual(results['keypoint_weights'].shape, (1, 17)) - self.assertTrue( - np.allclose(results['keypoint_weights'], - self.data_info['dataset_keypoint_weights'][None])) + with self.assertRaisesRegex(ValueError, 'have the same keys'): + _ = pipeline(deepcopy(self.data_info)) - def test_generate_keypoint_xy_label(self): + # overlapping keys in combined encoding encoder = dict( - type='SimCCLabel', + type='MSRAHeatmap', input_size=(192, 256), - smoothing_type='gaussian', - simcc_split_ratio=2.0) + heatmap_size=(48, 64), + sigma=2.0) - # generate keypoint label pipeline = Compose([ TopdownAffine(input_size=(192, 256)), - GenerateTarget(target_type='keypoint_xy_label', encoder=encoder) + GenerateTarget(encoder=[encoder, encoder], multilevel=False) ]) - results = pipeline(deepcopy(self.data_info)) - self.assertEqual(results['keypoint_x_labels'].shape, (1, 17, 192 * 2)) - self.assertEqual(results['keypoint_y_labels'].shape, (1, 17, 256 * 2)) - self.assertTrue( - np.allclose(results['keypoint_weights'], np.ones((1, 17)))) + with self.assertRaisesRegex(ValueError, 'Overlapping item'): + _ = pipeline(deepcopy(self.data_info)) + + # deprecated argument `target_type` is given + encoder = dict( + type='MSRAHeatmap', + input_size=(192, 256), + heatmap_size=(48, 64), + sigma=2.0) + + with self.assertWarnsRegex(DeprecationWarning, + '`target_type` is deprecated'): + _ = GenerateTarget(encoder=encoder, target_type='heatmap') diff --git a/tests/test_datasets/test_transforms/test_converting.py b/tests/test_datasets/test_transforms/test_converting.py new file mode 100644 index 0000000000..f345a44063 --- /dev/null +++ b/tests/test_datasets/test_transforms/test_converting.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +from mmpose.datasets.transforms import KeypointConverter +from mmpose.testing import get_coco_sample + + +class TestKeypointConverter(TestCase): + + def setUp(self): + # prepare dummy bottom-up data sample with COCO metainfo + self.data_info = get_coco_sample( + img_shape=(240, 320), num_instances=4, with_bbox_cs=True) + + def test_transform(self): + mapping = [(3, 0), (6, 1), (16, 2), (5, 3)] + transform = KeypointConverter(num_keypoints=5, mapping=mapping) + results = transform(self.data_info.copy()) + + # check shape + self.assertEqual(results['keypoints'].shape[0], + self.data_info['keypoints'].shape[0]) + self.assertEqual(results['keypoints'].shape[1], 5) + self.assertEqual(results['keypoints'].shape[2], 2) + self.assertEqual(results['keypoints_visible'].shape[0], + self.data_info['keypoints_visible'].shape[0]) + self.assertEqual(results['keypoints_visible'].shape[1], 5) + + # check value + for source_index, target_index in mapping: + self.assertTrue((results['keypoints'][:, target_index] == + self.data_info['keypoints'][:, + source_index]).all()) + self.assertTrue( + (results['keypoints_visible'][:, target_index] == + self.data_info['keypoints_visible'][:, source_index]).all()) diff --git a/tests/test_datasets/test_transforms/test_loading.py b/tests/test_datasets/test_transforms/test_loading.py new file mode 100644 index 0000000000..0a63003c75 --- /dev/null +++ b/tests/test_datasets/test_transforms/test_loading.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np +from mmcv import imread + +from mmpose.datasets.transforms.loading import LoadImage + + +class TestLoadImage(TestCase): + + def test_load_image(self): + + transform = LoadImage() + results = dict(img_path='tests/data/coco/000000000785.jpg') + + results = transform(results) + + self.assertIsInstance(results['img'], np.ndarray) + + def test_with_input_image(self): + transform = LoadImage(to_float32=True) + + img_path = 'tests/data/coco/000000000785.jpg' + results = dict( + img_path=img_path, img=imread(img_path).astype(np.uint8)) + + results = transform(results) + + self.assertIsInstance(results['img'], np.ndarray) + self.assertTrue(results['img'].dtype, np.float32) diff --git a/tests/test_engine/test_hooks/test_visualization_hook.py b/tests/test_engine/test_hooks/test_visualization_hook.py index 0081e53791..3e4a202198 100644 --- a/tests/test_engine/test_hooks/test_visualization_hook.py +++ b/tests/test_engine/test_hooks/test_visualization_hook.py @@ -27,7 +27,7 @@ def _rand_poses(num_boxes, h, w): class TestVisualizationHook(TestCase): def setUp(self) -> None: - PoseLocalVisualizer.get_instance('visualizer') + PoseLocalVisualizer.get_instance('test_visualization_hook') data_sample = PoseDataSample() data_sample.set_metainfo({ @@ -35,7 +35,7 @@ def setUp(self) -> None: osp.join( osp.dirname(__file__), '../../data/coco/000000000785.jpg') }) - self.data_batch = [{'data_sample': data_sample}] * 2 + self.data_batch = {'data_samples': [data_sample] * 2} pred_instances = InstanceData() pred_instances.keypoints = _rand_poses(5, 10, 12) @@ -48,7 +48,7 @@ def test_after_val_iter(self): runner = MagicMock() runner.iter = 1 runner.val_evaluator.dataset_meta = dict() - hook = PoseVisualizationHook(interval=1) + hook = PoseVisualizationHook(interval=1, enable=True) hook.after_val_iter(runner, 1, self.data_batch, self.outputs) def test_after_test_iter(self): diff --git a/tests/test_evaluation/test_functional/test_nms.py 
b/tests/test_evaluation/test_functional/test_nms.py new file mode 100644 index 0000000000..b29ed86ccb --- /dev/null +++ b/tests/test_evaluation/test_functional/test_nms.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np + +from mmpose.evaluation.functional.nms import nearby_joints_nms + + +class TestNearbyJointsNMS(TestCase): + + def test_nearby_joints_nms(self): + + kpts_db = [] + keep_pose_inds = nearby_joints_nms( + kpts_db, 0.05, score_per_joint=True, max_dets=1) + self.assertEqual(len(keep_pose_inds), 0) + + kpts_db = [] + for _ in range(5): + kpts_db.append( + dict(keypoints=np.random.rand(3, 2), score=np.random.rand(3))) + keep_pose_inds = nearby_joints_nms( + kpts_db, 0.05, score_per_joint=True, max_dets=1) + self.assertEqual(len(keep_pose_inds), 1) + self.assertLess(keep_pose_inds[0], 5) + + kpts_db = [] + for _ in range(5): + kpts_db.append( + dict(keypoints=np.random.rand(3, 2), score=np.random.rand())) + keep_pose_inds = nearby_joints_nms( + kpts_db, 0.05, num_nearby_joints_thr=2) + self.assertLessEqual(len(keep_pose_inds), 5) + self.assertGreater(len(keep_pose_inds), 0) + + with self.assertRaises(AssertionError): + _ = nearby_joints_nms(kpts_db, 0, num_nearby_joints_thr=2) + + with self.assertRaises(AssertionError): + _ = nearby_joints_nms(kpts_db, 0.05, num_nearby_joints_thr=3) diff --git a/tests/test_evaluation/test_metrics/test_coco_metric.py b/tests/test_evaluation/test_metrics/test_coco_metric.py index bdffc9c760..82bf0bc572 100644 --- a/tests/test_evaluation/test_metrics/test_coco_metric.py +++ b/tests/test_evaluation/test_metrics/test_coco_metric.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import copy import os.path as osp import tempfile from collections import defaultdict @@ -6,6 +7,7 @@ import numpy as np from mmengine.fileio import dump, load +from xtcocotools.coco import COCO from mmpose.datasets.datasets.utils import parse_pose_metainfo from mmpose.evaluation.metrics import CocoMetric @@ -20,17 +22,21 @@ def setUp(self): tearDown() -> cleanUp() """ self.tmp_dir = tempfile.TemporaryDirectory() - self.ann_file = 'tests/data/coco/test_coco.json' - coco_meta_info = dict(from_file='configs/_base_/datasets/coco.py') - self.coco_dataset_meta = parse_pose_metainfo(coco_meta_info) - self.db = load(self.ann_file) - - self.topdown_data = self._convert_ann_to_topdown_batch_data() - assert len(self.topdown_data) == 14 - self.bottomup_data = self._convert_ann_to_bottomup_batch_data() - assert len(self.bottomup_data) == 4 - self.target = { + self.ann_file_coco = 'tests/data/coco/test_coco.json' + meta_info_coco = dict(from_file='configs/_base_/datasets/coco.py') + self.dataset_meta_coco = parse_pose_metainfo(meta_info_coco) + self.coco = COCO(self.ann_file_coco) + self.dataset_meta_coco['CLASSES'] = self.coco.loadCats( + self.coco.getCatIds()) + + self.topdown_data_coco = self._convert_ann_to_topdown_batch_data( + self.ann_file_coco) + assert len(self.topdown_data_coco) == 14 + self.bottomup_data_coco = self._convert_ann_to_bottomup_batch_data( + self.ann_file_coco) + assert len(self.bottomup_data_coco) == 4 + self.target_coco = { 'coco/AP': 1.0, 'coco/AP .5': 1.0, 'coco/AP .75': 1.0, @@ -43,10 +49,68 @@ def setUp(self): 'coco/AR (L)': 1.0, } - def _convert_ann_to_topdown_batch_data(self): + self.ann_file_crowdpose = 'tests/data/crowdpose/test_crowdpose.json' + self.coco_crowdpose = COCO(self.ann_file_crowdpose) + meta_info_crowdpose = dict( + from_file='configs/_base_/datasets/crowdpose.py') + self.dataset_meta_crowdpose = parse_pose_metainfo(meta_info_crowdpose) + self.dataset_meta_crowdpose['CLASSES'] = self.coco_crowdpose.loadCats( + 
self.coco_crowdpose.getCatIds()) + + self.topdown_data_crowdpose = self._convert_ann_to_topdown_batch_data( + self.ann_file_crowdpose) + assert len(self.topdown_data_crowdpose) == 5 + self.bottomup_data_crowdpose = \ + self._convert_ann_to_bottomup_batch_data(self.ann_file_crowdpose) + assert len(self.bottomup_data_crowdpose) == 2 + + self.target_crowdpose = { + 'crowdpose/AP': 1.0, + 'crowdpose/AP .5': 1.0, + 'crowdpose/AP .75': 1.0, + 'crowdpose/AR': 1.0, + 'crowdpose/AR .5': 1.0, + 'crowdpose/AR .75': 1.0, + 'crowdpose/AP(E)': -1.0, + 'crowdpose/AP(M)': 1.0, + 'crowdpose/AP(H)': -1.0, + } + + self.ann_file_ap10k = 'tests/data/ap10k/test_ap10k.json' + self.coco_ap10k = COCO(self.ann_file_ap10k) + meta_info_ap10k = dict(from_file='configs/_base_/datasets/ap10k.py') + self.dataset_meta_ap10k = parse_pose_metainfo(meta_info_ap10k) + self.dataset_meta_ap10k['CLASSES'] = self.coco_ap10k.loadCats( + self.coco_ap10k.getCatIds()) + + self.topdown_data_ap10k = self._convert_ann_to_topdown_batch_data( + self.ann_file_ap10k) + assert len(self.topdown_data_ap10k) == 2 + self.bottomup_data_ap10k = self._convert_ann_to_bottomup_batch_data( + self.ann_file_ap10k) + assert len(self.bottomup_data_ap10k) == 2 + + self.target_ap10k = { + 'coco/AP': 1.0, + 'coco/AP .5': 1.0, + 'coco/AP .75': 1.0, + 'coco/AP (M)': -1.0, + 'coco/AP (L)': 1.0, + 'coco/AR': 1.0, + 'coco/AR .5': 1.0, + 'coco/AR .75': 1.0, + 'coco/AR (M)': -1.0, + 'coco/AR (L)': 1.0, + } + + def _convert_ann_to_topdown_batch_data(self, ann_file): """Convert annotations to topdown-style batch data.""" topdown_data = [] - for ann in self.db['annotations']: + db = load(ann_file) + imgid2info = dict() + for img in db['images']: + imgid2info[img['id']] = img + for ann in db['annotations']: w, h = ann['bbox'][2], ann['bbox'][3] bboxes = np.array(ann['bbox'], dtype=np.float32).reshape(-1, 4) bbox_scales = np.array([w * 1.25, h * 1.25]).reshape(-1, 2) @@ -66,9 +130,19 @@ def _convert_ann_to_topdown_batch_data(self): data_sample = 
{ 'id': ann['id'], 'img_id': ann['image_id'], + 'category_id': ann.get('category_id', 1), 'gt_instances': gt_instances, - 'pred_instances': pred_instances + 'pred_instances': pred_instances, + # dummy image_shape for testing + 'ori_shape': [640, 480], + # store the raw annotation info to test without ann_file + 'raw_ann_info': copy.deepcopy(ann), } + + # add crowd_index to data_sample if it is present in the image_info + if 'crowdIndex' in imgid2info[ann['image_id']]: + data_sample['crowd_index'] = imgid2info[ + ann['image_id']]['crowdIndex'] # batch size = 1 data_batch = [data] data_samples = [data_sample] @@ -76,10 +150,11 @@ def _convert_ann_to_topdown_batch_data(self): return topdown_data - def _convert_ann_to_bottomup_batch_data(self): + def _convert_ann_to_bottomup_batch_data(self, ann_file): """Convert annotations to bottomup-style batch data.""" img2ann = defaultdict(list) - for ann in self.db['annotations']: + db = load(ann_file) + for ann in db['annotations']: img2ann[ann['image_id']].append(ann) bottomup_data = [] @@ -118,129 +193,57 @@ def test_init(self): # test score_mode option with self.assertRaisesRegex(ValueError, '`score_mode` should be one of'): - _ = CocoMetric(ann_file=self.ann_file, score_mode='keypoint') + _ = CocoMetric(ann_file=self.ann_file_coco, score_mode='invalid') # test nms_mode option with self.assertRaisesRegex(ValueError, '`nms_mode` should be one of'): - _ = CocoMetric(ann_file=self.ann_file, nms_mode='invalid') + _ = CocoMetric(ann_file=self.ann_file_coco, nms_mode='invalid') # test format_only option with self.assertRaisesRegex( AssertionError, '`outfile_prefix` can not be None when `format_only` is True'): _ = CocoMetric( - ann_file=self.ann_file, format_only=True, outfile_prefix=None) - - def test_topdown_evaluate(self): - """test topdown-style COCO metric evaluation.""" - # case 1: score_mode='bbox', nms_mode='none' - coco_metric = CocoMetric( - ann_file=self.ann_file, - outfile_prefix=f'{self.tmp_dir.name}/test', - 
score_mode='bbox', - nms_mode='none') - coco_metric.dataset_meta = self.coco_dataset_meta - - # process samples - for data_batch, data_samples in self.topdown_data: - coco_metric.process(data_batch, data_samples) - - eval_results = coco_metric.evaluate(size=len(self.topdown_data)) - - self.assertDictEqual(eval_results, self.target) - self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) - - # case 2: score_mode='bbox_keypoint', nms_mode='oks_nms' - coco_metric = CocoMetric( - ann_file=self.ann_file, - outfile_prefix=f'{self.tmp_dir.name}/test', - score_mode='bbox_keypoint', - nms_mode='oks_nms') - coco_metric.dataset_meta = self.coco_dataset_meta - - # process samples - for data_batch, data_samples in self.topdown_data: - coco_metric.process(data_batch, data_samples) - - eval_results = coco_metric.evaluate(size=len(self.topdown_data)) - - self.assertDictEqual(eval_results, self.target) - self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) - - # case 3: score_mode='bbox_rle', nms_mode='soft_oks_nms' - coco_metric = CocoMetric( - ann_file=self.ann_file, - outfile_prefix=f'{self.tmp_dir.name}/test', - score_mode='bbox_rle', - nms_mode='soft_oks_nms') - coco_metric.dataset_meta = self.coco_dataset_meta - - # process samples - for data_batch, data_samples in self.topdown_data: - coco_metric.process(data_batch, data_samples) - - eval_results = coco_metric.evaluate(size=len(self.topdown_data)) - - self.assertDictEqual(eval_results, self.target) - self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) - - def test_bottomup_evaluate(self): - """test bottomup-style COCO metric evaluation.""" - # case1: score_mode='bbox', nms_mode='none' - coco_metric = CocoMetric( - ann_file=self.ann_file, - outfile_prefix=f'{self.tmp_dir.name}/test', - score_mode='bbox', - nms_mode='none') - coco_metric.dataset_meta = self.coco_dataset_meta - - # process samples - for data_batch, data_samples in 
self.bottomup_data: - coco_metric.process(data_batch, data_samples) - - eval_results = coco_metric.evaluate(size=len(self.bottomup_data)) - self.assertDictEqual(eval_results, self.target) - self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + ann_file=self.ann_file_coco, + format_only=True, + outfile_prefix=None) def test_other_methods(self): """test other useful methods.""" # test `_sort_and_unique_bboxes` method - coco_metric = CocoMetric( - ann_file=self.ann_file, score_mode='bbox', nms_mode='none') - coco_metric.dataset_meta = self.coco_dataset_meta + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, score_mode='bbox', nms_mode='none') + metric_coco.dataset_meta = self.dataset_meta_coco # process samples - for data_batch, data_samples in self.topdown_data: - coco_metric.process(data_batch, data_samples) + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) # process one extra sample - data_batch, data_samples = self.topdown_data[0] - coco_metric.process(data_batch, data_samples) + data_batch, data_samples = self.topdown_data_coco[0] + metric_coco.process(data_batch, data_samples) # an extra sample - eval_results = coco_metric.evaluate(size=len(self.topdown_data) + 1) - self.assertDictEqual(eval_results, self.target) + eval_results = metric_coco.evaluate( + size=len(self.topdown_data_coco) + 1) + self.assertDictEqual(eval_results, self.target_coco) def test_format_only(self): """test `format_only` option.""" - coco_metric = CocoMetric( - ann_file=self.ann_file, + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, format_only=True, outfile_prefix=f'{self.tmp_dir.name}/test', score_mode='bbox_keypoint', nms_mode='oks_nms') - coco_metric.dataset_meta = self.coco_dataset_meta + metric_coco.dataset_meta = self.dataset_meta_coco # process one sample - data_batch, data_samples = self.topdown_data[0] - coco_metric.process(data_batch, data_samples) - eval_results = 
coco_metric.evaluate(size=1) + data_batch, data_samples = self.topdown_data_coco[0] + metric_coco.process(data_batch, data_samples) + eval_results = metric_coco.evaluate(size=1) self.assertDictEqual(eval_results, {}) self.assertTrue( osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) # test when gt annotations are absent - db_ = load(self.ann_file) + db_ = load(self.ann_file_coco) del db_['annotations'] tmp_ann_file = osp.join(self.tmp_dir.name, 'temp_ann.json') dump(db_, tmp_ann_file, sort_keys=True, indent=4) @@ -249,11 +252,32 @@ def test_format_only(self): 'Ground truth annotations are required for evaluation'): _ = CocoMetric(ann_file=tmp_ann_file, format_only=False) + def test_bottomup_evaluate(self): + """test bottomup-style COCO metric evaluation.""" + # case1: score_mode='bbox', nms_mode='none' + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test', + score_mode='bbox', + nms_mode='none') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.bottomup_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.bottomup_data_coco)) + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + def test_topdown_alignment(self): """Test whether the output of CocoMetric and the original TopDownCocoDataset are the same.""" topdown_data = [] - for ann in self.db['annotations']: + db = load(self.ann_file_coco) + + for ann in db['annotations']: w, h = ann['bbox'][2], ann['bbox'][3] bboxes = np.array(ann['bbox'], dtype=np.float32).reshape(-1, 4) bbox_scales = np.array([w * 1.25, h * 1.25]).reshape(-1, 2) @@ -288,18 +312,18 @@ def test_topdown_alignment(self): # case 1: # typical setting: score_mode='bbox_keypoint', nms_mode='oks_nms' - coco_metric = CocoMetric( - ann_file=self.ann_file, - 
outfile_prefix=f'{self.tmp_dir.name}/test', + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test_align1', score_mode='bbox_keypoint', nms_mode='oks_nms') - coco_metric.dataset_meta = self.coco_dataset_meta + metric_coco.dataset_meta = self.dataset_meta_coco # process samples for data_batch, data_samples in topdown_data: - coco_metric.process(data_batch, data_samples) + metric_coco.process(data_batch, data_samples) - eval_results = coco_metric.evaluate(size=len(topdown_data)) + eval_results = metric_coco.evaluate(size=len(topdown_data)) target = { 'coco/AP': 0.5287458745874587, @@ -318,21 +342,22 @@ def test_topdown_alignment(self): self.assertAlmostEqual(eval_results[key], target[key]) self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + osp.isfile( + osp.join(self.tmp_dir.name, 'test_align1.keypoints.json'))) # case 2: score_mode='bbox_rle', nms_mode='oks_nms' - coco_metric = CocoMetric( - ann_file=self.ann_file, - outfile_prefix=f'{self.tmp_dir.name}/test', + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test_align2', score_mode='bbox_rle', nms_mode='oks_nms') - coco_metric.dataset_meta = self.coco_dataset_meta + metric_coco.dataset_meta = self.dataset_meta_coco # process samples for data_batch, data_samples in topdown_data: - coco_metric.process(data_batch, data_samples) + metric_coco.process(data_batch, data_samples) - eval_results = coco_metric.evaluate(size=len(topdown_data)) + eval_results = metric_coco.evaluate(size=len(topdown_data)) target = { 'coco/AP': 0.5004950495049505, @@ -351,10 +376,12 @@ def test_topdown_alignment(self): self.assertAlmostEqual(eval_results[key], target[key]) self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + osp.isfile( + osp.join(self.tmp_dir.name, 'test_align2.keypoints.json'))) + # case 3: score_mode='bbox_keypoint', nms_mode='soft_oks_nms' topdown_data = [] - anns 
= self.db['annotations'] + anns = db['annotations'] for i, ann in enumerate(anns): w, h = ann['bbox'][2], ann['bbox'][3] bboxes = np.array(ann['bbox'], dtype=np.float32).reshape(-1, 4) @@ -414,21 +441,20 @@ def test_topdown_alignment(self): data_samples = [data_sample0, data_sample1] topdown_data.append((data_batch, data_samples)) - # case 3: score_mode='bbox_keypoint', nms_mode='soft_oks_nms' - coco_metric = CocoMetric( - ann_file=self.ann_file, - outfile_prefix=f'{self.tmp_dir.name}/test', + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test_align3', score_mode='bbox_keypoint', keypoint_score_thr=0.2, nms_thr=0.9, nms_mode='soft_oks_nms') - coco_metric.dataset_meta = self.coco_dataset_meta + metric_coco.dataset_meta = self.dataset_meta_coco # process samples for data_batch, data_samples in topdown_data: - coco_metric.process(data_batch, data_samples) + metric_coco.process(data_batch, data_samples) - eval_results = coco_metric.evaluate(size=len(topdown_data) * 2) + eval_results = metric_coco.evaluate(size=len(topdown_data) * 2) target = { 'coco/AP': 0.17073707370737073, @@ -447,4 +473,140 @@ def test_topdown_alignment(self): self.assertAlmostEqual(eval_results[key], target[key]) self.assertTrue( - osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + osp.isfile( + osp.join(self.tmp_dir.name, 'test_align3.keypoints.json'))) + + def test_topdown_evaluate(self): + """test topdown-style COCO metric evaluation.""" + # case 1: score_mode='bbox', nms_mode='none' + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test1', + score_mode='bbox', + nms_mode='none') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + + self.assertDictEqual(eval_results, self.target_coco) 
+ self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test1.keypoints.json'))) + + # case 2: score_mode='bbox_keypoint', nms_mode='oks_nms' + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test2', + score_mode='bbox_keypoint', + nms_mode='oks_nms') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test2.keypoints.json'))) + + # case 3: score_mode='bbox_rle', nms_mode='soft_oks_nms' + metric_coco = CocoMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test3', + score_mode='bbox_rle', + nms_mode='soft_oks_nms') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test3.keypoints.json'))) + + # case 4: test without providing ann_file + metric_coco = CocoMetric(outfile_prefix=f'{self.tmp_dir.name}/test4') + metric_coco.dataset_meta = self.dataset_meta_coco + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + self.assertDictEqual(eval_results, self.target_coco) + # test whether convert the annotation to COCO format + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test4.gt.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test4.keypoints.json'))) + + # case 5: test Crowdpose dataset + 
metric_crowdpose = CocoMetric( + ann_file=self.ann_file_crowdpose, + outfile_prefix=f'{self.tmp_dir.name}/test5', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') + metric_crowdpose.dataset_meta = self.dataset_meta_crowdpose + # process samples + for data_batch, data_samples in self.topdown_data_crowdpose: + metric_crowdpose.process(data_batch, data_samples) + eval_results = metric_crowdpose.evaluate( + size=len(self.topdown_data_crowdpose)) + self.assertDictEqual(eval_results, self.target_crowdpose) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test5.keypoints.json'))) + + # case 6: test Crowdpose dataset + without ann_file + metric_crowdpose = CocoMetric( + outfile_prefix=f'{self.tmp_dir.name}/test6', + use_area=False, + iou_type='keypoints_crowd', + prefix='crowdpose') + metric_crowdpose.dataset_meta = self.dataset_meta_crowdpose + # process samples + for data_batch, data_samples in self.topdown_data_crowdpose: + metric_crowdpose.process(data_batch, data_samples) + eval_results = metric_crowdpose.evaluate( + size=len(self.topdown_data_crowdpose)) + self.assertDictEqual(eval_results, self.target_crowdpose) + # test whether convert the annotation to COCO format + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test6.gt.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test6.keypoints.json'))) + + # case 7: test AP10k dataset + metric_ap10k = CocoMetric( + ann_file=self.ann_file_ap10k, + outfile_prefix=f'{self.tmp_dir.name}/test7') + metric_ap10k.dataset_meta = self.dataset_meta_ap10k + # process samples + for data_batch, data_samples in self.topdown_data_ap10k: + metric_ap10k.process(data_batch, data_samples) + eval_results = metric_ap10k.evaluate(size=len(self.topdown_data_ap10k)) + for key in self.target_ap10k: + self.assertAlmostEqual(eval_results[key], self.target_ap10k[key]) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test7.keypoints.json'))) + + # case 8: test Crowdpose 
dataset + without ann_file + metric_ap10k = CocoMetric(outfile_prefix=f'{self.tmp_dir.name}/test8') + metric_ap10k.dataset_meta = self.dataset_meta_ap10k + # process samples + for data_batch, data_samples in self.topdown_data_ap10k: + metric_ap10k.process(data_batch, data_samples) + eval_results = metric_ap10k.evaluate(size=len(self.topdown_data_ap10k)) + for key in self.target_ap10k: + self.assertAlmostEqual(eval_results[key], self.target_ap10k[key]) + # test whether convert the annotation to COCO format + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test8.gt.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test8.keypoints.json'))) diff --git a/tests/test_evaluation/test_metrics/test_coco_wholebody_metric.py b/tests/test_evaluation/test_metrics/test_coco_wholebody_metric.py new file mode 100644 index 0000000000..46e8498851 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_coco_wholebody_metric.py @@ -0,0 +1,294 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import tempfile +from collections import defaultdict +from unittest import TestCase + +import numpy as np +from mmengine.fileio import dump, load +from xtcocotools.coco import COCO + +from mmpose.datasets.datasets.utils import parse_pose_metainfo +from mmpose.evaluation.metrics import CocoWholeBodyMetric + + +class TestCocoWholeBodyMetric(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.tmp_dir = tempfile.TemporaryDirectory() + + self.ann_file_coco = 'tests/data/coco/test_coco_wholebody.json' + meta_info_coco = dict( + from_file='configs/_base_/datasets/coco_wholebody.py') + self.dataset_meta_coco = parse_pose_metainfo(meta_info_coco) + self.coco = COCO(self.ann_file_coco) + self.dataset_meta_coco['CLASSES'] = self.coco.loadCats( + self.coco.getCatIds()) + + self.topdown_data_coco = self._convert_ann_to_topdown_batch_data( + self.ann_file_coco) + assert len(self.topdown_data_coco) == 14 + self.bottomup_data_coco = self._convert_ann_to_bottomup_batch_data( + self.ann_file_coco) + assert len(self.bottomup_data_coco) == 4 + self.target_coco = { + 'coco-wholebody/AP': 1.0, + 'coco-wholebody/AP .5': 1.0, + 'coco-wholebody/AP .75': 1.0, + 'coco-wholebody/AP (M)': 1.0, + 'coco-wholebody/AP (L)': 1.0, + 'coco-wholebody/AR': 1.0, + 'coco-wholebody/AR .5': 1.0, + 'coco-wholebody/AR .75': 1.0, + 'coco-wholebody/AR (M)': 1.0, + 'coco-wholebody/AR (L)': 1.0, + } + + def _convert_ann_to_topdown_batch_data(self, ann_file): + """Convert annotations to topdown-style batch data.""" + topdown_data = [] + db = load(ann_file) + imgid2info = dict() + for img in db['images']: + imgid2info[img['id']] = img + for ann in db['annotations']: + w, h = ann['bbox'][2], ann['bbox'][3] + bboxes = np.array(ann['bbox'], dtype=np.float32).reshape(-1, 4) + bbox_scales = np.array([w * 1.25, h * 1.25]).reshape(-1, 2) + _keypoints = np.array(ann['keypoints'] + ann['foot_kpts'] + + ann['face_kpts'] + ann['lefthand_kpts'] + + ann['righthand_kpts']).reshape(1, -1, 3) + + gt_instances = { + 'bbox_scales': bbox_scales, + 'bbox_scores': np.ones((1, ), dtype=np.float32), + 'bboxes': bboxes, + } + pred_instances = { + 'keypoints': _keypoints[..., :2], + 'keypoint_scores': _keypoints[..., -1], + } + + data = {'inputs': None} + data_sample = { + 'id': ann['id'], + 'img_id': 
ann['image_id'], + 'category_id': ann.get('category_id', 1), + 'gt_instances': gt_instances, + 'pred_instances': pred_instances, + # dummy image_shape for testing + 'ori_shape': [640, 480], + # store the raw annotation info to test without ann_file + 'raw_ann_info': copy.deepcopy(ann), + } + + # batch size = 1 + data_batch = [data] + data_samples = [data_sample] + topdown_data.append((data_batch, data_samples)) + + return topdown_data + + def _convert_ann_to_bottomup_batch_data(self, ann_file): + """Convert annotations to bottomup-style batch data.""" + img2ann = defaultdict(list) + db = load(ann_file) + for ann in db['annotations']: + img2ann[ann['image_id']].append(ann) + + bottomup_data = [] + for img_id, anns in img2ann.items(): + _keypoints = [] + for ann in anns: + _keypoints.append(ann['keypoints'] + ann['foot_kpts'] + + ann['face_kpts'] + ann['lefthand_kpts'] + + ann['righthand_kpts']) + keypoints = np.array(_keypoints).reshape((len(anns), -1, 3)) + + gt_instances = { + 'bbox_scores': np.ones((len(anns)), dtype=np.float32) + } + + pred_instances = { + 'keypoints': keypoints[..., :2], + 'keypoint_scores': keypoints[..., -1], + } + + data = {'inputs': None} + data_sample = { + 'id': [ann['id'] for ann in anns], + 'img_id': img_id, + 'gt_instances': gt_instances, + 'pred_instances': pred_instances + } + + # batch size = 1 + data_batch = [data] + data_samples = [data_sample] + bottomup_data.append((data_batch, data_samples)) + return bottomup_data + + def tearDown(self): + self.tmp_dir.cleanup() + + def test_init(self): + """test metric init method.""" + # test score_mode option + with self.assertRaisesRegex(ValueError, + '`score_mode` should be one of'): + _ = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, score_mode='invalid') + + # test nms_mode option + with self.assertRaisesRegex(ValueError, '`nms_mode` should be one of'): + _ = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, nms_mode='invalid') + + # test format_only option + with 
self.assertRaisesRegex( + AssertionError, + '`outfile_prefix` can not be None when `format_only` is True'): + _ = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, + format_only=True, + outfile_prefix=None) + + def test_other_methods(self): + """test other useful methods.""" + # test `_sort_and_unique_bboxes` method + metric_coco = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, score_mode='bbox', nms_mode='none') + metric_coco.dataset_meta = self.dataset_meta_coco + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + # process one extra sample + data_batch, data_samples = self.topdown_data_coco[0] + metric_coco.process(data_batch, data_samples) + # an extra sample + eval_results = metric_coco.evaluate( + size=len(self.topdown_data_coco) + 1) + self.assertDictEqual(eval_results, self.target_coco) + + def test_format_only(self): + """test `format_only` option.""" + metric_coco = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, + format_only=True, + outfile_prefix=f'{self.tmp_dir.name}/test', + score_mode='bbox_keypoint', + nms_mode='oks_nms') + metric_coco.dataset_meta = self.dataset_meta_coco + # process one sample + data_batch, data_samples = self.topdown_data_coco[0] + metric_coco.process(data_batch, data_samples) + eval_results = metric_coco.evaluate(size=1) + self.assertDictEqual(eval_results, {}) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + + # test when gt annotations are absent + db_ = load(self.ann_file_coco) + del db_['annotations'] + tmp_ann_file = osp.join(self.tmp_dir.name, 'temp_ann.json') + dump(db_, tmp_ann_file, sort_keys=True, indent=4) + with self.assertRaisesRegex( + AssertionError, + 'Ground truth annotations are required for evaluation'): + _ = CocoWholeBodyMetric(ann_file=tmp_ann_file, format_only=False) + + def test_bottomup_evaluate(self): + """test bottomup-style COCO metric evaluation.""" + # case1: 
score_mode='bbox', nms_mode='none' + metric_coco = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test', + score_mode='bbox', + nms_mode='none') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.bottomup_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.bottomup_data_coco)) + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test.keypoints.json'))) + + def test_topdown_evaluate(self): + """test topdown-style COCO metric evaluation.""" + # case 1: score_mode='bbox', nms_mode='none' + metric_coco = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test1', + score_mode='bbox', + nms_mode='none') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test1.keypoints.json'))) + + # case 2: score_mode='bbox_keypoint', nms_mode='oks_nms' + metric_coco = CocoWholeBodyMetric( + ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test2', + score_mode='bbox_keypoint', + nms_mode='oks_nms') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test2.keypoints.json'))) + + # case 3: score_mode='bbox_rle', nms_mode='soft_oks_nms' + metric_coco = CocoWholeBodyMetric( + 
ann_file=self.ann_file_coco, + outfile_prefix=f'{self.tmp_dir.name}/test3', + score_mode='bbox_rle', + nms_mode='soft_oks_nms') + metric_coco.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + + self.assertDictEqual(eval_results, self.target_coco) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test3.keypoints.json'))) + + # case 4: test without providing ann_file + metric_coco = CocoWholeBodyMetric( + outfile_prefix=f'{self.tmp_dir.name}/test4') + metric_coco.dataset_meta = self.dataset_meta_coco + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric_coco.process(data_batch, data_samples) + eval_results = metric_coco.evaluate(size=len(self.topdown_data_coco)) + self.assertDictEqual(eval_results, self.target_coco) + # test whether convert the annotation to COCO format + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test4.gt.json'))) + self.assertTrue( + osp.isfile(osp.join(self.tmp_dir.name, 'test4.keypoints.json'))) diff --git a/tests/test_evaluation/test_metrics/test_keypoint_2d_metrics.py b/tests/test_evaluation/test_metrics/test_keypoint_2d_metrics.py index ab2be5ed54..fdb029c40d 100644 --- a/tests/test_evaluation/test_metrics/test_keypoint_2d_metrics.py +++ b/tests/test_evaluation/test_metrics/test_keypoint_2d_metrics.py @@ -2,11 +2,11 @@ from unittest import TestCase import numpy as np -import torch from mmengine.structures import InstanceData from mmpose.datasets.datasets.utils import parse_pose_metainfo -from mmpose.evaluation.metrics import AUC, EPE, NME, PCKAccuracy +from mmpose.evaluation.metrics import (AUC, EPE, NME, JhmdbPCKAccuracy, + MpiiPCKAccuracy, PCKAccuracy) class TestPCKAccuracy(TestCase): @@ -34,7 +34,7 @@ def setUp(self): gt_instances.head_size = np.random.random((1, 1)) * 10 * i pred_instances = 
InstanceData() - pred_instances.keypoints = torch.from_numpy(keypoints) + pred_instances.keypoints = keypoints data = {'inputs': None} data_sample = { @@ -58,14 +58,14 @@ def test_evaluate(self): pck_metric = PCKAccuracy(thr=0.5, norm_item='bbox') pck_metric.process(self.data_batch, self.data_samples) pck = pck_metric.evaluate(self.batch_size) - target = {'pck/PCK@0.5': 1.0} + target = {'PCK': 1.0} self.assertDictEqual(pck, target) # test normalized by 'head_size' pckh_metric = PCKAccuracy(thr=0.3, norm_item='head') pckh_metric.process(self.data_batch, self.data_samples) pckh = pckh_metric.evaluate(self.batch_size) - target = {'pck/PCKh@0.3': 1.0} + target = {'PCKh': 1.0} self.assertDictEqual(pckh, target) # test normalized by 'torso_size' @@ -74,12 +74,153 @@ def test_evaluate(self): tpck = tpck_metric.evaluate(self.batch_size) self.assertIsInstance(tpck, dict) target = { - 'pck/PCK@0.05': 1.0, - 'pck/tPCK@0.05': 1.0, + 'PCK': 1.0, + 'tPCK': 1.0, } self.assertDictEqual(tpck, target) +class TestMpiiPCKAccuracy(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.batch_size = 8 + num_keypoints = 16 + self.data_batch = [] + self.data_samples = [] + + for i in range(self.batch_size): + gt_instances = InstanceData() + keypoints = np.zeros((1, num_keypoints, 2)) + keypoints[0, i] = [0.5 * i, 0.5 * i] + gt_instances.keypoints = keypoints + 1.0 + gt_instances.keypoints_visible = np.ones( + (1, num_keypoints, 1)).astype(bool) + gt_instances.keypoints_visible[0, (2 * i) % 8, 0] = False + gt_instances.bboxes = np.random.random((1, 4)) * 20 * i + gt_instances.head_size = np.random.random((1, 1)) * 10 * i + + pred_instances = InstanceData() + pred_instances.keypoints = keypoints + + data = {'inputs': None} + data_sample = { + 'gt_instances': gt_instances.to_dict(), + 'pred_instances': pred_instances.to_dict(), + } + + self.data_batch.append(data) + self.data_samples.append(data_sample) + + def test_init(self): + """test metric init method.""" + # test invalid normalized_items + with self.assertRaisesRegex( + KeyError, "Should be one of 'bbox', 'head', 'torso'"): + MpiiPCKAccuracy(norm_item='invalid') + + def test_evaluate(self): + """test PCK accuracy evaluation metric.""" + # test normalized by 'head_size' + mpii_pck_metric = MpiiPCKAccuracy(thr=0.3, norm_item='head') + mpii_pck_metric.process(self.data_batch, self.data_samples) + pck_results = mpii_pck_metric.evaluate(self.batch_size) + target = { + 'Head PCK': 100.0, + 'Shoulder PCK': 100.0, + 'Elbow PCK': 100.0, + 'Wrist PCK': 100.0, + 'Hip PCK': 100.0, + 'Knee PCK': 100.0, + 'Ankle PCK': 100.0, + 'PCK': 100.0, + 'PCK@0.1': 100.0, + } + self.assertDictEqual(pck_results, target) + + +class TestJhmdbPCKAccuracy(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.batch_size = 8 + num_keypoints = 15 + self.data_batch = [] + self.data_samples = [] + + for i in range(self.batch_size): + gt_instances = InstanceData() + keypoints = np.zeros((1, num_keypoints, 2)) + keypoints[0, i] = [0.5 * i, 0.5 * i] + gt_instances.keypoints = keypoints + gt_instances.keypoints_visible = np.ones( + (1, num_keypoints, 1)).astype(bool) + gt_instances.keypoints_visible[0, (2 * i) % 8, 0] = False + gt_instances.bboxes = np.random.random((1, 4)) * 20 * i + gt_instances.head_size = np.random.random((1, 1)) * 10 * i + + pred_instances = InstanceData() + pred_instances.keypoints = keypoints + + data = {'inputs': None} + data_sample = { + 'gt_instances': gt_instances.to_dict(), + 'pred_instances': pred_instances.to_dict(), + } + + self.data_batch.append(data) + self.data_samples.append(data_sample) + + def test_init(self): + """test metric init method.""" + # test invalid normalized_items + with self.assertRaisesRegex( + KeyError, "Should be one of 'bbox', 'head', 'torso'"): + JhmdbPCKAccuracy(norm_item='invalid') + + def test_evaluate(self): + """test PCK accuracy evaluation metric.""" + # test normalized by 'bbox_size' + jhmdb_pck_metric = JhmdbPCKAccuracy(thr=0.5, norm_item='bbox') + jhmdb_pck_metric.process(self.data_batch, self.data_samples) + pck_results = jhmdb_pck_metric.evaluate(self.batch_size) + target = { + 'Head PCK': 1.0, + 'Sho PCK': 1.0, + 'Elb PCK': 1.0, + 'Wri PCK': 1.0, + 'Hip PCK': 1.0, + 'Knee PCK': 1.0, + 'Ank PCK': 1.0, + 'PCK': 1.0, + } + self.assertDictEqual(pck_results, target) + + # test normalized by 'torso_size' + jhmdb_tpck_metric = JhmdbPCKAccuracy(thr=0.2, norm_item='torso') + jhmdb_tpck_metric.process(self.data_batch, self.data_samples) + tpck_results = jhmdb_tpck_metric.evaluate(self.batch_size) + target = { + 'Head tPCK': 1.0, + 'Sho tPCK': 1.0, + 'Elb tPCK': 1.0, + 'Wri tPCK': 1.0, + 'Hip tPCK': 1.0, + 'Knee 
tPCK': 1.0, + 'Ank tPCK': 1.0, + 'tPCK': 1.0, + } + self.assertDictEqual(tpck_results, target) + + class TestAUCandEPE(TestCase): def setUp(self): @@ -112,7 +253,7 @@ def setUp(self): [[True, True, False, True, True]]) pred_instances = InstanceData() - pred_instances.keypoints = torch.from_numpy(output) + pred_instances.keypoints = output data = {'inputs': None} data_sample = { @@ -128,7 +269,7 @@ def test_auc_evaluate(self): auc_metric = AUC(norm_factor=20, num_thrs=4) auc_metric.process(self.data_batch, self.data_samples) auc = auc_metric.evaluate(1) - target = {'auc/@4thrs': 0.375} + target = {'AUC': 0.375} self.assertDictEqual(auc, target) def test_epe_evaluate(self): @@ -136,7 +277,7 @@ def test_epe_evaluate(self): epe_metric = EPE() epe_metric.process(self.data_batch, self.data_samples) epe = epe_metric.evaluate(1) - self.assertAlmostEqual(epe['epe/epe'], 11.5355339) + self.assertAlmostEqual(epe['EPE'], 11.5355339) class TestNME(TestCase): @@ -161,7 +302,7 @@ def _generate_data(self, gt_instances[norm_item] = np.random.random((1, 1)) * 20 * i pred_instances = InstanceData() - pred_instances.keypoints = torch.from_numpy(keypoints) + pred_instances.keypoints = keypoints data = {'inputs': None} data_sample = { @@ -187,7 +328,7 @@ def test_nme_evaluate(self): batch_size=4, num_keypoints=19, norm_item=norm_item) nme_metric.process(data_batch, data_samples) nme = nme_metric.evaluate(4) - target = {'nme/@box_size': 0.0} + target = {'NME': 0.0} self.assertDictEqual(nme, target) # test when norm_mode = 'keypoint_distance' @@ -204,7 +345,7 @@ def test_nme_evaluate(self): nme_metric.process(data_batch, data_samples) nme = nme_metric.evaluate(4) - target = {'nme/@[0, 1]': 0.0} + target = {'NME': 0.0} self.assertDictEqual(nme, target) # test when norm_mode = 'keypoint_distance' @@ -221,7 +362,7 @@ def test_nme_evaluate(self): nme_metric.process(data_batch, data_samples) nme = nme_metric.evaluate(2) - target = {f'nme/@{keypoint_indices}': 0.0} + target = {'NME': 0.0} 
self.assertDictEqual(nme, target) def test_exceptions_and_warnings(self): diff --git a/tests/test_evaluation/test_metrics/test_keypoint_partition_metric.py b/tests/test_evaluation/test_metrics/test_keypoint_partition_metric.py new file mode 100644 index 0000000000..2b1a60c113 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_keypoint_partition_metric.py @@ -0,0 +1,525 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import tempfile +from collections import defaultdict +from unittest import TestCase + +import numpy as np +from mmengine.fileio import load +from mmengine.structures import InstanceData +from xtcocotools.coco import COCO + +from mmpose.datasets.datasets.utils import parse_pose_metainfo +from mmpose.evaluation.metrics import KeypointPartitionMetric + + +class TestKeypointPartitionMetricWrappingCocoMetric(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.tmp_dir = tempfile.TemporaryDirectory() + + self.ann_file_coco = \ + 'tests/data/coco/test_keypoint_partition_metric.json' + meta_info_coco = dict( + from_file='configs/_base_/datasets/coco_wholebody.py') + self.dataset_meta_coco = parse_pose_metainfo(meta_info_coco) + self.coco = COCO(self.ann_file_coco) + self.dataset_meta_coco['CLASSES'] = self.coco.loadCats( + self.coco.getCatIds()) + + self.topdown_data_coco = self._convert_ann_to_topdown_batch_data( + self.ann_file_coco) + assert len(self.topdown_data_coco) == 14 + self.bottomup_data_coco = self._convert_ann_to_bottomup_batch_data( + self.ann_file_coco) + assert len(self.bottomup_data_coco) == 4 + """ + The target results were obtained from CocoWholebodyMetric with + score_mode='bbox' and nms_mode='none'. 
We cannot compare other + combinations of score_mode and nms_mode because CocoWholebodyMetric + calculates scores and nms using all keypoints while + KeypointPartitionMetric calculates scores and nms part by part. + As long as this case is tested correct, the other cases should be + correct. + """ + self.target_bbox_none = { + 'body/coco/AP': 0.749, + 'body/coco/AR': 0.800, + 'foot/coco/AP': 0.840, + 'foot/coco/AR': 0.850, + 'face/coco/AP': 0.051, + 'face/coco/AR': 0.050, + 'left_hand/coco/AP': 0.283, + 'left_hand/coco/AR': 0.300, + 'right_hand/coco/AP': 0.383, + 'right_hand/coco/AR': 0.380, + 'all/coco/AP': 0.284, + 'all/coco/AR': 0.450, + } + + def _convert_ann_to_topdown_batch_data(self, ann_file): + """Convert annotations to topdown-style batch data.""" + topdown_data = [] + db = load(ann_file) + imgid2info = dict() + for img in db['images']: + imgid2info[img['id']] = img + for ann in db['annotations']: + w, h = ann['bbox'][2], ann['bbox'][3] + bboxes = np.array(ann['bbox'], dtype=np.float32).reshape(-1, 4) + bbox_scales = np.array([w * 1.25, h * 1.25]).reshape(-1, 2) + _keypoints = np.array(ann['keypoints']).reshape((1, -1, 3)) + + gt_instances = { + 'bbox_scales': bbox_scales, + 'bbox_scores': np.ones((1, ), dtype=np.float32), + 'bboxes': bboxes, + 'keypoints': _keypoints[..., :2], + 'keypoints_visible': _keypoints[..., 2:3] + } + + # fake predictions + keypoints = np.zeros_like(_keypoints) + keypoints[..., 0] = _keypoints[..., 0] * 0.99 + keypoints[..., 1] = _keypoints[..., 1] * 1.02 + keypoints[..., 2] = _keypoints[..., 2] * 0.8 + + pred_instances = { + 'keypoints': keypoints[..., :2], + 'keypoint_scores': keypoints[..., -1], + } + + data = {'inputs': None} + data_sample = { + 'id': ann['id'], + 'img_id': ann['image_id'], + 'category_id': ann.get('category_id', 1), + 'gt_instances': gt_instances, + 'pred_instances': pred_instances, + # dummy image_shape for testing + 'ori_shape': [640, 480], + # store the raw annotation info to test without ann_file + 
'raw_ann_info': copy.deepcopy(ann), + } + + # add crowd_index to data_sample if it is present in the image_info + if 'crowdIndex' in imgid2info[ann['image_id']]: + data_sample['crowd_index'] = imgid2info[ + ann['image_id']]['crowdIndex'] + # batch size = 1 + data_batch = [data] + data_samples = [data_sample] + topdown_data.append((data_batch, data_samples)) + + return topdown_data + + def _convert_ann_to_bottomup_batch_data(self, ann_file): + """Convert annotations to bottomup-style batch data.""" + img2ann = defaultdict(list) + db = load(ann_file) + for ann in db['annotations']: + img2ann[ann['image_id']].append(ann) + + bottomup_data = [] + for img_id, anns in img2ann.items(): + _keypoints = np.array([ann['keypoints'] for ann in anns]).reshape( + (len(anns), -1, 3)) + + gt_instances = { + 'bbox_scores': np.ones((len(anns)), dtype=np.float32), + 'keypoints': _keypoints[..., :2], + 'keypoints_visible': _keypoints[..., 2:3] + } + + # fake predictions + keypoints = np.zeros_like(_keypoints) + keypoints[..., 0] = _keypoints[..., 0] * 0.99 + keypoints[..., 1] = _keypoints[..., 1] * 1.02 + keypoints[..., 2] = _keypoints[..., 2] * 0.8 + + pred_instances = { + 'keypoints': keypoints[..., :2], + 'keypoint_scores': keypoints[..., -1], + } + + data = {'inputs': None} + data_sample = { + 'id': [ann['id'] for ann in anns], + 'img_id': img_id, + 'gt_instances': gt_instances, + 'pred_instances': pred_instances, + # dummy image_shape for testing + 'ori_shape': [640, 480], + 'raw_ann_info': copy.deepcopy(anns), + } + + # batch size = 1 + data_batch = [data] + data_samples = [data_sample] + bottomup_data.append((data_batch, data_samples)) + return bottomup_data + + def _assert_outfiles(self, prefix): + for part in ['body', 'foot', 'face', 'left_hand', 'right_hand', 'all']: + self.assertTrue( + osp.isfile( + osp.join(self.tmp_dir.name, + f'{prefix}.{part}.keypoints.json'))) + + def tearDown(self): + self.tmp_dir.cleanup() + + def test_init(self): + """test metric init method.""" + # 
test wrong metric type + with self.assertRaisesRegex( + ValueError, 'Metrics supported by KeypointPartitionMetric'): + _ = KeypointPartitionMetric( + metric=dict(type='Metric'), partitions=dict(all=range(133))) + + # test ann_file arg warning + with self.assertWarnsRegex(UserWarning, + 'does not support the ann_file argument'): + _ = KeypointPartitionMetric( + metric=dict(type='CocoMetric', ann_file=''), + partitions=dict(all=range(133))) + + # test score_mode arg warning + with self.assertWarnsRegex(UserWarning, "if score_mode is not 'bbox'"): + _ = KeypointPartitionMetric( + metric=dict(type='CocoMetric'), + partitions=dict(all=range(133))) + + # test nms arg warning + with self.assertWarnsRegex(UserWarning, 'oks_nms and soft_oks_nms'): + _ = KeypointPartitionMetric( + metric=dict(type='CocoMetric'), + partitions=dict(all=range(133))) + + # test partitions + with self.assertRaisesRegex(AssertionError, 'at least one partition'): + _ = KeypointPartitionMetric( + metric=dict(type='CocoMetric'), partitions=dict()) + + with self.assertRaisesRegex(AssertionError, 'should be a sequence'): + _ = KeypointPartitionMetric( + metric=dict(type='CocoMetric'), partitions=dict(all={})) + + with self.assertRaisesRegex(AssertionError, 'at least one element'): + _ = KeypointPartitionMetric( + metric=dict(type='CocoMetric'), partitions=dict(all=[])) + + def test_bottomup_evaluate(self): + """test bottomup-style COCO metric evaluation.""" + # case1: score_mode='bbox', nms_mode='none' + metric = KeypointPartitionMetric( + metric=dict( + type='CocoMetric', + outfile_prefix=f'{self.tmp_dir.name}/test_bottomup', + score_mode='bbox', + nms_mode='none'), + partitions=dict( + body=range(17), + foot=range(17, 23), + face=range(23, 91), + left_hand=range(91, 112), + right_hand=range(112, 133), + all=range(133))) + metric.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.bottomup_data_coco: + metric.process(data_batch, data_samples) + + 
eval_results = metric.evaluate(size=len(self.bottomup_data_coco)) + for key in self.target_bbox_none.keys(): + self.assertAlmostEqual( + eval_results[key], self.target_bbox_none[key], places=3) + self._assert_outfiles('test_bottomup') + + def test_topdown_evaluate(self): + """test topdown-style COCO metric evaluation.""" + # case 1: score_mode='bbox', nms_mode='none' + metric = KeypointPartitionMetric( + metric=dict( + type='CocoMetric', + outfile_prefix=f'{self.tmp_dir.name}/test_topdown1', + score_mode='bbox', + nms_mode='none'), + partitions=dict( + body=range(17), + foot=range(17, 23), + face=range(23, 91), + left_hand=range(91, 112), + right_hand=range(112, 133), + all=range(133))) + metric.dataset_meta = self.dataset_meta_coco + + # process samples + for data_batch, data_samples in self.topdown_data_coco: + metric.process(data_batch, data_samples) + + eval_results = metric.evaluate(size=len(self.topdown_data_coco)) + for key in self.target_bbox_none.keys(): + self.assertAlmostEqual( + eval_results[key], self.target_bbox_none[key], places=3) + self._assert_outfiles('test_topdown1') + + +class TestKeypointPartitionMetricWrappingPCKAccuracy(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.batch_size = 8 + num_keypoints = 24 + self.data_batch = [] + self.data_samples = [] + + for i in range(self.batch_size): + gt_instances = InstanceData() + keypoints = np.zeros((1, num_keypoints, 2)) + for j in range(num_keypoints): + keypoints[0, j] = [0.5 * i * j, 0.5 * i * j] + gt_instances.keypoints = keypoints + gt_instances.keypoints_visible = np.ones( + (1, num_keypoints, 1)).astype(bool) + gt_instances.keypoints_visible[0, (2 * i) % 8, 0] = False + gt_instances.bboxes = np.array([[0.1, 0.2, 0.3, 0.4]]) * 20 * i + gt_instances.head_size = np.array([[0.1]]) * 10 * i + + pred_instances = InstanceData() + # fake predictions + _keypoints = np.zeros_like(keypoints) + _keypoints[0, :, 0] = keypoints[0, :, 0] * 0.95 + _keypoints[0, :, 1] = keypoints[0, :, 1] * 1.05 + pred_instances.keypoints = _keypoints + + data = {'inputs': None} + data_sample = { + 'gt_instances': gt_instances.to_dict(), + 'pred_instances': pred_instances.to_dict(), + } + + self.data_batch.append(data) + self.data_samples.append(data_sample) + + def test_init(self): + # test norm_item arg warning + with self.assertWarnsRegex(UserWarning, + 'norm_item torso is used in JhmdbDataset'): + _ = KeypointPartitionMetric( + metric=dict( + type='PCKAccuracy', thr=0.05, norm_item=['bbox', 'torso']), + partitions=dict(all=range(133))) + + def test_evaluate(self): + """test PCK accuracy evaluation metric.""" + # test normalized by 'bbox' + pck_metric = KeypointPartitionMetric( + metric=dict(type='PCKAccuracy', thr=0.5, norm_item='bbox'), + partitions=dict( + p1=range(10), + p2=range(10, 24), + all=range(24), + )) + pck_metric.process(self.data_batch, self.data_samples) + pck = pck_metric.evaluate(self.batch_size) + target = {'p1/PCK': 1.0, 'p2/PCK': 1.0, 'all/PCK': 1.0} + self.assertDictEqual(pck, target) + + # test normalized by 'head_size' + pckh_metric = KeypointPartitionMetric( + 
metric=dict(type='PCKAccuracy', thr=0.3, norm_item='head'), + partitions=dict( + p1=range(10), + p2=range(10, 24), + all=range(24), + )) + pckh_metric.process(self.data_batch, self.data_samples) + pckh = pckh_metric.evaluate(self.batch_size) + target = {'p1/PCKh': 0.9, 'p2/PCKh': 0.0, 'all/PCKh': 0.375} + self.assertDictEqual(pckh, target) + + # test normalized by 'torso_size' + tpck_metric = KeypointPartitionMetric( + metric=dict( + type='PCKAccuracy', thr=0.05, norm_item=['bbox', 'torso']), + partitions=dict( + p1=range(10), + p2=range(10, 24), + all=range(24), + )) + tpck_metric.process(self.data_batch, self.data_samples) + tpck = tpck_metric.evaluate(self.batch_size) + self.assertIsInstance(tpck, dict) + target = { + 'p1/PCK': 0.6, + 'p1/tPCK': 0.11428571428571428, + 'p2/PCK': 0.0, + 'p2/tPCK': 0.0, + 'all/PCK': 0.25, + 'all/tPCK': 0.047619047619047616 + } + self.assertDictEqual(tpck, target) + + +class TestKeypointPartitionMetricWrappingAUCandEPE(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + output = np.zeros((1, 5, 2)) + target = np.zeros((1, 5, 2)) + # first channel + output[0, 0] = [10, 4] + target[0, 0] = [10, 0] + # second channel + output[0, 1] = [10, 18] + target[0, 1] = [10, 10] + # third channel + output[0, 2] = [0, 0] + target[0, 2] = [0, -1] + # fourth channel + output[0, 3] = [40, 40] + target[0, 3] = [30, 30] + # fifth channel + output[0, 4] = [20, 10] + target[0, 4] = [0, 10] + + gt_instances = InstanceData() + gt_instances.keypoints = target + gt_instances.keypoints_visible = np.array( + [[True, True, False, True, True]]) + + pred_instances = InstanceData() + pred_instances.keypoints = output + + data = {'inputs': None} + data_sample = { + 'gt_instances': gt_instances.to_dict(), + 'pred_instances': pred_instances.to_dict() + } + + self.data_batch = [data] + self.data_samples = [data_sample] + + def test_auc_evaluate(self): + """test AUC evaluation metric.""" + auc_metric = KeypointPartitionMetric( + metric=dict(type='AUC', norm_factor=20, num_thrs=4), + partitions=dict( + p1=range(3), + p2=range(3, 5), + all=range(5), + )) + auc_metric.process(self.data_batch, self.data_samples) + auc = auc_metric.evaluate(1) + target = {'p1/AUC': 0.625, 'p2/AUC': 0.125, 'all/AUC': 0.375} + self.assertDictEqual(auc, target) + + def test_epe_evaluate(self): + """test EPE evaluation metric.""" + epe_metric = KeypointPartitionMetric( + metric=dict(type='EPE', ), + partitions=dict( + p1=range(3), + p2=range(3, 5), + all=range(5), + )) + epe_metric.process(self.data_batch, self.data_samples) + epe = epe_metric.evaluate(1) + target = { + 'p1/EPE': 6.0, + 'p2/EPE': 17.071067810058594, + 'all/EPE': 11.535533905029297 + } + self.assertDictEqual(epe, target) + + +class TestKeypointPartitionMetricWrappingNME(TestCase): + + def setUp(self): + """Setup some variables which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.batch_size = 4 + num_keypoints = 19 + self.data_batch = [] + self.data_samples = [] + + for i in range(self.batch_size): + gt_instances = InstanceData() + keypoints = np.zeros((1, num_keypoints, 2)) + for j in range(num_keypoints): + keypoints[0, j] = [0.5 * i * j, 0.5 * i * j] + gt_instances.keypoints = keypoints + gt_instances.keypoints_visible = np.ones( + (1, num_keypoints, 1)).astype(bool) + gt_instances.keypoints_visible[0, (2 * i) % self.batch_size, + 0] = False + gt_instances['box_size'] = np.array([[0.1]]) * 10 * i + + pred_instances = InstanceData() + # fake predictions + _keypoints = np.zeros_like(keypoints) + _keypoints[0, :, 0] = keypoints[0, :, 0] * 0.95 + _keypoints[0, :, 1] = keypoints[0, :, 1] * 1.05 + pred_instances.keypoints = _keypoints + + data = {'inputs': None} + data_sample = { + 'gt_instances': gt_instances.to_dict(), + 'pred_instances': pred_instances.to_dict(), + } + + self.data_batch.append(data) + self.data_samples.append(data_sample) + + def test_init(self): + # test norm_mode arg missing + with self.assertRaisesRegex(AssertionError, 'Missing norm_mode'): + _ = KeypointPartitionMetric( + metric=dict(type='NME', ), partitions=dict(all=range(133))) + + # test norm_mode = keypoint_distance + with self.assertRaisesRegex(ValueError, + "NME norm_mode 'keypoint_distance'"): + _ = KeypointPartitionMetric( + metric=dict(type='NME', norm_mode='keypoint_distance'), + partitions=dict(all=range(133))) + + def test_nme_evaluate(self): + """test NME evaluation metric.""" + # test when norm_mode = 'use_norm_item' + # test norm_item = 'box_size' like in `AFLWDataset` + nme_metric = KeypointPartitionMetric( + metric=dict( + type='NME', norm_mode='use_norm_item', norm_item='box_size'), + partitions=dict( + p1=range(10), + p2=range(10, 19), + all=range(19), + )) + nme_metric.process(self.data_batch, self.data_samples) + nme = 
nme_metric.evaluate(4) + target = { + 'p1/NME': 0.1715388651247378, + 'p2/NME': 0.4949747721354167, + 'all/NME': 0.333256827460395 + } + self.assertDictEqual(nme, target) diff --git a/tests/test_evaluation/test_metrics/test_posetrack18_metric.py b/tests/test_evaluation/test_metrics/test_posetrack18_metric.py index e7c0c829d4..fe44047e31 100644 --- a/tests/test_evaluation/test_metrics/test_posetrack18_metric.py +++ b/tests/test_evaluation/test_metrics/test_posetrack18_metric.py @@ -41,7 +41,7 @@ def setUp(self): 'posetrack18/Hip AP': 100.0, 'posetrack18/Knee AP': 100.0, 'posetrack18/Ankl AP': 100.0, - 'posetrack18/Total AP': 100.0, + 'posetrack18/AP': 100.0, } def _convert_ann_to_topdown_batch_data(self): @@ -118,8 +118,7 @@ def test_init(self): # test score_mode option with self.assertRaisesRegex(ValueError, '`score_mode` should be one of'): - _ = PoseTrack18Metric( - ann_file=self.ann_file, score_mode='keypoint') + _ = PoseTrack18Metric(ann_file=self.ann_file, score_mode='invalid') # test nms_mode option with self.assertRaisesRegex(ValueError, '`nms_mode` should be one of'): diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_ae_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_ae_head.py new file mode 100644 index 0000000000..c59c48f8bf --- /dev/null +++ b/tests/test_models/test_heads/test_heatmap_heads/test_ae_head.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from itertools import product +from unittest import TestCase + +import numpy as np +import torch +from mmengine.structures import InstanceData, PixelData +from mmengine.utils import is_tuple_of + +from mmpose.codecs import AssociativeEmbedding # noqa +from mmpose.models.heads import AssociativeEmbeddingHead +from mmpose.registry import KEYPOINT_CODECS +from mmpose.testing._utils import get_packed_inputs + + +class TestAssociativeEmbeddingHead(TestCase): + + def _get_tags(self, heatmaps, keypoint_indices, tag_per_keypoint: bool): + + K, H, W = heatmaps.shape + N = keypoint_indices.shape[0] + + if tag_per_keypoint: + tags = np.zeros((K, H, W), dtype=np.float32) + else: + tags = np.zeros((1, H, W), dtype=np.float32) + + for n, k in product(range(N), range(K)): + y, x = np.unravel_index(keypoint_indices[n, k, 0], (H, W)) + if tag_per_keypoint: + tags[k, y, x] = n + else: + tags[0, y, x] = n + + return tags + + def test_forward(self): + + head = AssociativeEmbeddingHead( + in_channels=32, + num_keypoints=17, + tag_dim=1, + tag_per_keypoint=True, + deconv_out_channels=None) + + feats = [torch.rand(1, 32, 64, 64)] + output = head.forward(feats) # should be (heatmaps, tags) + self.assertTrue(is_tuple_of(output, torch.Tensor)) + self.assertEqual(output[0].shape, (1, 17, 64, 64)) + self.assertEqual(output[1].shape, (1, 17, 64, 64)) + + def test_predict(self): + + codec_cfg = dict( + type='AssociativeEmbedding', + input_size=(256, 256), + heatmap_size=(64, 64), + use_udp=False, + decode_keypoint_order=[ + 0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16 + ]) + + # get test data + codec = KEYPOINT_CODECS.build(codec_cfg) + batch_data_samples = get_packed_inputs( + 1, + input_size=(256, 256), + heatmap_size=(64, 64), + img_shape=(256, 256))['data_samples'] + + keypoints = batch_data_samples[0].gt_instances['keypoints'] + keypoints_visible = batch_data_samples[0].gt_instances[ + 'keypoints_visible'] + + encoded = codec.encode(keypoints, 
keypoints_visible) + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] + + tags = self._get_tags( + heatmaps, keypoint_indices, tag_per_keypoint=True) + + dummy_feat = np.concatenate((heatmaps, tags), axis=0) + feats = [torch.from_numpy(dummy_feat)[None]] + + head = AssociativeEmbeddingHead( + in_channels=34, + num_keypoints=17, + tag_dim=1, + tag_per_keypoint=True, + deconv_out_channels=None, + has_final_layer=False, + decoder=codec_cfg) + + preds = head.predict(feats, batch_data_samples) + self.assertTrue(np.allclose(preds[0].keypoints, keypoints, atol=4.0)) + + def test_loss(self): + + codec_cfg = dict( + type='AssociativeEmbedding', + input_size=(256, 256), + heatmap_size=(64, 64), + use_udp=False, + decode_keypoint_order=[ + 0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16 + ]) + + # get test data + codec = KEYPOINT_CODECS.build(codec_cfg) + + batch_data_samples = get_packed_inputs( + 1, + input_size=(256, 256), + heatmap_size=(64, 64), + img_shape=(256, 256))['data_samples'] + + keypoints = batch_data_samples[0].gt_instances['keypoints'] + keypoints_visible = batch_data_samples[0].gt_instances[ + 'keypoints_visible'] + encoded = codec.encode(keypoints, keypoints_visible) + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] + keypoint_weights = encoded['keypoint_weights'] + + heatmap_mask = np.ones((1, ) + heatmaps.shape[1:], dtype=np.float32) + batch_data_samples[0].gt_fields = PixelData( + heatmaps=heatmaps, heatmap_mask=heatmap_mask).to_tensor() + batch_data_samples[0].gt_instance_labels = InstanceData( + keypoint_indices=keypoint_indices, + keypoint_weights=keypoint_weights).to_tensor() + + feats = [torch.rand(1, 32, 64, 64)] + head = AssociativeEmbeddingHead( + in_channels=32, + num_keypoints=17, + tag_dim=1, + tag_per_keypoint=True, + deconv_out_channels=None) + + losses = head.loss(feats, batch_data_samples) + for name in ['loss_kpt', 'loss_pull', 'loss_push']: + self.assertIn(name, 
losses) + self.assertIsInstance(losses[name], torch.Tensor) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_cid_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_cid_head.py new file mode 100644 index 0000000000..29fd9e57eb --- /dev/null +++ b/tests/test_models/test_heads/test_heatmap_heads/test_cid_head.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple +from unittest import TestCase + +import numpy as np +import torch + +from mmpose.models.heads import CIDHead +from mmpose.testing import get_coco_sample, get_packed_inputs +from mmpose.utils.tensor_utils import to_tensor + + +class TestCIDHead(TestCase): + + def _get_feats( + self, + batch_size: int = 1, + feat_shapes: List[Tuple[int, int, int]] = [(32, 128, 128)], + ): + + feats = [ + torch.rand((batch_size, ) + shape, dtype=torch.float32) + for shape in feat_shapes + ] + + if len(feats) > 1: + feats = [[x] for x in feats] + + return feats + + def _get_data_samples(self): + data_samples = get_packed_inputs( + 1, + input_size=(512, 512), + heatmap_size=(128, 128), + img_shape=(512, 512))['data_samples'] + return data_samples + + def test_forward(self): + + head = CIDHead(in_channels=32, num_keypoints=17, gfd_channels=32) + + feats = [torch.rand(1, 32, 128, 128)] + with torch.no_grad(): + output = head.forward(feats) + self.assertIsInstance(output, torch.Tensor) + self.assertEqual(output.shape[1:], (17, 128, 128)) + + def test_predict(self): + + codec = dict( + type='DecoupledHeatmap', + input_size=(512, 512), + heatmap_size=(128, 128)) + + head = CIDHead( + in_channels=32, num_keypoints=17, gfd_channels=32, decoder=codec) + + feats = self._get_feats() + data_samples = self._get_data_samples() + with torch.no_grad(): + preds = head.predict(feats, data_samples) + self.assertEqual(len(preds), 1) + self.assertEqual(preds[0].keypoints.shape[1:], (17, 2)) + 
self.assertEqual(preds[0].keypoint_scores.shape[1:], (17, )) + + # tta + with torch.no_grad(): + feats_flip = self._get_feats(feat_shapes=[(32, 128, + 128), (32, 128, 128)]) + preds = head.predict(feats_flip, data_samples, + dict(flip_test=True)) + self.assertEqual(len(preds), 1) + self.assertEqual(preds[0].keypoints.shape[1:], (17, 2)) + self.assertEqual(preds[0].keypoint_scores.shape[1:], (17, )) + + # output heatmaps + with torch.no_grad(): + _, pred_fields = head.predict(feats, data_samples, + dict(output_heatmaps=True)) + self.assertEqual(len(pred_fields), 1) + self.assertEqual(pred_fields[0].heatmaps.shape[1:], (128, 128)) + self.assertEqual(pred_fields[0].heatmaps.shape[0] % 17, 0) + + def test_loss(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + data['bbox'] = np.tile(data['bbox'], 2).reshape(-1, 4, 2) + data['bbox'][:, 1:3, 0] = data['bbox'][:, 0:2, 0] + + codec_cfg = dict( + type='DecoupledHeatmap', + input_size=(512, 512), + heatmap_size=(128, 128)) + + head = CIDHead( + in_channels=32, + num_keypoints=17, + gfd_channels=32, + decoder=codec_cfg, + coupled_heatmap_loss=dict( + type='FocalHeatmapLoss', loss_weight=1.0), + decoupled_heatmap_loss=dict( + type='FocalHeatmapLoss', loss_weight=4.0), + contrastive_loss=dict(type='InfoNCELoss', loss_weight=1.0)) + + encoded = head.decoder.encode(data['keypoints'], + data['keypoints_visible'], data['bbox']) + feats = self._get_feats() + data_samples = self._get_data_samples() + for data_sample in data_samples: + data_sample.gt_fields.set_data({ + 'heatmaps': + to_tensor(encoded['heatmaps']), + 'instance_heatmaps': + to_tensor(encoded['instance_heatmaps']) + }) + data_sample.gt_instance_labels.set_data( + {'instance_coords': to_tensor(encoded['instance_coords'])}) + data_sample.gt_instance_labels.set_data( + {'keypoint_weights': to_tensor(encoded['keypoint_weights'])}) + + losses = head.loss(feats, data_samples) + self.assertIn('loss/heatmap_coupled', losses) + 
self.assertEqual(losses['loss/heatmap_coupled'].ndim, 0) + self.assertIn('loss/heatmap_decoupled', losses) + self.assertEqual(losses['loss/heatmap_decoupled'].ndim, 0) + self.assertIn('loss/contrastive', losses) + self.assertEqual(losses['loss/contrastive'].ndim, 0) diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_cpm_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_cpm_head.py index 02802d0c53..7b0c958b64 100644 --- a/tests/test_models/test_heads/test_heatmap_heads/test_cpm_head.py +++ b/tests/test_models/test_heads/test_heatmap_heads/test_cpm_head.py @@ -23,17 +23,15 @@ def _get_feats(self, return feats def _get_data_samples(self, batch_size: int = 2): - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size=batch_size, - num_instances=1, - num_keypoints=17, - img_shape=(128, 128), - input_size=(192, 256), - heatmap_size=(24, 32), - with_heatmap=True, - with_reg_label=False) - ] + batch_data_samples = get_packed_inputs( + batch_size=batch_size, + num_instances=1, + num_keypoints=17, + img_shape=(128, 128), + input_size=(192, 256), + heatmap_size=(24, 32), + with_heatmap=True, + with_reg_label=False)['data_samples'] return batch_data_samples def test_init(self): @@ -223,7 +221,7 @@ def test_loss(self): def test_errors(self): # Invalid arguments - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = CPMHead( num_stages=1, in_channels=17, diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_heatmap_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_heatmap_head.py index 93bcecf944..e9848dd16a 100644 --- a/tests/test_models/test_heads/test_heatmap_heads/test_heatmap_head.py +++ b/tests/test_models/test_heads/test_heatmap_heads/test_heatmap_head.py @@ -23,12 +23,6 @@ def _get_feats(self, ] return feats - def _get_data_samples(self, batch_size: int = 2): - batch_data_samples = [ - 
inputs['data_sample'] for inputs in get_packed_inputs(batch_size) - ] - return batch_data_samples - def test_init(self): # w/o deconv head = HeatmapHead( @@ -87,7 +81,7 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -108,7 +102,7 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -116,7 +110,31 @@ def test_predict(self): self.assertEqual(preds[0].keypoints.shape, batch_data_samples[0].gt_instances.keypoints.shape) - # input transform: output heatmap + # input transform: none + head = HeatmapHead( + in_channels=[16, 32], + out_channels=17, + input_transform='resize_concat', + input_index=[0, 1], + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + conv_out_channels=(256, ), + conv_kernel_sizes=(1, ), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(48, 16, 12)])[0] + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] + with self.assertWarnsRegex( + Warning, + 'the input of HeatmapHead is a tensor instead of a tuple ' + 'or list. 
The argument `input_transform` will be ignored.'): + preds = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual(preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # output heatmap head = HeatmapHead( in_channels=[16, 32], out_channels=17, @@ -125,7 +143,7 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] _, pred_heatmaps = head.predict( feats, batch_data_samples, test_cfg=dict(output_heatmaps=True)) @@ -149,7 +167,7 @@ def test_tta(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict( @@ -177,7 +195,7 @@ def test_tta(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict( @@ -200,7 +218,7 @@ def test_loss(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) self.assertEqual(losses['loss_kpt'].shape, torch.Size(())) @@ -208,14 +226,14 @@ def test_loss(self): def test_errors(self): # Invalid arguments - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with 
self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = HeatmapHead( in_channels=[16, 32], out_channels=17, deconv_out_channels=(256, ), deconv_kernel_sizes=(4, 4)) - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = HeatmapHead( in_channels=[16, 32], out_channels=17, diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_mspn_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_mspn_head.py index cff61b842b..ce3d19b688 100644 --- a/tests/test_models/test_heads/test_heatmap_heads/test_mspn_head.py +++ b/tests/test_models/test_heads/test_heatmap_heads/test_mspn_head.py @@ -34,18 +34,16 @@ def _get_data_samples(self, batch_size: int = 2, heatmap_size=(48, 64), num_levels=1): - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size=batch_size, - num_instances=1, - num_keypoints=17, - img_shape=(128, 128), - input_size=(192, 256), - heatmap_size=heatmap_size, - with_heatmap=True, - with_reg_label=False, - num_levels=num_levels) - ] + batch_data_samples = get_packed_inputs( + batch_size=batch_size, + num_instances=1, + num_keypoints=17, + img_shape=(128, 128), + input_size=(192, 256), + heatmap_size=heatmap_size, + with_heatmap=True, + with_reg_label=False, + num_levels=num_levels)['data_samples'] return batch_data_samples def test_init(self): diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_rtmcc_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_rtmcc_head.py new file mode 100644 index 0000000000..b7f833d362 --- /dev/null +++ b/tests/test_models/test_heads/test_heatmap_heads/test_rtmcc_head.py @@ -0,0 +1,722 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from typing import List, Tuple +from unittest import TestCase + +import torch +import torch.nn as nn +from mmengine.structures import InstanceData +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION + +from mmpose.models.heads import RTMCCHead +from mmpose.models.utils import RTMCCBlock +from mmpose.testing import get_packed_inputs + + +class TestRTMCCHead(TestCase): + + def _get_feats(self, + batch_size: int = 2, + feat_shapes: List[Tuple[int, int, int]] = [(32, 6, 8)]): + + feats = [ + torch.rand((batch_size, ) + shape, dtype=torch.float32) + for shape in feat_shapes + ] + return feats + + def test_init(self): + + if digit_version(TORCH_VERSION) < digit_version('1.7.0'): + return unittest.skip('RTMCCHead requires PyTorch >= 1.7') + + # original version + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False)) + self.assertIsNotNone(head.decoder) + self.assertTrue(isinstance(head.final_layer, nn.Conv2d)) + self.assertTrue(isinstance(head.mlp, nn.Sequential)) + self.assertTrue(isinstance(head.gau, RTMCCBlock)) + self.assertTrue(isinstance(head.cls_x, nn.Linear)) + self.assertTrue(isinstance(head.cls_y, nn.Linear)) + + # w/ 1x1 conv + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=1, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=dict( + type='SimCCLabel', + 
input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False)) + self.assertIsNotNone(head.decoder) + self.assertTrue(isinstance(head.final_layer, nn.Conv2d)) + self.assertTrue(isinstance(head.mlp, nn.Sequential)) + self.assertTrue(isinstance(head.gau, RTMCCBlock)) + self.assertTrue(isinstance(head.cls_x, nn.Linear)) + self.assertTrue(isinstance(head.cls_y, nn.Linear)) + + # hidden_dims + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=512, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False)) + self.assertIsNotNone(head.decoder) + self.assertTrue(isinstance(head.final_layer, nn.Conv2d)) + self.assertTrue(isinstance(head.mlp, nn.Sequential)) + self.assertTrue(isinstance(head.gau, RTMCCBlock)) + self.assertTrue(isinstance(head.cls_x, nn.Linear)) + self.assertTrue(isinstance(head.cls_y, nn.Linear)) + + # s = 256 + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=256, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False)) + self.assertIsNotNone(head.decoder) + self.assertTrue(isinstance(head.final_layer, nn.Conv2d)) + self.assertTrue(isinstance(head.mlp, nn.Sequential)) + self.assertTrue(isinstance(head.gau, RTMCCBlock)) + self.assertTrue(isinstance(head.cls_x, nn.Linear)) + 
self.assertTrue(isinstance(head.cls_y, nn.Linear)) + + def test_predict(self): + + if digit_version(TORCH_VERSION) < digit_version('1.7.0'): + return unittest.skip('RTMCCHead requires PyTorch >= 1.7') + + decoder_cfg_list = [] + # original version + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False) + decoder_cfg_list.append(decoder_cfg) + + # single sigma + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=6., + simcc_split_ratio=2.0, + normalize=False) + decoder_cfg_list.append(decoder_cfg) + + # normalize + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=6., + simcc_split_ratio=2.0, + normalize=True) + decoder_cfg_list.append(decoder_cfg) + + # dark + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=6., + simcc_split_ratio=2.0, + use_dark=True) + decoder_cfg_list.append(decoder_cfg) + + for decoder_cfg in decoder_cfg_list: + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # 1x1 conv + head = RTMCCHead( + in_channels=32, + out_channels=17, + 
input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=1, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + # hidden dims + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=512, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # s + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=64, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, 
batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # expansion factor + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=3, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # drop path + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0.1, + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # act fn + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, 
+ final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='ReLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # use_rel_bias + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=True, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # pos_enc + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=True), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + 
simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, _ = head.predict(feats, batch_data_samples) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + # output_heatmaps + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg, + ) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, pred_heatmaps = head.predict( + feats, batch_data_samples, test_cfg=dict(output_heatmaps=True)) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual(preds[0].keypoint_x_labels.shape, (1, 17, 384)) + self.assertEqual(preds[0].keypoint_y_labels.shape, (1, 17, 512)) + self.assertEqual( + preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + self.assertEqual(pred_heatmaps[0].heatmaps.shape, (17, 512, 384)) + + def test_tta(self): + if digit_version(TORCH_VERSION) < digit_version('1.7.0'): + return unittest.skip('RTMCCHead requires PyTorch >= 1.7') + + # flip test + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False) + + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + 
drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + decoder=decoder_cfg) + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, simcc_split_ratio=2.0, + with_simcc_label=True)['data_samples'] + preds = head.predict([feats, feats], + batch_data_samples, + test_cfg=dict(flip_test=True)) + + self.assertTrue(len(preds), 2) + self.assertIsInstance(preds[0], InstanceData) + self.assertEqual(preds[0].keypoints.shape, + batch_data_samples[0].gt_instances.keypoints.shape) + + def test_loss(self): + if digit_version(TORCH_VERSION) < digit_version('1.7.0'): + return unittest.skip('RTMCCHead requires PyTorch >= 1.7') + + decoder_cfg_list = [] + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False) + decoder_cfg_list.append(decoder_cfg) + + decoder_cfg = dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=True) + decoder_cfg_list.append(decoder_cfg) + + # decoder + for decoder_cfg in decoder_cfg_list: + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=1., + label_softmax=False, + ), + decoder=decoder_cfg) + + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, simcc_split_ratio=2.0, + with_simcc_label=True)['data_samples'] + losses = head.loss(feats, batch_data_samples) + self.assertIsInstance(losses['loss_kpt'], torch.Tensor) + self.assertEqual(losses['loss_kpt'].shape, torch.Size(())) + 
self.assertIsInstance(losses['acc_pose'], torch.Tensor) + + # beta = 10 + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=False, + ), + decoder=decoder_cfg) + + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, simcc_split_ratio=2.0, + with_simcc_label=True)['data_samples'] + losses = head.loss(feats, batch_data_samples) + self.assertIsInstance(losses['loss_kpt'], torch.Tensor) + self.assertEqual(losses['loss_kpt'].shape, torch.Size(())) + self.assertIsInstance(losses['acc_pose'], torch.Tensor) + + # label softmax + head = RTMCCHead( + in_channels=32, + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True, + ), + decoder=decoder_cfg) + + feats = self._get_feats(batch_size=2, feat_shapes=[(32, 8, 6)]) + batch_data_samples = get_packed_inputs( + batch_size=2, simcc_split_ratio=2.0, + with_simcc_label=True)['data_samples'] + losses = head.loss(feats, batch_data_samples) + self.assertIsInstance(losses['loss_kpt'], torch.Tensor) + self.assertEqual(losses['loss_kpt'].shape, torch.Size(())) + self.assertIsInstance(losses['acc_pose'], torch.Tensor) + + def test_errors(self): + if digit_version(TORCH_VERSION) < digit_version('1.7.0'): + return unittest.skip('RTMCCHead requires PyTorch >= 1.7') + + # Invalid arguments + with 
self.assertRaisesRegex(ValueError, 'multiple input features'): + _ = RTMCCHead( + in_channels=(16, 32), + out_channels=17, + input_size=(192, 256), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, + final_layer_kernel_size=7, + gau_cfg=dict( + hidden_dims=256, + s=128, + expansion_factor=2, + dropout_rate=0., + drop_path=0., + act_fn='SiLU', + use_rel_bias=False, + pos_enc=False), + input_transform='select', + input_index=[0, 1], + loss=dict( + type='KLDiscretLoss', + use_target_weight=True, + beta=10., + label_softmax=True, + ), + decoder=dict( + type='SimCCLabel', + input_size=(192, 256), + smoothing_type='gaussian', + sigma=(4.9, 5.66), + simcc_split_ratio=2.0, + normalize=False)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_simcc_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_simcc_head.py index 20af073e3f..4b332f4663 100644 --- a/tests/test_models/test_heads/test_heatmap_heads/test_simcc_head.py +++ b/tests/test_models/test_heads/test_heatmap_heads/test_simcc_head.py @@ -22,20 +22,6 @@ def _get_feats(self, ] return feats - def _get_data_samples(self, - batch_size: int = 2, - simcc_split_ratio: float = 2.0, - with_simcc_label=False): - - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size, - simcc_split_ratio=simcc_split_ratio, - with_simcc_label=with_simcc_label) - ] - - return batch_data_samples - def test_init(self): # w/ gaussian decoder @@ -43,7 +29,8 @@ def test_init(self): in_channels=32, out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, decoder=dict( type='SimCCLabel', input_size=(192, 256), @@ -57,7 +44,8 @@ def test_init(self): in_channels=32, out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=3.0, decoder=dict( type='SimCCLabel', input_size=(192, 256), @@ -72,7 +60,8 @@ def 
test_init(self): in_channels=32, out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=3.0, decoder=dict( type='SimCCLabel', input_size=(192, 256), @@ -110,14 +99,17 @@ def test_predict(self): in_channels=[16, 32], out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], input_transform='select', input_index=-1, decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, simcc_split_ratio=2.0, with_simcc_label=True) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -141,8 +133,10 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, simcc_split_ratio=2.0, with_simcc_label=True) + batch_data_samples = get_packed_inputs( + batch_size=2, + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -156,21 +150,26 @@ def test_predict(self): in_channels=[16, 32], out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=decoder_cfg['simcc_split_ratio'], input_transform='select', input_index=-1, decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, simcc_split_ratio=2.0, with_simcc_label=True) - preds = head.predict( + batch_data_samples = get_packed_inputs( + batch_size=2, + 
simcc_split_ratio=decoder_cfg['simcc_split_ratio'], + with_simcc_label=True)['data_samples'] + preds, pred_heatmaps = head.predict( feats, batch_data_samples, test_cfg=dict(output_heatmaps=True)) self.assertEqual(preds[0].keypoint_x_labels.shape, (1, 17, 192 * 2)) self.assertEqual(preds[0].keypoint_y_labels.shape, (1, 17, 256 * 2)) + self.assertTrue(len(pred_heatmaps), 2) + self.assertEqual(pred_heatmaps[0].heatmaps.shape, (17, 512, 384)) def test_tta(self): # flip test @@ -185,14 +184,16 @@ def test_tta(self): in_channels=[16, 32], out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, input_transform='select', input_index=-1, decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, simcc_split_ratio=2.0, with_simcc_label=True) + batch_data_samples = get_packed_inputs( + batch_size=2, simcc_split_ratio=2.0, + with_simcc_label=True)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict(flip_test=True)) @@ -231,15 +232,17 @@ def test_loss(self): in_channels=[16, 32], out_channels=17, input_size=(192, 256), - in_featuremap_size=(8, 6), + in_featuremap_size=(6, 8), + simcc_split_ratio=2.0, input_transform='select', input_index=-1, decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, simcc_split_ratio=2.0, with_simcc_label=True) + batch_data_samples = get_packed_inputs( + batch_size=2, simcc_split_ratio=2.0, + with_simcc_label=True)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) self.assertEqual(losses['loss_kpt'].shape, torch.Size(())) @@ -247,7 +250,7 @@ def test_loss(self): def test_errors(self): # Invalid arguments - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + 
with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = SimCCHead( in_channels=[16, 32], out_channels=17, @@ -256,7 +259,7 @@ def test_errors(self): deconv_out_channels=(256, ), deconv_kernel_sizes=(4, 4)) - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = SimCCHead( in_channels=[16, 32], out_channels=17, diff --git a/tests/test_models/test_heads/test_heatmap_heads/test_vipnas_head.py b/tests/test_models/test_heads/test_heatmap_heads/test_vipnas_head.py index 8848b7983c..060fceced8 100644 --- a/tests/test_models/test_heads/test_heatmap_heads/test_vipnas_head.py +++ b/tests/test_models/test_heads/test_heatmap_heads/test_vipnas_head.py @@ -22,12 +22,6 @@ def _get_feats(self, ] return feats - def _get_data_samples(self, batch_size: int = 2): - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs(batch_size) - ] - return batch_data_samples - def test_init(self): # w/o deconv head = ViPNASHead( @@ -88,7 +82,7 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -110,7 +104,7 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -127,7 +121,7 @@ def test_predict(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = 
get_packed_inputs(batch_size=2)['data_samples'] _, pred_heatmaps = head.predict( feats, batch_data_samples, test_cfg=dict(output_heatmaps=True)) @@ -152,7 +146,7 @@ def test_tta(self): decoder=decoder_cfg) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict( @@ -175,7 +169,7 @@ def test_loss(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) self.assertEqual(losses['loss_kpt'].shape, torch.Size(())) @@ -183,13 +177,13 @@ def test_loss(self): def test_errors(self): # Invalid arguments - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = ViPNASHead( in_channels=[16, 32], out_channels=17, deconv_out_channels=(256, ), deconv_kernel_sizes=(4, 4)) - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = ViPNASHead( in_channels=[16, 32], out_channels=17, @@ -197,7 +191,7 @@ def test_errors(self): deconv_kernel_sizes=(4, 4), deconv_num_groups=(1, )) - with self.assertRaisesRegex(ValueError, 'Got unmatched values'): + with self.assertRaisesRegex(ValueError, 'Got mismatched lengths'): _ = ViPNASHead( in_channels=[16, 32], out_channels=17, diff --git a/tests/test_models/test_heads/test_hybrid_heads/test_dekr_head.py b/tests/test_models/test_heads/test_hybrid_heads/test_dekr_head.py new file mode 100644 index 0000000000..957f3499d0 --- /dev/null +++ b/tests/test_models/test_heads/test_hybrid_heads/test_dekr_head.py @@ -0,0 
+1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple +from unittest import TestCase + +import torch +from mmengine.utils import is_tuple_of + +from mmpose.models.heads import DEKRHead +from mmpose.testing import get_coco_sample, get_packed_inputs +from mmpose.utils.tensor_utils import to_tensor + + +class TestDEKRHead(TestCase): + + def _get_feats( + self, + batch_size: int = 1, + feat_shapes: List[Tuple[int, int, int]] = [(32, 128, 128)], + ): + + feats = [ + torch.rand((batch_size, ) + shape, dtype=torch.float32) + for shape in feat_shapes + ] + + if len(feats) > 1: + feats = [[x] for x in feats] + + return feats + + def _get_data_samples(self): + data_samples = get_packed_inputs( + 1, + input_size=(512, 512), + heatmap_size=(128, 128), + img_shape=(512, 512))['data_samples'] + return data_samples + + def test_forward(self): + + head = DEKRHead(in_channels=32, num_keypoints=17) + + feats = [torch.rand(1, 32, 128, 128)] + output = head.forward(feats) # should be (heatmaps, displacements) + self.assertTrue(is_tuple_of(output, torch.Tensor)) + self.assertEqual(output[0].shape, (1, 18, 128, 128)) + self.assertEqual(output[1].shape, (1, 34, 128, 128)) + + def test_predict(self): + + codec_cfg = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + generate_keypoint_heatmaps=True, + ) + + head = DEKRHead(in_channels=32, num_keypoints=17, decoder=codec_cfg) + + feats = self._get_feats() + data_samples = self._get_data_samples() + with torch.no_grad(): + preds = head.predict(feats, data_samples) + self.assertEqual(len(preds), 1) + self.assertEqual(preds[0].keypoints.shape[1:], (17, 2)) + self.assertEqual(preds[0].keypoint_scores.shape[1:], (17, )) + + # predict with rescore net + head = DEKRHead( + in_channels=32, + num_keypoints=17, + decoder=codec_cfg, + rescore_cfg=dict(in_channels=74, norm_indexes=(5, 6))) + + with torch.no_grad(): + preds = head.predict(feats, data_samples) + 
self.assertEqual(len(preds), 1) + self.assertEqual(preds[0].keypoints.shape[1:], (17, 2)) + self.assertEqual(preds[0].keypoint_scores.shape[1:], (17, )) + + # tta + with torch.no_grad(): + feats_flip = self._get_feats(feat_shapes=[(32, 128, + 128), (32, 128, 128)]) + preds = head.predict(feats_flip, data_samples, + dict(flip_test=True)) + self.assertEqual(len(preds), 1) + self.assertEqual(preds[0].keypoints.shape[1:], (17, 2)) + self.assertEqual(preds[0].keypoint_scores.shape[1:], (17, )) + + # output heatmaps + with torch.no_grad(): + _, pred_fields = head.predict(feats, data_samples, + dict(output_heatmaps=True)) + self.assertEqual(len(pred_fields), 1) + self.assertEqual(pred_fields[0].heatmaps.shape, (18, 128, 128)) + self.assertEqual(pred_fields[0].displacements.shape, + (34, 128, 128)) + + def test_loss(self): + data = get_coco_sample(img_shape=(512, 512), num_instances=1) + + codec_cfg = dict( + type='SPR', + input_size=(512, 512), + heatmap_size=(128, 128), + sigma=(4, 2), + generate_keypoint_heatmaps=True, + ) + + head = DEKRHead( + in_channels=32, + num_keypoints=17, + decoder=codec_cfg, + heatmap_loss=dict(type='KeypointMSELoss', use_target_weight=True), + displacement_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + beta=1 / 9, + )) + + encoded = head.decoder.encode(data['keypoints'], + data['keypoints_visible']) + feats = self._get_feats() + data_samples = self._get_data_samples() + for data_sample in data_samples: + data_sample.gt_fields.set_data( + {k: to_tensor(v) + for k, v in encoded.items()}) + + losses = head.loss(feats, data_samples) + self.assertIn('loss/heatmap', losses) + self.assertEqual(losses['loss/heatmap'].ndim, 0) + self.assertIn('loss/displacement', losses) + self.assertEqual(losses['loss/displacement'].ndim, 0) diff --git a/tests/test_models/test_heads/test_regression_heads/test_dsnt_head.py b/tests/test_models/test_heads/test_regression_heads/test_dsnt_head.py index 
84f86c39f3..0ccdb52e42 100644 --- a/tests/test_models/test_heads/test_regression_heads/test_dsnt_head.py +++ b/tests/test_models/test_heads/test_regression_heads/test_dsnt_head.py @@ -25,16 +25,6 @@ def _get_feats( return feats - def _get_data_samples(self, - batch_size: int = 2, - with_reg_label: bool = False): - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size, with_reg_label=with_reg_label) - ] - - return batch_data_samples - def test_init(self): # square heatmap head = DSNTHead( @@ -137,8 +127,8 @@ def test_predict(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_reg_label=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_reg_label=False)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -157,7 +147,7 @@ def test_predict(self): ) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -177,8 +167,8 @@ def test_predict(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_reg_label=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_reg_label=False)['data_samples'] _, pred_heatmaps = head.predict( feats, batch_data_samples, test_cfg=dict(output_heatmaps=True)) @@ -205,8 +195,8 @@ def test_tta(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_reg_label=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_reg_label=False)['data_samples'] preds = head.predict([feats, feats], 
batch_data_samples, test_cfg=dict(flip_test=True)) @@ -233,8 +223,8 @@ def test_loss(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_reg_label=True) + batch_data_samples = get_packed_inputs( + batch_size=2, with_reg_label=True)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) diff --git a/tests/test_models/test_heads/test_regression_heads/test_integral_regression_head.py b/tests/test_models/test_heads/test_regression_heads/test_integral_regression_head.py index 5357e0c2c3..5e93ae91cc 100644 --- a/tests/test_models/test_heads/test_regression_heads/test_integral_regression_head.py +++ b/tests/test_models/test_heads/test_regression_heads/test_integral_regression_head.py @@ -25,16 +25,6 @@ def _get_feats( return feats - def _get_data_samples(self, - batch_size: int = 2, - with_heatmap: bool = False): - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size, with_heatmap=with_heatmap) - ] - - return batch_data_samples - def test_init(self): # square heatmap head = IntegralRegressionHead( @@ -129,8 +119,8 @@ def test_predict(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -149,7 +139,7 @@ def test_predict(self): ) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -169,8 +159,8 @@ def test_predict(self): feats = self._get_feats( 
batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] _, pred_heatmaps = head.predict( feats, batch_data_samples, test_cfg=dict(output_heatmaps=True)) @@ -193,8 +183,8 @@ def test_tta(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict( @@ -218,7 +208,7 @@ def test_loss(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 16, 12), (32, 8, 6)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) diff --git a/tests/test_models/test_heads/test_regression_heads/test_regression_head.py b/tests/test_models/test_heads/test_regression_heads/test_regression_head.py index 89c8d8c1ff..d0d8782ef0 100644 --- a/tests/test_models/test_heads/test_regression_heads/test_regression_head.py +++ b/tests/test_models/test_heads/test_regression_heads/test_regression_head.py @@ -25,17 +25,6 @@ def _get_feats( return feats - def _get_data_samples(self, - batch_size: int = 2, - with_heatmap: bool = False): - - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size, with_heatmap=with_heatmap) - ] - - return batch_data_samples - def test_init(self): head = RegressionHead(in_channels=1024, num_joints=17) @@ -64,8 +53,8 @@ def test_predict(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + 
batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -83,7 +72,7 @@ def test_predict(self): ) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -105,8 +94,8 @@ def test_tta(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict(flip_test=True, shift_coords=True)) @@ -126,8 +115,8 @@ def test_loss(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) diff --git a/tests/test_models/test_heads/test_regression_heads/test_rle_head.py b/tests/test_models/test_heads/test_regression_heads/test_rle_head.py index c717bbf9c3..b4be688bfb 100644 --- a/tests/test_models/test_heads/test_regression_heads/test_rle_head.py +++ b/tests/test_models/test_heads/test_regression_heads/test_rle_head.py @@ -25,17 +25,6 @@ def _get_feats( return feats - def _get_data_samples(self, - batch_size: int = 2, - with_heatmap: bool = False): - - batch_data_samples = [ - inputs['data_sample'] for inputs in get_packed_inputs( - batch_size, with_heatmap=with_heatmap) - ] - - return batch_data_samples - def test_init(self): # w/ sigma @@ -65,8 +54,8 @@ def 
test_predict(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -84,7 +73,7 @@ def test_predict(self): ) feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples(batch_size=2) + batch_data_samples = get_packed_inputs(batch_size=2)['data_samples'] preds = head.predict(feats, batch_data_samples) self.assertTrue(len(preds), 2) @@ -106,8 +95,8 @@ def test_tta(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] preds = head.predict([feats, feats], batch_data_samples, test_cfg=dict(flip_test=True)) @@ -127,8 +116,8 @@ def test_loss(self): feats = self._get_feats( batch_size=2, feat_shapes=[(16, 1, 1), (32, 1, 1)]) - batch_data_samples = self._get_data_samples( - batch_size=2, with_heatmap=False) + batch_data_samples = get_packed_inputs( + batch_size=2, with_heatmap=False)['data_samples'] losses = head.loss(feats, batch_data_samples) self.assertIsInstance(losses['loss_kpt'], torch.Tensor) diff --git a/tests/test_models/test_losses/test_ae_loss.py b/tests/test_models/test_losses/test_ae_loss.py new file mode 100644 index 0000000000..406c075532 --- /dev/null +++ b/tests/test_models/test_losses/test_ae_loss.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from itertools import product +from typing import Tuple +from unittest import TestCase + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from mmpose.codecs.associative_embedding import AssociativeEmbedding +from mmpose.models.losses.ae_loss import AssociativeEmbeddingLoss +from mmpose.testing._utils import get_coco_sample + + +class AELoss(nn.Module): + """Associative Embedding loss in MMPose v0.x.""" + + def __init__(self, loss_type): + super().__init__() + self.loss_type = loss_type + + @staticmethod + def _make_input(t, requires_grad=False, device=torch.device('cpu')): + """Make zero inputs for AE loss. + + Args: + t (torch.Tensor): input + requires_grad (bool): Option to use requires_grad. + device: torch device + + Returns: + torch.Tensor: zero input. + """ + inp = torch.autograd.Variable(t, requires_grad=requires_grad) + inp = inp.sum() + inp = inp.to(device) + return inp + + def singleTagLoss(self, pred_tag, joints): + """Associative embedding loss for one image. + + Note: + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image. + joints (torch.Tensor[M,K,2]): joints information for one image. 
+ """ + tags = [] + pull = 0 + pred_tag = pred_tag.view(17, -1, 1) + for joints_per_person in joints: + tmp = [] + for k, joint in enumerate(joints_per_person): + if joint[1] > 0: + tmp.append(pred_tag[k, joint[0]]) + if len(tmp) == 0: + continue + tmp = torch.stack(tmp) + tags.append(torch.mean(tmp, dim=0)) + pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2) + + num_tags = len(tags) + if num_tags == 0: + return (self._make_input( + torch.zeros(1).float(), device=pred_tag.device), + self._make_input( + torch.zeros(1).float(), device=pred_tag.device)) + elif num_tags == 1: + return (self._make_input( + torch.zeros(1).float(), device=pred_tag.device), pull) + + tags = torch.stack(tags) + + size = (num_tags, num_tags) + A = tags.expand(*size) + B = A.permute(1, 0) + + diff = A - B + + if self.loss_type == 'exp': + diff = torch.pow(diff, 2) + push = torch.exp(-diff) + push = torch.sum(push) + elif self.loss_type == 'max': + diff = 1 - torch.abs(diff) + push = torch.clamp(diff, min=0).sum() - num_tags + else: + raise ValueError('Unknown ae loss type') + + push_loss = push / ((num_tags - 1) * num_tags) * 0.5 + pull_loss = pull / (num_tags) + + return push_loss, pull_loss + + def forward(self, tags, keypoint_indices): + assert tags.shape[0] == len(keypoint_indices) + + pull_loss = 0. + push_loss = 0. 
+ + for i in range(tags.shape[0]): + _push, _pull = self.singleTagLoss(tags[i].view(-1, 1), + keypoint_indices[i]) + pull_loss += _pull + push_loss += _push + + return pull_loss, push_loss + + +class TestAssociativeEmbeddingLoss(TestCase): + + def _make_input(self, num_instance: int) -> Tuple[Tensor, Tensor]: + + encoder = AssociativeEmbedding( + input_size=(256, 256), heatmap_size=(64, 64)) + + data = get_coco_sample( + img_shape=(256, 256), num_instances=num_instance) + encoded = encoder.encode(data['keypoints'], data['keypoints_visible']) + heatmaps = encoded['heatmaps'] + keypoint_indices = encoded['keypoint_indices'] + + tags = self._get_tags( + heatmaps, keypoint_indices, tag_per_keypoint=True) + + batch_tags = torch.from_numpy(tags[None]) + batch_keypoint_indices = [torch.from_numpy(keypoint_indices)] + + return batch_tags, batch_keypoint_indices + + def _get_tags(self, + heatmaps, + keypoint_indices, + tag_per_keypoint: bool, + with_randomness: bool = True): + + K, H, W = heatmaps.shape + N = keypoint_indices.shape[0] + + if tag_per_keypoint: + tags = np.zeros((K, H, W), dtype=np.float32) + else: + tags = np.zeros((1, H, W), dtype=np.float32) + + for n, k in product(range(N), range(K)): + y, x = np.unravel_index(keypoint_indices[n, k, 0], (H, W)) + + randomness = np.random.rand() if with_randomness else 0 + + if tag_per_keypoint: + tags[k, y, x] = n + randomness + else: + tags[0, y, x] = n + randomness + + return tags + + def test_loss(self): + + tags, keypoint_indices = self._make_input(num_instance=2) + + # test loss calculation + loss_module = AssociativeEmbeddingLoss() + pull_loss, push_loss = loss_module(tags, keypoint_indices) + _pull_loss, _push_loss = AELoss('exp')(tags, keypoint_indices) + + self.assertTrue(torch.allclose(pull_loss, _pull_loss)) + self.assertTrue(torch.allclose(push_loss, _push_loss)) + + # test loss weight + loss_module = AssociativeEmbeddingLoss(loss_weight=0.) 
+ pull_loss, push_loss = loss_module(tags, keypoint_indices) + + self.assertTrue(torch.allclose(pull_loss, torch.zeros(1))) + self.assertTrue(torch.allclose(push_loss, torch.zeros(1))) + + # test push loss factor + loss_module = AssociativeEmbeddingLoss(push_loss_factor=0.) + pull_loss, push_loss = loss_module(tags, keypoint_indices) + + self.assertFalse(torch.allclose(pull_loss, torch.zeros(1))) + self.assertTrue(torch.allclose(push_loss, torch.zeros(1))) diff --git a/tests/test_models/test_losses/test_classification_losses.py b/tests/test_models/test_losses/test_classification_losses.py new file mode 100644 index 0000000000..fd7d3fd898 --- /dev/null +++ b/tests/test_models/test_losses/test_classification_losses.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmpose.models.losses.classification_loss import InfoNCELoss + + +class TestInfoNCELoss(TestCase): + + def test_loss(self): + + # test loss w/o target_weight + loss = InfoNCELoss(temperature=0.05) + + fake_pred = torch.arange(5 * 2).reshape(5, 2).float() + self.assertTrue( + torch.allclose(loss(fake_pred), torch.tensor(5.4026), atol=1e-4)) + + # check if the value of temperature is positive + with self.assertRaises(AssertionError): + loss = InfoNCELoss(temperature=0.) diff --git a/tests/test_models/test_losses/test_heatmap_losses.py b/tests/test_models/test_losses/test_heatmap_losses.py new file mode 100644 index 0000000000..bfabc84749 --- /dev/null +++ b/tests/test_models/test_losses/test_heatmap_losses.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmpose.models.losses.heatmap_loss import (AdaptiveWingLoss, + FocalHeatmapLoss) + + +class TestAdaptiveWingLoss(TestCase): + + def test_loss(self): + + # test loss w/o target_weight + loss = AdaptiveWingLoss(use_target_weight=False) + + fake_pred = torch.zeros((1, 3, 2, 2)) + fake_label = torch.zeros((1, 3, 2, 2)) + self.assertTrue( + torch.allclose(loss(fake_pred, fake_label), torch.tensor(0.))) + + fake_pred = torch.ones((1, 3, 2, 2)) + fake_label = torch.zeros((1, 3, 2, 2)) + self.assertTrue( + torch.allclose( + loss(fake_pred, fake_label), torch.tensor(8.4959), atol=1e-4)) + + # test loss w/ target_weight + loss = AdaptiveWingLoss(use_target_weight=True) + + fake_pred = torch.zeros((1, 3, 2, 2)) + fake_label = torch.zeros((1, 3, 2, 2)) + fake_weight = torch.tensor([1, 0, 1]).reshape(1, 3).float() + self.assertTrue( + torch.allclose( + loss(fake_pred, fake_label, fake_weight), torch.tensor(0.))) + + +class TestFocalHeatmapLoss(TestCase): + + def test_loss(self): + + loss = FocalHeatmapLoss(use_target_weight=False) + + fake_pred = torch.zeros((1, 3, 5, 5)) + fake_label = torch.zeros((1, 3, 5, 5)) + + self.assertTrue( + torch.allclose(loss(fake_pred, fake_label), torch.tensor(0.))) + + fake_pred = torch.ones((1, 3, 5, 5)) * 0.4 + fake_label = torch.ones((1, 3, 5, 5)) * 0.6 + self.assertTrue( + torch.allclose( + loss(fake_pred, fake_label), torch.tensor(0.1569), atol=1e-4)) + + # test loss w/ target_weight + loss = FocalHeatmapLoss(use_target_weight=True) + + fake_weight = torch.arange(3 * 5 * 5).reshape(1, 3, 5, 5).float() + self.assertTrue( + torch.allclose( + loss(fake_pred, fake_label, fake_weight), + torch.tensor(5.8062), + atol=1e-4)) diff --git a/tests/test_models/test_losses/test_regression_losses.py b/tests/test_models/test_losses/test_regression_losses.py new file mode 100644 index 0000000000..0975ac6b55 --- /dev/null +++ b/tests/test_models/test_losses/test_regression_losses.py @@ -0,0 +1,48 @@ +# 
Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmpose.models.losses.regression_loss import SoftWeightSmoothL1Loss + + +class TestSoftWeightSmoothL1Loss(TestCase): + + def test_loss(self): + + # test loss w/o target_weight + loss = SoftWeightSmoothL1Loss(use_target_weight=False, beta=0.5) + + fake_pred = torch.zeros((1, 3, 2)) + fake_label = torch.zeros((1, 3, 2)) + self.assertTrue( + torch.allclose(loss(fake_pred, fake_label), torch.tensor(0.))) + + fake_pred = torch.ones((1, 3, 2)) + fake_label = torch.zeros((1, 3, 2)) + self.assertTrue( + torch.allclose(loss(fake_pred, fake_label), torch.tensor(.75))) + + # test loss w/ target_weight + loss = SoftWeightSmoothL1Loss( + use_target_weight=True, supervise_empty=True) + + fake_pred = torch.ones((1, 3, 2)) + fake_label = torch.zeros((1, 3, 2)) + fake_weight = torch.arange(6).reshape(1, 3, 2).float() + self.assertTrue( + torch.allclose( + loss(fake_pred, fake_label, fake_weight), torch.tensor(1.25))) + + # test loss that does not take empty channels into account + loss = SoftWeightSmoothL1Loss( + use_target_weight=True, supervise_empty=False) + self.assertTrue( + torch.allclose( + loss(fake_pred, fake_label, fake_weight), torch.tensor(1.5))) + + with self.assertRaises(ValueError): + _ = loss.smooth_l1_loss(fake_pred, fake_label, reduction='fake') + + output = loss.smooth_l1_loss(fake_pred, fake_label, reduction='sum') + self.assertTrue(torch.allclose(output, torch.tensor(3.0))) diff --git a/tests/test_models/test_pose_estimators/test_bottomup.py b/tests/test_models/test_pose_estimators/test_bottomup.py new file mode 100644 index 0000000000..16258dacaf --- /dev/null +++ b/tests/test_models/test_pose_estimators/test_bottomup.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import unittest +from unittest import TestCase + +import torch +from parameterized import parameterized + +from mmpose.testing import get_packed_inputs, get_pose_estimator_cfg +from mmpose.utils import register_all_modules + +configs = [ + 'body_2d_keypoint/associative_embedding/coco/' + 'ae_hrnet-w32_8xb24-300e_coco-512x512.py' +] + +configs_with_devices = [(config, ('cpu', 'cuda')) for config in configs] + + +class TestTopdownPoseEstimator(TestCase): + + def setUp(self) -> None: + register_all_modules() + + @parameterized.expand(configs) + def test_init(self, config): + model_cfg = get_pose_estimator_cfg(config) + model_cfg.backbone.init_cfg = None + + from mmpose.models import build_pose_estimator + model = build_pose_estimator(model_cfg) + self.assertTrue(model.backbone) + self.assertTrue(model.head) + if model_cfg.get('neck', None): + self.assertTrue(model.neck) + + @parameterized.expand(configs_with_devices) + def test_forward_tensor(self, config, devices): + model_cfg = get_pose_estimator_cfg(config) + model_cfg.backbone.init_cfg = None + + from mmpose.models import build_pose_estimator + + for device in devices: + model = build_pose_estimator(model_cfg) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = get_packed_inputs(2) + data = model.data_preprocessor(packed_inputs, training=True) + batch_results = model.forward(**data, mode='tensor') + self.assertIsInstance(batch_results, (tuple, torch.Tensor)) diff --git a/tests/test_models/test_pose_estimators/test_topdown.py b/tests/test_models/test_pose_estimators/test_topdown.py index 62f52f5b54..bc65cecd3e 100644 --- a/tests/test_models/test_pose_estimators/test_topdown.py +++ b/tests/test_models/test_pose_estimators/test_topdown.py @@ -1,10 +1,102 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import unittest from unittest import TestCase +import torch +from parameterized import parameterized + +from mmpose.structures import PoseDataSample +from mmpose.testing import get_packed_inputs, get_pose_estimator_cfg from mmpose.utils import register_all_modules +configs = [ + 'body_2d_keypoint/topdown_heatmap/coco/' + 'td-hm_hrnet-w32_8xb64-210e_coco-256x192.py', + 'configs/body_2d_keypoint/topdown_regression/coco/' + 'td-reg_res50_8xb64-210e_coco-256x192.py', + 'configs/body_2d_keypoint/simcc/coco/' + 'simcc_mobilenetv2_wo-deconv-8xb64-210e_coco-256x192.py', +] + +configs_with_devices = [(config, ('cpu', 'cuda')) for config in configs] + class TestTopdownPoseEstimator(TestCase): def setUp(self) -> None: register_all_modules() + + @parameterized.expand(configs) + def test_init(self, config): + model_cfg = get_pose_estimator_cfg(config) + model_cfg.backbone.init_cfg = None + + from mmpose.models import build_pose_estimator + model = build_pose_estimator(model_cfg) + self.assertTrue(model.backbone) + self.assertTrue(model.head) + if model_cfg.get('neck', None): + self.assertTrue(model.neck) + + @parameterized.expand(configs_with_devices) + def test_forward_loss(self, config, devices): + model_cfg = get_pose_estimator_cfg(config) + model_cfg.backbone.init_cfg = None + + from mmpose.models import build_pose_estimator + + for device in devices: + model = build_pose_estimator(model_cfg) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = get_packed_inputs(2) + data = model.data_preprocessor(packed_inputs, training=True) + losses = model.forward(**data, mode='loss') + self.assertIsInstance(losses, dict) + + @parameterized.expand(configs_with_devices) + def test_forward_predict(self, config, devices): + model_cfg = get_pose_estimator_cfg(config) + model_cfg.backbone.init_cfg = None + + from mmpose.models import build_pose_estimator + + for device in devices: + 
model = build_pose_estimator(model_cfg) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = get_packed_inputs(2) + model.eval() + with torch.no_grad(): + data = model.data_preprocessor(packed_inputs, training=True) + batch_results = model.forward(**data, mode='predict') + self.assertEqual(len(batch_results), 2) + self.assertIsInstance(batch_results[0], PoseDataSample) + + @parameterized.expand(configs_with_devices) + def test_forward_tensor(self, config, devices): + model_cfg = get_pose_estimator_cfg(config) + model_cfg.backbone.init_cfg = None + + from mmpose.models import build_pose_estimator + + for device in devices: + model = build_pose_estimator(model_cfg) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + model = model.cuda() + + packed_inputs = get_packed_inputs(2) + data = model.data_preprocessor(packed_inputs, training=True) + batch_results = model.forward(**data, mode='tensor') + self.assertIsInstance(batch_results, (tuple, torch.Tensor)) diff --git a/tests/test_visualization/test_pose_visualizer.py b/tests/test_visualization/test_pose_visualizer.py index 8de33fc1ed..d9eb502c30 100644 --- a/tests/test_visualization/test_pose_visualizer.py +++ b/tests/test_visualization/test_pose_visualizer.py @@ -97,6 +97,13 @@ def test_add_datasample(self): out_file=out_file) self._assert_image_and_shape(out_file, ((h * 2), (w * 2), 3)) + def test_simcc_visualization(self): + img = np.zeros((512, 512, 3), dtype=np.uint8) + heatmap = torch.randn([17, 512, 512]) + pixelData = PixelData() + pixelData.heatmaps = heatmap + self.visualizer._draw_instance_xy_heatmap(pixelData, img, 10) + def _assert_image_and_shape(self, out_file, out_shape): self.assertTrue(os.path.exists(out_file)) drawn_img = cv2.imread(out_file) diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py 
index 1445acfd92..9325037699 100644 --- a/tools/analysis_tools/get_flops.py +++ b/tools/analysis_tools/get_flops.py @@ -6,7 +6,6 @@ from mmengine.config import DictAction from mmpose.apis.inference import init_model -from mmpose.utils import register_all_modules as register_mmpose_modules try: from mmcv.cnn import get_model_complexity_info @@ -70,7 +69,6 @@ def batch_constructor(flops_model, batch_size, input_shape): def main(): - register_mmpose_modules() args = parse_args() diff --git a/tools/deployment/pytorch2onnx.py b/tools/deployment/pytorch2onnx.py deleted file mode 100644 index 5caff6e070..0000000000 --- a/tools/deployment/pytorch2onnx.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import warnings - -import numpy as np -import torch - -from mmpose.apis import init_pose_model - -try: - import onnx - import onnxruntime as rt -except ImportError as e: - raise ImportError(f'Please install onnx and onnxruntime first. {e}') - -try: - from mmcv.onnx.symbolic import register_extra_symbolics -except ModuleNotFoundError: - raise NotImplementedError('please update mmcv to version>=1.0.4') - - -def _convert_batchnorm(module): - """Convert the syncBNs into normal BN3ds.""" - module_output = module - if isinstance(module, torch.nn.SyncBatchNorm): - module_output = torch.nn.BatchNorm3d(module.num_features, module.eps, - module.momentum, module.affine, - module.track_running_stats) - if module.affine: - module_output.weight.data = module.weight.data.clone().detach() - module_output.bias.data = module.bias.data.clone().detach() - # keep requires_grad unchanged - module_output.weight.requires_grad = module.weight.requires_grad - module_output.bias.requires_grad = module.bias.requires_grad - module_output.running_mean = module.running_mean - module_output.running_var = module.running_var - module_output.num_batches_tracked = module.num_batches_tracked - for name, child in module.named_children(): - 
module_output.add_module(name, _convert_batchnorm(child)) - del module - return module_output - - -def pytorch2onnx(model, - input_shape, - opset_version=11, - show=False, - output_file='tmp.onnx', - verify=False): - """Convert pytorch model to onnx model. - - Args: - model (:obj:`nn.Module`): The pytorch model to be exported. - input_shape (tuple[int]): The input tensor shape of the model. - opset_version (int): Opset version of onnx used. Default: 11. - show (bool): Determines whether to print the onnx model architecture. - Default: False. - output_file (str): Output onnx model name. Default: 'tmp.onnx'. - verify (bool): Determines whether to verify the onnx model. - Default: False. - """ - model.cpu().eval() - - one_img = torch.randn(input_shape) - - register_extra_symbolics(opset_version) - torch.onnx.export( - model, - one_img, - output_file, - export_params=True, - keep_initializers_as_inputs=True, - verbose=show, - opset_version=opset_version) - - print(f'Successfully exported ONNX model: {output_file}') - if verify: - # check by onnx - onnx_model = onnx.load(output_file) - onnx.checker.check_model(onnx_model) - - # check the numerical value - # get pytorch output - pytorch_results = model(one_img) - if not isinstance(pytorch_results, (list, tuple)): - assert isinstance(pytorch_results, torch.Tensor) - pytorch_results = [pytorch_results] - - # get onnx output - input_all = [node.name for node in onnx_model.graph.input] - input_initializer = [ - node.name for node in onnx_model.graph.initializer - ] - net_feed_input = list(set(input_all) - set(input_initializer)) - assert len(net_feed_input) == 1 - sess = rt.InferenceSession(output_file) - onnx_results = sess.run(None, - {net_feed_input[0]: one_img.detach().numpy()}) - - # compare results - assert len(pytorch_results) == len(onnx_results) - for pt_result, onnx_result in zip(pytorch_results, onnx_results): - assert np.allclose( - pt_result.detach().cpu(), onnx_result, atol=1.e-5 - ), 'The outputs are different 
between Pytorch and ONNX' - print('The numerical values are same between Pytorch and ONNX') - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Convert MMPose models to ONNX') - parser.add_argument('config', help='test config file path') - parser.add_argument('checkpoint', help='checkpoint file') - parser.add_argument('--show', action='store_true', help='show onnx graph') - parser.add_argument('--output-file', type=str, default='tmp.onnx') - parser.add_argument('--opset-version', type=int, default=11) - parser.add_argument( - '--verify', - action='store_true', - help='verify the onnx model output against pytorch output') - parser.add_argument( - '--shape', - type=int, - nargs='+', - default=[1, 3, 256, 192], - help='input size') - args = parser.parse_args() - return args - - -if __name__ == '__main__': - args = parse_args() - - assert args.opset_version == 11, 'MMPose only supports opset 11 now' - - # Following strings of text style are from colorama package - bright_style, reset_style = '\x1b[1m', '\x1b[0m' - red_text, blue_text = '\x1b[31m', '\x1b[34m' - white_background = '\x1b[107m' - - msg = white_background + bright_style + red_text - msg += 'DeprecationWarning: This tool will be deprecated in future. 
' - msg += blue_text + 'Welcome to use the unified model deployment toolbox ' - msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' - msg += reset_style - warnings.warn(msg) - - model = init_pose_model(args.config, args.checkpoint, device='cpu') - model = _convert_batchnorm(model) - - # onnx.export does not support kwargs - if hasattr(model, 'forward_dummy'): - model.forward = model.forward_dummy - else: - raise NotImplementedError( - 'Please implement the forward method for exporting.') - - # convert model to onnx file - pytorch2onnx( - model, - args.shape, - opset_version=args.opset_version, - show=args.show, - output_file=args.output_file, - verify=args.verify) diff --git a/tools/misc/browse_dataset.py b/tools/misc/browse_dataset.py index b744b0a3c2..74dfa39d6b 100644 --- a/tools/misc/browse_dataset.py +++ b/tools/misc/browse_dataset.py @@ -7,12 +7,11 @@ import mmengine import numpy as np from mmengine import Config, DictAction -from mmengine.registry import build_from_cfg +from mmengine.registry import build_from_cfg, init_default_scope from mmengine.structures import InstanceData from mmpose.registry import DATASETS, VISUALIZERS from mmpose.structures import PoseDataSample -from mmpose.utils import register_all_modules def parse_args(): @@ -84,7 +83,7 @@ def main(): file_client = mmengine.FileClient(**file_client_args) # register all modules in mmpose into the registries - register_all_modules() + init_default_scope(cfg.get('default_scope', 'mmpose')) if args.mode == 'original': cfg[f'{args.phase}_dataloader'].dataset.pipeline = [] diff --git a/tools/misc/publish_model.py b/tools/misc/publish_model.py index 393721ab06..4a8338fdbd 100644 --- a/tools/misc/publish_model.py +++ b/tools/misc/publish_model.py @@ -4,6 +4,9 @@ from datetime import date import torch +from mmengine.logging import print_log +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION def parse_args(): @@ -11,18 +14,37 @@ def parse_args(): 
description='Process a checkpoint to be published') parser.add_argument('in_file', help='input checkpoint filename') parser.add_argument('out_file', help='output checkpoint filename') + parser.add_argument( + '--save-keys', + nargs='+', + type=str, + default=['meta', 'state_dict'], + help='keys to save in published checkpoint (default: meta state_dict)') args = parser.parse_args() return args -def process_checkpoint(in_file, out_file): +def process_checkpoint(in_file, out_file, save_keys=['meta', 'state_dict']): checkpoint = torch.load(in_file, map_location='cpu') - # remove optimizer for smaller file size - if 'optimizer' in checkpoint: - del checkpoint['optimizer'] + + # only keep `meta` and `state_dict` for smaller file size + ckpt_keys = list(checkpoint.keys()) + for k in ckpt_keys: + if k not in save_keys: + print_log( + f'Key `{k}` will be removed because it is not in ' + f'save_keys. If you want to keep it, ' + f'please set --save-keys.', + logger='current') + checkpoint.pop(k, None) + # if it is necessary to remove some sensitive data in checkpoint['meta'], # add the code here. 
- torch.save(checkpoint, out_file) + + if digit_version(TORCH_VERSION) >= digit_version('1.6.0'): + torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, out_file) sha = subprocess.check_output(['sha256sum', out_file]).decode() if out_file.endswith('.pth'): out_file_name = out_file[:-4] @@ -36,7 +58,7 @@ def process_checkpoint(in_file, out_file): def main(): args = parse_args() - process_checkpoint(args.in_file, args.out_file) + process_checkpoint(args.in_file, args.out_file, args.save_keys) if __name__ == '__main__': diff --git a/tools/test.py b/tools/test.py index 673d64365f..3a22ae78c5 100644 --- a/tools/test.py +++ b/tools/test.py @@ -8,8 +8,6 @@ from mmengine.hooks import Hook from mmengine.runner import Runner -from mmpose.utils import register_all_modules - def parse_args(): parser = argparse.ArgumentParser( @@ -92,10 +90,6 @@ def merge_args(cfg, args): def main(): args = parse_args() - # register all modules in mmpose into the registries - # do not init the default scope here because it will be init in the runner - register_all_modules(init_default_scope=False) - # load config cfg = Config.fromfile(args.config) cfg = merge_args(cfg, args) diff --git a/tools/deployment/mmpose2torchserve.py b/tools/torchserve/mmpose2torchserve.py similarity index 100% rename from tools/deployment/mmpose2torchserve.py rename to tools/torchserve/mmpose2torchserve.py diff --git a/tools/deployment/mmpose_handler.py b/tools/torchserve/mmpose_handler.py similarity index 100% rename from tools/deployment/mmpose_handler.py rename to tools/torchserve/mmpose_handler.py diff --git a/tools/deployment/test_torchserver.py b/tools/torchserve/test_torchserver.py similarity index 100% rename from tools/deployment/test_torchserver.py rename to tools/torchserve/test_torchserver.py diff --git a/tools/train.py b/tools/train.py index 374e6e4dd3..a29051d9e0 100644 --- a/tools/train.py +++ b/tools/train.py @@ -6,8 +6,6 @@ from mmengine.config import 
Config, DictAction from mmengine.runner import Runner -from mmpose.utils import register_all_modules - def parse_args(): parser = argparse.ArgumentParser(description='Train a pose model') @@ -137,10 +135,6 @@ def merge_args(cfg, args): def main(): args = parse_args() - # register all modules in mmpose into the registries - # do not init the default scope here because it will be init in the runner - register_all_modules(init_default_scope=False) - # load config cfg = Config.fromfile(args.config) @@ -148,7 +142,9 @@ def main(): cfg = merge_args(cfg, args) # set preprocess configs to model - cfg.model.setdefault('data_preprocessor', cfg.get('preprocess_cfg', {})) + if 'preprocess_cfg' in cfg: + cfg.model.setdefault('data_preprocessor', + cfg.get('preprocess_cfg', {})) # build the runner from config runner = Runner.from_cfg(cfg)