[Feature] Implementation of RAM with a Gradio interface. (#1802)
* [CodeCamp2023-584] Support DINO self-supervised learning in projects (#1756)
  * feat: implement DINO
  * chore: delete debug code
  * chore: implement pre-commit
  * fix: fix imported package
  * chore: pre-commit check
* [CodeCamp2023-340] New version of config adapting the MobileNet algorithm (#1774)
  * add new configs adapting MobileNetV2 and MobileNetV3
  * add a base model config for MobileNetV3; all MobileNetV3 training configs now inherit from it
  * remove the directory _base_/models/mobilenet_v3
* [Feature] Implement a zero-shot CLIP classifier (#1737)
  * zero-shot CLIP
  * modify the zero-shot CLIP config
  * add in1k_sub_prompt (8 prompts) for improvement
  * add some annotation docs
  * CLIP base class and clip_zs subclass
  * some modifications of details after review
  * convert into and use mmpretrain-vit
  * modify names of some files and directories
* RAM initial commit
* [Fix] Fix pipeline bug in the image retrieval inferencer
* [CodeCamp2023-341] Supplement the multimodal dataset documentation - COCO Retrieval
* Update OFA to be compatible with the latest huggingface.
* Update train.py to be compatible with the new config.
* Bump version to v1.1.0
* Update __init__.py

---------

Co-authored-by: LALBJ <[email protected]>
Co-authored-by: DE009 <[email protected]>
Co-authored-by: mzr1996 <[email protected]>
Co-authored-by: 飞飞 <[email protected]>
1 parent c076651 · commit ed5924b
Showing 69 changed files with 4,618 additions and 26 deletions.
@@ -0,0 +1,68 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=False,
)

test_pipeline = [
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='CIFAR100',
        data_root='data/cifar100',
        split='test',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIPZeroShot',
    vision_backbone=dict(
        type='VisionTransformer',
        arch='base',
        img_size=224,
        patch_size=16,
        drop_rate=0.,
        layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
        pre_norm=True,
    ),
    projection=dict(type='CLIPProjection', in_channels=768, out_channels=512),
    text_backbone=dict(
        type='CLIPTransformer',
        width=512,
        layers=12,
        heads=8,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-base-patch16',
        use_fast=False),
    vocab_size=49408,
    transformer_width=512,
    proj_dim=512,
    text_prototype='cifar100',
    text_prompt='openai_cifar100',
    context_length=77,
)
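The mean and std values in the data_preprocessor above are the standard OpenAI CLIP normalization constants, scaled from the 0-1 range to the 0-255 pixel range the preprocessor works in. A quick sanity check of that arithmetic (not part of the commit, just an illustration):

# Quick check: the preprocessor constants above are the familiar OpenAI CLIP
# normalization values expressed on a 0-255 scale.
clip_mean = [0.48145466, 0.4578275, 0.40821073]
clip_std = [0.26862954, 0.26130258, 0.27577711]
print([round(m * 255, 4) for m in clip_mean])  # [122.7709, 116.746, 104.0937]
print([round(s * 255, 4) for s in clip_std])   # [68.5005, 66.6322, 70.3232]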
@@ -0,0 +1,69 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=True,
)

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='ImageNet',
        data_root='data/imagenet',
        split='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIPZeroShot',
    vision_backbone=dict(
        type='VisionTransformer',
        arch='base',
        img_size=224,
        patch_size=16,
        drop_rate=0.,
        layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
        pre_norm=True,
    ),
    projection=dict(type='CLIPProjection', in_channels=768, out_channels=512),
    text_backbone=dict(
        type='CLIPTransformer',
        width=512,
        layers=12,
        heads=8,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-base-patch16',
        use_fast=False),
    vocab_size=49408,
    transformer_width=512,
    proj_dim=512,
    text_prototype='imagenet',
    text_prompt='openai_imagenet_sub',  # openai_imagenet, openai_imagenet_sub
    context_length=77,
)
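As a rough sketch of how a config like the one above could be exercised, MMEngine's Runner can drive the test loop directly. The config and checkpoint paths below are hypothetical placeholders, since the diff does not show the file names or any checkpoint:

# Minimal sketch, assuming the config above is saved locally and a compatible
# converted CLIP checkpoint is available; both paths are placeholders.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('configs/clip/clip-vit-base-p16_zeroshot-cls_in1k.py')  # placeholder path
cfg.load_from = 'checkpoints/clip-vit-base-p16_converted.pth'                 # placeholder checkpoint
cfg.work_dir = 'work_dirs/clip_zeroshot_in1k'

runner = Runner.from_cfg(cfg)
metrics = runner.test()  # the Accuracy evaluator reports top-1 / top-5
print(metrics)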
@@ -0,0 +1,68 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=False,
)

test_pipeline = [
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='CIFAR100',
        data_root='data/cifar100',
        split='test',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIPZeroShot',
    vision_backbone=dict(
        type='VisionTransformer',
        arch='large',
        img_size=224,
        patch_size=14,
        drop_rate=0.,
        layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
        pre_norm=True,
    ),
    projection=dict(type='CLIPProjection', in_channels=1024, out_channels=768),
    text_backbone=dict(
        type='CLIPTransformer',
        width=768,
        layers=12,
        heads=12,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-large-patch14',
        use_fast=False),
    vocab_size=49408,
    transformer_width=768,
    proj_dim=768,
    text_prototype='cifar100',
    text_prompt='openai_cifar100',
    context_length=77,
)
@@ -0,0 +1,69 @@
_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=True,
)

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='ImageNet',
        data_root='data/imagenet',
        split='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()

# model settings
model = dict(
    type='CLIPZeroShot',
    vision_backbone=dict(
        type='VisionTransformer',
        arch='large',
        img_size=224,
        patch_size=14,
        drop_rate=0.,
        layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
        pre_norm=True,
    ),
    projection=dict(type='CLIPProjection', in_channels=1024, out_channels=768),
    text_backbone=dict(
        type='CLIPTransformer',
        width=768,
        layers=12,
        heads=12,
        attn_mask=True,
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-large-patch14',
        use_fast=False),
    vocab_size=49408,
    transformer_width=768,
    proj_dim=768,
    text_prototype='imagenet',
    text_prompt='openai_imagenet_sub',  # openai_imagenet, openai_imagenet_sub
    context_length=77,
)
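The tokenizer settings are shared across all four configs. A small sketch (assuming the transformers package is installed and the Hugging Face hub is reachable, neither of which is part of this diff) illustrates where the vocab_size and context_length values come from:

# Sketch only: confirm that the Hugging Face CLIP tokenizer matches the
# vocab_size=49408 and context_length=77 used in the configs above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-large-patch14', use_fast=False)
encoded = tokenizer('a photo of a dog', padding='max_length', max_length=77, truncation=True)
print(tokenizer.vocab_size)       # 49408
print(len(encoded['input_ids']))  # 77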