diff --git a/configs/_base_/models/csn_ig65m_pretrained.py b/configs/_base_/models/ircsn_r152.py similarity index 76% rename from configs/_base_/models/csn_ig65m_pretrained.py rename to configs/_base_/models/ircsn_r152.py index 2e827026d5..fcab416cbd 100644 --- a/configs/_base_/models/csn_ig65m_pretrained.py +++ b/configs/_base_/models/ircsn_r152.py @@ -4,8 +4,7 @@ backbone=dict( type='ResNet3dCSN', pretrained2d=False, - pretrained= # noqa: E251 - 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth', # noqa: E501 + pretrained=None, depth=152, with_pool2=False, bottleneck_mode='ir', diff --git a/configs/recognition/csn/README.md b/configs/recognition/csn/README.md index 2c8322fcc1..a347de5f4b 100644 --- a/configs/recognition/csn/README.md +++ b/configs/recognition/csn/README.md @@ -33,8 +33,14 @@ doi = {10.1109/ICCV.2019.00565} |config | resolution | gpus | backbone |pretrain| top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json| |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:| +|[ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb](/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb.py)|short-side 320|x| ResNet50 | IG65M | 79.0 | 94.2 | x | x | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_ig65m_pretrained_r50_32x2x1_58e_kinetics400_rgb_20210617-86d33018.pth) | x | x | +|[ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb](/configs/recognition/csn/ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py)|short-side 320|x| ResNet152 | None | 76.5 | 92.1 | x | x | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-5c933ae1.pth) | x | x | +|[ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb](/configs/recognition/csn/ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py)|short-side 320|x| ResNet152 | Sports1M | 78.2 | 93.0 | x | x | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-b9b10241.pth) | x | x | +|[ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py](/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py)|short-side 320|8x4| ResNet152 | IG65M|82.76/82.6|95.68/95.3|x|8516|[ckpt](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb_20200812-9037a758.pth)/[infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-e63ee1bd.pth)|[log](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/20200809_053132.log)|[json](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/20200809_053132.log.json)| +|[ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb](/configs/recognition/csn/ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py)|short-side 320|x| ResNet152 | None | 77.8 | 92.8 | x | x | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-d565828d.pth) | x | x | +|[ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb](/configs/recognition/csn/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py)|short-side 320|x| ResNet152 | Sports1M | 78.8 | 93.5 | x | x | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-3367437a.pth) | x | x | +|[ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb](/configs/recognition/csn/ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py)|short-side 320|x| ResNet152 | IG65M | 82.5 | 95.3 | x | x | [infer_ckpt](https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-c3be9793.pth) | x | x | |[ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py](/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py)|short-side 320|8x4| ResNet152 | IG65M|80.14|94.93|x|8517|[ckpt](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20200803-fc66ce8d.pth)|[log](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb/20200728_031952.log)|[json](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb/20200728_031952.log.json)| -|[ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py](/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py)|short-side 320|8x4| ResNet152 | IG65M|82.76|95.68|x|8516|[ckpt](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb_20200812-9037a758.pth)|[log](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/20200809_053132.log)|[json](https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/20200809_053132.log.json)| Notes: @@ -44,6 +50,7 @@ Notes: 2. The **inference_time** is got by this [benchmark script](/tools/analysis/benchmark.py), where we use the sampling frames strategy of the test setting and only care about the model inference time, not including the IO time and pre-processing time. For each setting, we use 1 gpu and set batch size (videos per gpu) to 1 to calculate the inference time. 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. +4. The **infer_ckpt** means those checkpoints are ported from [VMZ](https://github.com/facebookresearch/VMZ). For more details on data preparation, you can refer to Kinetics400 in [Data Preparation](/docs/data_preparation.md). diff --git a/configs/recognition/csn/ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py b/configs/recognition/csn/ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py new file mode 100644 index 0000000000..8d352419d8 --- /dev/null +++ b/configs/recognition/csn/ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py @@ -0,0 +1,92 @@ +_base_ = [ + './ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, bn_frozen=True, bottleneck_mode='ip', pretrained=None)) + +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=test_pipeline)) + +optimizer = dict( + type='SGD', lr=0.08, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=40) +total_epochs = 180 + +work_dir = './work_dirs/ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb' # noqa: E501 diff --git a/configs/recognition/csn/ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py new file mode 100644 index 0000000000..7aed801a62 --- /dev/null +++ b/configs/recognition/csn/ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py @@ -0,0 +1,15 @@ +_base_ = [ + './ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ip', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ipcsn_from_scratch_r152_ig65m_20210617-c4b99d38.pth' # noqa: E501 + )) + +work_dir = './work_dirs/ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb' # noqa: E501 diff --git a/configs/recognition/csn/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py new file mode 100644 index 0000000000..0cc11366ba --- /dev/null +++ b/configs/recognition/csn/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py @@ -0,0 +1,15 @@ +_base_ = [ + './ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ip', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ipcsn_from_scratch_r152_sports1m_20210617-7a7cc5b9.pth' # noqa: E501 + )) + +work_dir = './work_dirs/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb' # noqa: E501 diff --git a/configs/recognition/csn/ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py new file mode 100644 index 0000000000..ecc41f1451 --- /dev/null +++ b/configs/recognition/csn/ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py @@ -0,0 +1,92 @@ +_base_ = [ + './ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, bn_frozen=True, bottleneck_mode='ir', pretrained=None)) + +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=test_pipeline)) + +optimizer = dict( + type='SGD', lr=0.08, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=40) +total_epochs = 180 + +work_dir = './work_dirs/ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb' # noqa: E501 diff --git a/configs/recognition/csn/ircsn_bnfrozen_r50_32x2x1_180e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_bnfrozen_r50_32x2x1_180e_kinetics400_rgb.py new file mode 100644 index 0000000000..7e3bab7f59 --- /dev/null +++ b/configs/recognition/csn/ircsn_bnfrozen_r50_32x2x1_180e_kinetics400_rgb.py @@ -0,0 +1,96 @@ +_base_ = [ + './ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py' +] + +# model settings +model = dict( + backbone=dict( + depth=50, + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ir', + pretrained=None)) + +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=test_pipeline)) + +optimizer = dict( + type='SGD', lr=0.08, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=40) +total_epochs = 180 + +work_dir = './work_dirs/ircsn_bnfrozen_r50_32x2x1_180e_kinetics400_rgb' # noqa: E501 diff --git a/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py index a995cb5ca6..db97c917f5 100644 --- a/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py +++ b/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py @@ -1,7 +1,15 @@ -_base_ = ['../../_base_/models/csn_ig65m_pretrained.py'] +_base_ = [ + '../../_base_/models/ircsn_r152.py', '../../_base_/default_runtime.py' +] # model settings -model = dict(backbone=dict(norm_eval=True, bn_frozen=True)) +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth' # noqa: E501 + )) # dataset settings dataset_type = 'RawframeDataset' data_root = 'data/kinetics400/rawframes_train' @@ -33,7 +41,6 @@ dict(type='FrameSelector'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), @@ -49,7 +56,6 @@ dict(type='FrameSelector'), dict(type='Resize', scale=(-1, 256)), dict(type='ThreeCrop', crop_size=256), - dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), @@ -73,6 +79,9 @@ ann_file=ann_file_val, data_prefix=data_root_val, pipeline=test_pipeline)) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) + # optimizer optimizer = dict( type='SGD', lr=0.000125, momentum=0.9, @@ -87,18 +96,6 @@ warmup_by_epoch=True, warmup_iters=16) total_epochs = 58 -checkpoint_config = dict(interval=2) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) -log_config = dict( - interval=20, - hooks=[dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook')]) -# runtime settings -dist_params = dict(backend='nccl') -log_level = 'INFO' + work_dir = './work_dirs/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb' # noqa: E501 -load_from = None -resume_from = None -workflow = [('train', 1)] find_unused_parameters = True diff --git a/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb.py new file mode 100644 index 0000000000..d6110a4a83 --- /dev/null +++ b/configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb.py @@ -0,0 +1,102 @@ +_base_ = [ + '../../_base_/models/ircsn_r152.py', '../../_base_/default_runtime.py' +] + +# model settings +model = dict( + backbone=dict( + depth=50, + norm_eval=True, + bn_frozen=True, + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth' # noqa: E501 + )) +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='FrameSelector'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=3, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=test_pipeline)) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict( + type='SGD', lr=0.000125, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + step=[32, 48], + warmup='linear', + warmup_ratio=0.1, + warmup_by_epoch=True, + warmup_iters=16) +total_epochs = 58 + +work_dir = './work_dirs/ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb' # noqa: E501 +find_unused_parameters = True diff --git a/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py index fe41038755..67b371233f 100644 --- a/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py +++ b/configs/recognition/csn/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb.py @@ -1,4 +1,13 @@ -_base_ = ['../../_base_/models/csn_ig65m_pretrained.py'] +_base_ = [ + '../../_base_/models/ircsn_r152.py', '../../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_ig65m_20200807-771c4135.pth' # noqa: E501 + )) + # dataset settings dataset_type = 'RawframeDataset' data_root = 'data/kinetics400/rawframes_train' @@ -30,7 +39,6 @@ dict(type='FrameSelector'), dict(type='Resize', scale=(-1, 256)), dict(type='CenterCrop', crop_size=224), - dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), @@ -46,7 +54,6 @@ dict(type='FrameSelector'), dict(type='Resize', scale=(-1, 256)), dict(type='ThreeCrop', crop_size=256), - dict(type='Flip', flip_ratio=0), dict(type='Normalize', **img_norm_cfg), dict(type='FormatShape', input_format='NCTHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), @@ -70,6 +77,9 @@ ann_file=ann_file_val, data_prefix=data_root_val, pipeline=test_pipeline)) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) + # optimizer optimizer = dict( type='SGD', lr=0.000125, momentum=0.9, @@ -84,17 +94,6 @@ warmup_by_epoch=True, warmup_iters=16) total_epochs = 58 -checkpoint_config = dict(interval=2) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) -log_config = dict( - interval=20, - hooks=[dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook')]) -# runtime settings -dist_params = dict(backend='nccl') -log_level = 'INFO' + work_dir = './work_dirs/ircsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb' -load_from = None -resume_from = None -workflow = [('train', 1)] +find_unused_parameters = True diff --git a/configs/recognition/csn/ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py b/configs/recognition/csn/ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py new file mode 100644 index 0000000000..d0803f68ab --- /dev/null +++ b/configs/recognition/csn/ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py @@ -0,0 +1,15 @@ +_base_ = [ + './ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py' +] + +# model settings +model = dict( + backbone=dict( + norm_eval=True, + bn_frozen=True, + bottleneck_mode='ir', + pretrained= # noqa: E251 + 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r152_sports1m_20210617-bcc9c0dd.pth' # noqa: E501 + )) + +work_dir = './work_dirs/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb' # noqa: E501 diff --git a/configs/recognition/csn/metafile.yml b/configs/recognition/csn/metafile.yml index 9dd6136cc2..31dd84f33f 100644 --- a/configs/recognition/csn/metafile.yml +++ b/configs/recognition/csn/metafile.yml @@ -46,3 +46,122 @@ Models: Training Json Log: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/20200809_053132.log.json Training Log: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/20200809_053132.log Weights: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb/ircsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb_20200812-9037a758.pth +- Config: configs/recognition/csn/ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 180 + FLOPs: 110337228800 + Parameters: 33016592 + Pretrained: None + Resolution: short-side 320 + Training Data: Kinetics-400 + Modality: RGB + Name: ipcsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py + Results: + - Dataset: Kinetics-400 + Metrics: + top1 acc: 77.8 + top5 acc: 92.8 + Task: Action Recognition + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-d565828d.pth +- Config: configs/recognition/csn/ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 58 + FLOPs: 110337228800 + Parameters: 33016592 + Pretrained: IG65M + Resolution: short-side 320 + Training Data: Kinetics-400 + Modality: RGB + Name: ipcsn_ig65m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py + Results: + - Dataset: Kinetics-400 + Metrics: + top1 acc: 82.5 + top5 acc: 95.3 + Task: Action Recognition + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_ig65m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-c3be9793.pth + inference_time(video/s): x +- Config: configs/recognition/csn/ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 58 + FLOPs: 110337228800 + Parameters: 33016592 + Pretrained: Sports1M + Resolution: short-side 320 + Training Data: Kinetics-400 + Modality: RGB + Name: ipcsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py + Results: + - Dataset: Kinetics-400 + Metrics: + top1 acc: 78.8 + top5 acc: 93.5 + Task: Action Recognition + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ipcsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-3367437a.pth + inference_time(video/s): x +- Config: configs/recognition/csn/ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 180 + FLOPs: 98096676864 + Parameters: 29703568 + Pretrained: None + Resolution: short-side 320 + Training Data: Kinetics-400 + Modality: RGB + Name: ircsn_bnfrozen_r152_32x2x1_180e_kinetics400_rgb.py + Results: + - Dataset: Kinetics-400 + Metrics: + top1 acc: 76.5 + top5 acc: 92.1 + Task: Action Recognition + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_from_scratch_r152_32x2x1_180e_kinetics400_rgb_20210617-5c933ae1.pth + inference_time(video/s): x +- Config: configs/recognition/csn/ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet50 + Epochs: 58 + FLOPs: 56209211392 + Parameters: 13131152 + Pretrained: IG65M + Resolution: short-side 320 + Training Data: Kinetics-400 + Modality: RGB + Name: ircsn_ig65m_pretrained_bnfrozen_r50_32x2x1_58e_kinetics400_rgb.py + Results: + - Dataset: Kinetics-400 + Metrics: + top1 acc: 79.0 + top5 acc: 94.2 + Task: Action Recognition + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_ig65m_pretrained_r50_32x2x1_58e_kinetics400_rgb_20210617-86d33018.pth + inference_time(video/s): x +- Config: configs/recognition/csn/ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py + In Collection: CSN + Metadata: + Architecture: ResNet152 + Epochs: 58 + FLOPs: 98096676864 + Parameters: 29703568 + Pretrained: Sports1M + Resolution: short-side 320 + Training Data: Kinetics-400 + Modality: RGB + Name: ircsn_sports1m_pretrained_bnfrozen_r152_32x2x1_58e_kinetics400_rgb.py + Results: + - Dataset: Kinetics-400 + Metrics: + top1 acc: 78.2 + top5 acc: 93.0 + Task: Action Recognition + Weights: https://download.openmmlab.com/mmaction/recognition/csn/vmz/vmz_ircsn_sports1m_pretrained_r152_32x2x1_58e_kinetics400_rgb_20210617-b9b10241.pth + inference_time(video/s): x diff --git a/mmaction/models/backbones/resnet3d_csn.py b/mmaction/models/backbones/resnet3d_csn.py index 5d041d5450..aa190a2888 100644 --- a/mmaction/models/backbones/resnet3d_csn.py +++ b/mmaction/models/backbones/resnet3d_csn.py @@ -43,7 +43,15 @@ def __init__(self, conv2 = [] if self.bottleneck_mode == 'ip': conv2.append( - nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False)) + ConvModule( + planes, + planes, + 1, + stride=1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None)) conv2_kernel_size = self.conv2.conv.kernel_size conv2_stride = self.conv2.conv.stride conv2_padding = self.conv2.conv.padding