[Modelzoo] Add K600, K700 baselines (#259)

open-mmlab · Oct 17, 2020 · 6ec17b8 · 6ec17b8
1 parent 363fe9a
commit 6ec17b8
Show file tree

Hide file tree

Showing 8 changed files with 491 additions and 8 deletions.
diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md
@@ -45,6 +45,20 @@ In data benchmark, we compare two different data preprocessing methods: (1) Resi
 | [slowonly_r101_8x8x1_196e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r101_8x8x1_196e_kinetics400_rgb.py) |       x        | ResNet101 |   None   |        :x:         |   76.5   |   92.7   |            x            |     x      | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_20200926-0c730aef.pth) |                              x                               |                              x                               |
 |                              x                               |       x        | ResNet101 |   None   | :heavy_check_mark: |   80.4   |   94.4   |            x            |     x      | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth) |                              x                               |                              x                               |
 
+### Kinetics-600
+
+| config                                                       |   resolution   | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) |                             ckpt                             |                             log                              |                             json                             |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [slowonly_r50_video_8x8x1_256e_kinetics600_rgb](/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py) | short-side 256 | 8x4  | ResNet50 |   None   |   77.5   |   93.7   |    2.3 (80x3 frames)    |    8478    | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb/slowonly_r50_video_8x8x1_256e_kinetics600_rgb_20201015-81e5153e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb/slowonly_r50_video_8x8x1_256e_kinetics600_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb/slowonly_r50_video_8x8x1_256e_kinetics600_rgb_20201015.json) |
+
+### Kinetics-700
+
+| config                                                       |   resolution   | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) |                             ckpt                             |                             log                              |                             json                             |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [slowonly_r50_video_8x8x1_256e_kinetics700_rgb](/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py) | short-side 256 | 8x4  | ResNet50 |   None   |   65.0   |   86.1   |    2.3 (80x3 frames)    |    8478    | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb/slowonly_r50_video_8x8x1_256e_kinetics700_rgb_20201015-9250f662.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb/slowonly_r50_video_8x8x1_256e_kinetics700_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb/slowonly_r50_video_8x8x1_256e_kinetics700_rgb_20201015.json) |
+
+
+
 Notes:
 
 1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default.

diff --git a/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py b/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py
@@ -0,0 +1,114 @@
+model = dict(  # SlowOnly ResNet-50 recognizer with a 600-class head
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained=None,  # no pretrained weights (README: pretrain None)
+        lateral=False,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    cls_head=dict(
+        type='I3DHead',
+        in_channels=2048,
+        num_classes=600,  # Kinetics-600
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips='prob')  # presumably averages clip probs
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics600/videos_train'
+data_root_val = 'data/kinetics600/videos_val'  # also used for testing
+ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt'
+ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt'
+ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt'  # val list
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [  # "8x8x1": clip_len=8, frame_interval=8, num_clips=1
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),  # fixed 224x224
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [  # NOTE(review): no Resize step; assumes short-side-256 videos
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=1,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),  # presumably a no-op at ratio 0
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])  # 'label' is not converted
+]
+test_pipeline = [  # 10 clips x 8 frames x 3 crops ("80x3 frames" in README)
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=10,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='ThreeCrop', crop_size=256),  # NOTE(review): no Resize before
+    dict(type='Flip', flip_ratio=0),  # presumably a no-op at ratio 0
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])  # 'label' is not converted
+]
+data = dict(  # train / val / test dataset definitions
+    videos_per_gpu=12,  # presumably the per-GPU batch size
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,  # same file as ann_file_val
+        data_prefix=data_root_val,  # test reads the validation videos
+        pipeline=test_pipeline))
+# optimizer (lr assumes 8 GPUs x videos_per_gpu=12, a total of 96 videos)
+optimizer = dict(
+    type='SGD', lr=0.15, momentum=0.9,
+    weight_decay=0.0001)  # lr for 8 GPUs; rescale for other setups
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0)
+total_epochs = 256  # the "256e" in the config name
+checkpoint_config = dict(interval=4)  # NOTE(review): evaluation uses 5
+workflow = [('train', 1)]
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),  # optional, left disabled
+    ])
+dist_params = dict(backend='nccl')  # NCCL distributed backend
+log_level = 'INFO'
+work_dir = './work_dirs/slowonly_r50_video_8x8x1_256e_kinetics600_rgb'
+load_from = None
+resume_from = None
+find_unused_parameters = False
diff --git a/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py b/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py
@@ -0,0 +1,114 @@
+model = dict(  # SlowOnly ResNet-50 recognizer with a 700-class head
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained=None,  # no pretrained weights (README: pretrain None)
+        lateral=False,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    cls_head=dict(
+        type='I3DHead',
+        in_channels=2048,
+        num_classes=700,  # Kinetics-700
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips='prob')  # presumably averages clip probs
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics700/videos_train'
+data_root_val = 'data/kinetics700/videos_val'  # also used for testing
+ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt'
+ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt'
+ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'  # val list
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [  # "8x8x1": clip_len=8, frame_interval=8, num_clips=1
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),  # fixed 224x224
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [  # NOTE(review): no Resize step; assumes short-side-256 videos
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=1,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),  # presumably a no-op at ratio 0
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])  # 'label' is not converted
+]
+test_pipeline = [  # 10 clips x 8 frames x 3 crops ("80x3 frames" in README)
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=10,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='ThreeCrop', crop_size=256),  # NOTE(review): no Resize before
+    dict(type='Flip', flip_ratio=0),  # presumably a no-op at ratio 0
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])  # 'label' is not converted
+]
+data = dict(  # train / val / test dataset definitions
+    videos_per_gpu=12,  # presumably the per-GPU batch size
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,  # same file as ann_file_val
+        data_prefix=data_root_val,  # test reads the validation videos
+        pipeline=test_pipeline))
+# optimizer (lr assumes 8 GPUs x videos_per_gpu=12, a total of 96 videos)
+optimizer = dict(
+    type='SGD', lr=0.15, momentum=0.9,
+    weight_decay=0.0001)  # lr for 8 GPUs; rescale for other setups
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0)
+total_epochs = 256  # the "256e" in the config name
+checkpoint_config = dict(interval=4)  # NOTE(review): evaluation uses 5
+workflow = [('train', 1)]
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),  # optional, left disabled
+    ])
+dist_params = dict(backend='nccl')  # NCCL distributed backend
+log_level = 'INFO'
+work_dir = './work_dirs/slowonly_r50_video_8x8x1_256e_kinetics700_rgb'
+load_from = None
+resume_from = None
+find_unused_parameters = False
diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md
@@ -74,6 +74,18 @@ In data benchmark, we compare:
 
 [1] We obtain the pre-trained model from [torch-hub](https://pytorch.org/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext/), the pretrain model we used is `resnet50_swsl`
 
+### Kinetics-600
+
+| config                                                       |   resolution   | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) |                             ckpt                             |                             log                              |                             json                             |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [tsn_r50_video_1x1x8_100e_kinetics600_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py) | short-side 256 | 8x2  | ResNet50 | ImageNet |   74.8   |   92.3   |   11.1 (25x3 frames)    |    8344    | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015-4db3c461.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015.json) |
+
+### Kinetics-700
+
+| config                                                       |   resolution   | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) |                             ckpt                             |                             log                              |                             json                             |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [tsn_r50_video_1x1x8_100e_kinetics700_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb.py) | short-side 256 | 8x2  | ResNet50 | ImageNet |   61.7   |   83.6   |   11.1 (25x3 frames)    |    8344    | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015-e381a6c7.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015.json) |
+
 ### Something-Something V1
 
 |config|resolution | gpus| backbone |pretrain| top1 acc| top5 acc | reference top1 acc | reference top5 acc | gpu_mem(M) | ckpt | log| json|

diff --git a/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py
@@ -0,0 +1,116 @@
+# model settings: TSN recognizer, ResNet-50 backbone, 600-class head
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',  # README: pretrain ImageNet
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=600,  # Kinetics-600
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings (test reuses the validation videos and annotation list)
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics600/videos_train'
+data_root_val = 'data/kinetics600/videos_val'  # also used for testing
+ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt'
+ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt'
+ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt'  # val list
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [  # "1x1x8": clip_len=1, frame_interval=1, num_clips=8
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),  # fixed 224x224
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [  # same 8-clip sampling as training, in test_mode
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),  # presumably short side -> 256
+    dict(type='CenterCrop', crop_size=256),  # NOTE(review): train uses 224
+    dict(type='Flip', flip_ratio=0),  # presumably a no-op at ratio 0
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])  # 'label' is not converted
+]
+test_pipeline = [  # 25 clips x 1 frame x 3 crops ("25x3 frames" in README)
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),  # presumably short side -> 256
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),  # presumably a no-op at ratio 0
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])  # 'label' is not converted
+]
+data = dict(  # train / val / test dataset definitions
+    videos_per_gpu=12,  # presumably the per-GPU batch size
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,  # same file as ann_file_val
+        data_prefix=data_root_val,  # test reads the validation videos
+        pipeline=test_pipeline))
+# optimizer (lr assumes 8 GPUs x videos_per_gpu=12, a total of 96 videos)
+optimizer = dict(
+    type='SGD', lr=0.00375, momentum=0.9,
+    weight_decay=0.0001)  # lr for 8 GPUs; rescale for other setups
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
+total_epochs = 100  # the "100e" in the config name
+checkpoint_config = dict(interval=5)  # same interval as evaluation
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/tsn_r50_1x1x3_100e_kinetics600_rgb/'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]