diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md
index bed9b3c96e..932adb78b5 100644
--- a/configs/recognition/slowonly/README.md
+++ b/configs/recognition/slowonly/README.md
@@ -45,6 +45,20 @@ In data benchmark, we compare two different data preprocessing methods: (1) Resi
 | [slowonly_r101_8x8x1_196e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r101_8x8x1_196e_kinetics400_rgb.py) | x | ResNet101 | None | :x: | 76.5 | 92.7 | x | x | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_20200926-0c730aef.pth) | x | x |
 | x | x | ResNet101 | None | :heavy_check_mark: | 80.4 | 94.4 | x | x | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/omni/slowonly_r101_omni_8x8x1_kinetics400_rgb_20200926-b5dbb701.pth) | x | x |
+
+### Kinetics-600
+
+| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [slowonly_r50_video_8x8x1_256e_kinetics600_rgb](/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py) | short-side 256 | 8x4 | ResNet50 | None | 77.5 | 93.7 | 2.3 (80x3 frames) | 8478 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb/slowonly_r50_video_8x8x1_256e_kinetics600_rgb_20201015-81e5153e.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb/slowonly_r50_video_8x8x1_256e_kinetics600_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb/slowonly_r50_video_8x8x1_256e_kinetics600_rgb_20201015.json) |
+
+### Kinetics-700
+
+| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [slowonly_r50_video_8x8x1_256e_kinetics700_rgb](/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py) | short-side 256 | 8x4 | ResNet50 | None | 65.0 | 86.1 | 2.3 (80x3 frames) | 8478 | [ckpt](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb/slowonly_r50_video_8x8x1_256e_kinetics700_rgb_20201015-9250f662.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb/slowonly_r50_video_8x8x1_256e_kinetics700_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb/slowonly_r50_video_8x8x1_256e_kinetics700_rgb_20201015.json) |
+
 Notes:
 
 1. The **gpus** indicates the number of gpus we used to get the checkpoint.
 Note that the configs we provide are intended for 8 gpus by default.
diff --git a/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py b/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py
new file mode 100644
index 0000000000..4f068c824a
--- /dev/null
+++ b/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics600_rgb.py
@@ -0,0 +1,114 @@
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained=None,
+        lateral=False,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    cls_head=dict(
+        type='I3DHead',
+        in_channels=2048,
+        num_classes=600,
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips='prob')
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics600/videos_train'
+data_root_val = 'data/kinetics600/videos_val'
+ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt'
+ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt'
+ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
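+# 8x8x1 in the config name stands for clip_len=8, frame_interval=8,
+# num_clips=1 (see the SampleFrames step in the pipelines below)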
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=1,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=10,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
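+# at test time, 10 clips of 8 frames x 3 spatial crops (ThreeCrop) are
+# aggregated per video, i.e. the 80x3 frames reported in the README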
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.15, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0)
+total_epochs = 256
+checkpoint_config = dict(interval=4)
+workflow = [('train', 1)]
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/slowonly_r50_video_8x8x1_256e_kinetics600_rgb'
+load_from = None
+resume_from = None
+find_unused_parameters = False
diff --git a/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py b/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py
new file mode 100644
index 0000000000..396bdb8ebc
--- /dev/null
+++ b/configs/recognition/slowonly/slowonly_r50_video_8x8x1_256e_kinetics700_rgb.py
@@ -0,0 +1,114 @@
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained=None,
+        lateral=False,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    cls_head=dict(
+        type='I3DHead',
+        in_channels=2048,
+        num_classes=700,
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips='prob')
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics700/videos_train'
+data_root_val = 'data/kinetics700/videos_val'
+ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt'
+ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt'
+ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=1,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=10,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.15, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
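+# if you train with a different number of GPUs or videos_per_gpu, scale the
+# lr linearly with the total batch size (linear scaling rule)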
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0)
+total_epochs = 256
+checkpoint_config = dict(interval=4)
+workflow = [('train', 1)]
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/slowonly_r50_video_8x8x1_256e_kinetics700_rgb'
+load_from = None
+resume_from = None
+find_unused_parameters = False
diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md
index fe95774dfe..786587bc15 100644
--- a/configs/recognition/tsn/README.md
+++ b/configs/recognition/tsn/README.md
@@ -74,6 +74,18 @@ In data benchmark, we compare:
 [1] We obtain the pre-trained model from [torch-hub](https://pytorch.org/hub/facebookresearch_semi-supervised-ImageNet1K-models_resnext/), the pretrain model we used is `resnet50_swsl`
 
+### Kinetics-600
+
+| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [tsn_r50_video_1x1x8_100e_kinetics600_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py) | short-side 256 | 8x2 | ResNet50 | ImageNet | 74.8 | 92.3 | 11.1 (25x3 frames) | 8344 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015-4db3c461.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb/tsn_r50_video_1x1x8_100e_kinetics600_rgb_20201015.json) |
+
+### Kinetics-700
+
+| config | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log | json |
+| :----------------------------------------------------------- | :------------: | :--: | :------: | :------: | :------: | :------: | :---------------------: | :--------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [tsn_r50_video_1x1x8_100e_kinetics700_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb.py) | short-side 256 | 8x2 | ResNet50 | ImageNet | 61.7 | 83.6 | 11.1 (25x3 frames) | 8344 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015-e381a6c7.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015.log) | [json](https://download.openmmlab.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb/tsn_r50_video_1x1x8_100e_kinetics700_rgb_20201015.json) |
+
 ### Something-Something V1
 
 |config|resolution | gpus| backbone |pretrain| top1 acc| top5 acc | reference top1 acc | reference top5 acc | gpu_mem(M) | ckpt | log| json|
diff --git a/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py
new file mode 100644
index 0000000000..5a7e810384
--- /dev/null
+++ b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics600_rgb.py
@@ -0,0 +1,116 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=600,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics600/videos_train'
+data_root_val = 'data/kinetics600/videos_val'
+ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt'
+ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt'
+ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
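+# 1x1x8 in the config name stands for clip_len=1, frame_interval=1,
+# num_clips=8: 8 single-frame segments sampled uniformly across each video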
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
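+# at test time, 25 uniformly sampled frames x 3 spatial crops (ThreeCrop) are
+# used per video, i.e. the 25x3 frames reported in the README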
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.00375, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
+total_epochs = 100
+checkpoint_config = dict(interval=5)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/tsn_r50_video_1x1x8_100e_kinetics600_rgb/'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb.py b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb.py
new file mode 100644
index 0000000000..a49632ea9c
--- /dev/null
+++ b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics700_rgb.py
@@ -0,0 +1,116 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=700,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics700/videos_train'
+data_root_val = 'data/kinetics700/videos_val'
+ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt'
+ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt'
+ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+    dict(type='DecordDecode'),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=8,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=12,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.00375, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
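+# the step policy decays the lr at epochs 40 and 80; since gamma is not set
+# here, the decay factor falls back to mmcv's default of 0.1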
+total_epochs = 100
+checkpoint_config = dict(interval=5)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/tsn_r50_video_1x1x8_100e_kinetics700_rgb/'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py b/configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py
index 2e5643d070..5f9d40e50e 100644
--- a/configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py
+++ b/configs/recognition/tsn/tsn_r50_video_320p_1x1x3_100e_kinetics400_rgb.py
@@ -28,12 +28,7 @@
     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
 train_pipeline = [
     dict(type='DecordInit'),
-    dict(
-        type='SampleFrames',
-        clip_len=1,
-        frame_interval=1,
-        num_clips=3,
-        start_index=0),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
     dict(type='DecordDecode'),
     dict(type='RandomResizedCrop'),
     dict(type='Resize', scale=(224, 224), keep_ratio=False),
@@ -50,7 +45,6 @@
         clip_len=1,
         frame_interval=1,
         num_clips=3,
-        start_index=0,
         test_mode=True),
     dict(type='DecordDecode'),
     dict(type='Resize', scale=(-1, 256)),
@@ -68,7 +62,6 @@
         clip_len=1,
         frame_interval=1,
         num_clips=25,
-        start_index=0,
         test_mode=True),
     dict(type='DecordDecode'),
     dict(type='Resize', scale=(-1, 256)),
diff --git a/docs/changelog.md b/docs/changelog.md
index d0fe20a249..d61da7604a 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -8,6 +8,10 @@
 **Bug Fixes**
 - Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245))
 - Fix the invalid config url in slowonly README data benchmark ([#249](https://github.com/open-mmlab/mmaction2/pull/249))
+- Validate that the performance of models trained with videos has no significant difference compared to that of models trained with rawframes ([#256](https://github.com/open-mmlab/mmaction2/pull/256))
+
+**ModelZoo**
+- Add baselines for Kinetics-600 and Kinetics-700, including TSN-R50-8seg and SlowOnly-R50-8x8 ([#259](https://github.com/open-mmlab/mmaction2/pull/259))
 
 ### v0.7.0 (30/9/2020)