diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index 6fb3cf18b7..d944acbad0 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -46,7 +46,8 @@ |config | resolution | gpus | backbone| pretrain | top1 acc (efficient/accurate)| top5 acc (efficient/accurate)| reference top1 acc (efficient/accurate)| reference top5 acc (efficient/accurate)| gpu_mem(M) | ckpt | log| json| |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:| -|[tsm_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet|45.46 / 47.21|74.71 / 76.09|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20200616-3417f361.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20200616_022852.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20200616_022852.log.json)| +|[tsm_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet| 45.58 / 47.70|75.02 / 76.12|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20210203-01dce462.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log)| 
[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20210203_150227.log.json)| +|[tsm_r50_flip_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py) |height 100|8| ResNet50 | ImageNet| 47.10 / 48.51|76.02 / 77.56|[45.50 / 47.33](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[74.34 / 76.60](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)| 7077| [ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/tsm_r50_flip_1x1x8_50e_sthv1_rgb_20210203-12596f16.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log)| [json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb/20210203_145829.log.json)| |[tsm_r50_1x1x16_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py)|height 100|8| ResNet50 | ImageNet|47.62 / 49.28|76.63 / 77.82|[47.05 / 48.61](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[76.40 / 77.96](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|10390|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/tsm_r50_1x1x16_50e_sthv1_rgb_20201010-17fa49f6.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/20201010_221240.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb/20201010_221240.log.json)| |[tsm_r101_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py)|height 100|8| ResNet50 | ImageNet|45.72 / 48.43|74.67 / 76.72|[46.64 / 48.13](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[75.40 
/ 77.31](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|9800|[ckpt](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/tsm_r101_1x1x8_50e_sthv1_rgb_20201010-43fedf2e.pth)|[log](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/20201010_224055.log)|[json](https://download.openmmlab.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb/20201010_224055.log.json)| diff --git a/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py b/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py new file mode 100644 index 0000000000..00f40cbd58 --- /dev/null +++ b/configs/recognition/tsm/tsm_r50_flip_1x1x8_50e_sthv1_rgb.py @@ -0,0 +1,97 @@ +_base_ = [ + '../../_base_/models/tsm_r50.py', '../../_base_/schedules/sgd_tsm_50e.py', + '../../_base_/default_runtime.py' +] + +# model settings +model = dict(cls_head=dict(num_classes=174)) + +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/sthv1/rawframes' +data_root_val = 'data/sthv1/rawframes' +ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' +ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' +ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) + +train_pipeline = [ + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + 
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + filename_tmpl='{:05}.jpg', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + filename_tmpl='{:05}.jpg', + pipeline=test_pipeline)) +evaluation = dict( + interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) + +# optimizer +optimizer = dict(weight_decay=0.0005) + +# runtime settings +work_dir = './work_dirs/tsm_r50_flip_1x1x8_50e_sthv1_rgb/' diff --git a/docs/changelog.md b/docs/changelog.md index 0411e77c28..ca69c142cc 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,6 +9,7 @@ **New Features** - Support TSM-MobileNetV2 ([#415](https://github.com/open-mmlab/mmaction2/pull/415)) +- Support flip with label mapping 
([#591](https://github.com/open-mmlab/mmaction2/pull/591)) **ModelZoo** diff --git a/docs/tutorials/4_data_pipeline.md b/docs/tutorials/4_data_pipeline.md index 5475049c73..e7c6579528 100644 --- a/docs/tutorials/4_data_pipeline.md +++ b/docs/tutorials/4_data_pipeline.md @@ -177,7 +177,7 @@ For each operation, we list the related dict fields that are added/updated/remov `Flip` - add: flip, flip_direction -- update: imgs +- update: imgs, label `Normalize` diff --git a/mmaction/datasets/pipelines/augmentations.py b/mmaction/datasets/pipelines/augmentations.py index 4320846ba7..3a6088989b 100644 --- a/mmaction/datasets/pipelines/augmentations.py +++ b/mmaction/datasets/pipelines/augmentations.py @@ -1091,16 +1091,23 @@ class Flip: flip_ratio (float): Probability of implementing flip. Default: 0.5. direction (str): Flip imgs horizontally or vertically. Options are "horizontal" | "vertical". Default: "horizontal". + flip_label_map (Dict[int, int] | None): Map the label of a flipped + sample; labels absent from the map are kept. Default: None. lazy (bool): Determine whether to apply lazy operation. Default: False. """ _directions = ['horizontal', 'vertical'] - def __init__(self, flip_ratio=0.5, direction='horizontal', lazy=False): + def __init__(self, + flip_ratio=0.5, + direction='horizontal', + flip_label_map=None, + lazy=False): if direction not in self._directions: raise ValueError(f'Direction {direction} is not supported. 
' f'Currently support ones are {self._directions}') self.flip_ratio = flip_ratio self.direction = direction + self.flip_label_map = flip_label_map self.lazy = lazy def __call__(self, results): @@ -1120,6 +1127,10 @@ def __call__(self, results): results['flip'] = flip results['flip_direction'] = self.direction + if self.flip_label_map is not None and flip: + results['label'] = self.flip_label_map.get(results['label'], + results['label']) + if not self.lazy: if flip: for i, img in enumerate(results['imgs']): @@ -1151,7 +1162,7 @@ def __repr__(self): repr_str = ( f'{self.__class__.__name__}(' f'flip_ratio={self.flip_ratio}, direction={self.direction}, ' - f'lazy={self.lazy})') + f'flip_label_map={self.flip_label_map}, lazy={self.lazy})') return repr_str diff --git a/tests/test_data/test_pipelines/test_augmentations/test_flip.py b/tests/test_data/test_pipelines/test_augmentations/test_flip.py index 5ca88c6655..6f25002bc0 100644 --- a/tests/test_data/test_pipelines/test_augmentations/test_flip.py +++ b/tests/test_data/test_pipelines/test_augmentations/test_flip.py @@ -78,4 +78,26 @@ def test_flip(self): assert repr(flip) == (f'{flip.__class__.__name__}' f'(flip_ratio={1}, direction=vertical, ' - f'lazy={False})') + f'flip_label_map={None}, lazy={False})') + + # transform label for the flipped image with the specific label. + _flip_label_map = {4: 6} + imgs = list(np.random.rand(2, 64, 64, 3)) + + # the label should be mapped. + results = dict(imgs=copy.deepcopy(imgs), modality='RGB', label=4) + flip = Flip( + flip_ratio=1, + direction='horizontal', + flip_label_map=_flip_label_map) + flip_results = flip(results) + assert results['label'] == 6 + + # the label should not be mapped. 
+ results = dict(imgs=copy.deepcopy(imgs), modality='RGB', label=3) + flip = Flip( + flip_ratio=1, + direction='horizontal', + flip_label_map=_flip_label_map) + flip_results = flip(results) + assert results['label'] == 3 diff --git a/tests/test_data/test_pipelines/test_augmentations/test_lazy.py b/tests/test_data/test_pipelines/test_augmentations/test_lazy.py index 3315b6349a..b021e3e0bf 100644 --- a/tests/test_data/test_pipelines/test_augmentations/test_lazy.py +++ b/tests/test_data/test_pipelines/test_augmentations/test_lazy.py @@ -339,7 +339,7 @@ def test_flip_lazy(self): assert repr(flip) == (f'{flip.__class__.__name__}' f'(flip_ratio={1}, direction=vertical, ' - f'lazy={True})') + f'flip_label_map={None}, lazy={True})') def test_center_crop_lazy(self): with pytest.raises(TypeError):