Fix code style according to review.
Minor.

Fix bugs in unit tests.
xusu committed Nov 25, 2020
1 parent 9ed8287 commit 8b4b529
Showing 12 changed files with 53 additions and 49 deletions.
2 changes: 1 addition & 1 deletion configs/recognition_audio/audioonly/README.md
@@ -16,7 +16,7 @@

|config | n_fft | gpus | backbone |pretrain| top1 acc/delta| top5 acc/delta | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
-|[audioonly_r50_64x1x1_100e_kinetics400_audio_feature](/configs/recognition_audio/avslowfast/audioonly_r50_64x1x1_100e_kinetics400_audio_feature.py)|1024|8| ResNet50 | None |20.37|37.37|x|6154|[ckpt]()|[log]()|[json]()|
+|[audioonly_r50_64x1x1_100e_kinetics400_audio_feature](/configs/recognition_audio/audioonly/audioonly_r50_64x1x1_100e_kinetics400_audio_feature.py)|1024|8| ResNet50 | None |20.37|37.37|x|6154|[ckpt]()|[log]()|[json]()|

Notes:

@@ -10,7 +10,7 @@
init_std=0.01))
# model training and testing settings
train_cfg = None
-test_cfg = dict(average_clips='score')
+test_cfg = dict(average_clips='prob')
# dataset settings
dataset_type = 'AudioFeatureRawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
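Note: `average_clips` controls how per-clip predictions are fused at test time. A minimal sketch of the assumed MMAction2 semantics — `'prob'` averages softmax probabilities across clips, `'score'` averages raw logits:

```python
import torch
import torch.nn.functional as F

def average_clips(scores, mode='prob'):
    """scores: [num_clips, num_classes] raw logits for one video."""
    if mode == 'prob':
        # softmax each clip first, then average the probabilities
        return F.softmax(scores, dim=1).mean(dim=0)
    # mode == 'score': average the raw logits directly
    return scores.mean(dim=0)

video_pred = average_clips(torch.randn(10, 400), mode='prob')  # -> [400]
```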
@@ -90,7 +90,7 @@
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
-work_dir = './work_dirs/tsn_resnet_r18_64x1x1_100e_kinetics400_audio_feature/'
+work_dir = './work_dirs/tsn_r18_64x1x1_100e_kinetics400_audio_feature/'
load_from = None
resume_from = None
workflow = [('train', 1)]
5 changes: 3 additions & 2 deletions mmaction/datasets/audio_feature_rawframe_dataset.py
@@ -6,12 +6,13 @@

@DATASETS.register_module
class AudioFeatureRawframeDataset(RawframeDataset):
"""Dataset that read both audio and visual, supporting both rawframes and
videos."""

def __init__(self, ann_file, pipeline, audio_prefix, **kwargs):
self.audio_prefix = audio_prefix
self.video_prefix = kwargs.pop('video_prefix', None)
-        super(AudioFeatureRawframeDataset,
-              self).__init__(ann_file, pipeline, **kwargs)
+        super().__init__(ann_file, pipeline, **kwargs)

def load_annotations(self):
video_infos = []
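Note: the two-argument `super(AudioFeatureRawframeDataset, self)` form is Python 2 legacy; in Python 3 the zero-argument form resolves the class and instance automatically. A small illustration:

```python
class Base:
    def __init__(self, ann_file, pipeline, **kwargs):
        self.ann_file = ann_file
        self.pipeline = pipeline

class Child(Base):
    def __init__(self, ann_file, pipeline, audio_prefix, **kwargs):
        self.audio_prefix = audio_prefix
        # Equivalent to super(Child, self).__init__(...), but it does not
        # repeat the class name, so renaming the class cannot break it.
        super().__init__(ann_file, pipeline, **kwargs)
```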
29 changes: 16 additions & 13 deletions mmaction/models/backbones/resnet3d_avslowfast.py
@@ -1,3 +1,4 @@
+import copy
import random

import torch
@@ -59,7 +60,7 @@ def __init__(self,
act_cfg=None)

self.lateral_connections = []
-        for i in range(len(self.stage_blocks)):
+        for i, _ in enumerate(self.stage_blocks):
planes = self.base_channels * 2**i
self.inplanes = planes * self.block.expansion

@@ -139,8 +140,7 @@ def build_pathway(cfg, *args, **kwargs):
pathway_type = cfg_.pop('type')
if pathway_type not in pathway_cfg:
raise KeyError(f'Unrecognized pathway type {pathway_type}')
-    else:
-        pathway_cls = pathway_cfg[pathway_type]
+    pathway_cls = pathway_cfg[pathway_type]
pathway = pathway_cls(*args, **kwargs, **cfg_)

return pathway
@@ -231,19 +231,22 @@ def __init__(self,
self.speed_ratio_audio = speed_ratio_audio
self.channel_ratio_audio = channel_ratio_audio
self.drop_out_ratio = drop_out_ratio

-        if slow_pathway['lateral']:
-            slow_pathway['speed_ratio'] = speed_ratio_fast
-            slow_pathway['channel_ratio'] = channel_ratio_fast
-        if audio_pathway['lateral']:
-            audio_pathway['speed_ratio'] = speed_ratio_audio
-            audio_pathway['channel_ratio'] = channel_ratio_audio
+        slow_pathway_ = copy.deepcopy(slow_pathway)
+        fast_pathway_ = copy.deepcopy(fast_pathway)
+        audio_pathway_ = copy.deepcopy(audio_pathway)
+
+        if slow_pathway_['lateral']:
+            slow_pathway_['speed_ratio'] = speed_ratio_fast
+            slow_pathway_['channel_ratio'] = channel_ratio_fast
+        if audio_pathway_['lateral']:
+            audio_pathway_['speed_ratio'] = speed_ratio_audio
+            audio_pathway_['channel_ratio'] = channel_ratio_audio
random.seed(100)
# set the random seed to avoid different
# graphs in distributed env
-        self.slow_path = build_pathway(slow_pathway)
-        self.fast_path = build_pathway(fast_pathway)
-        self.audio_path = build_pathway(audio_pathway)
+        self.slow_path = build_pathway(slow_pathway_)
+        self.fast_path = build_pathway(fast_pathway_)
+        self.audio_path = build_pathway(audio_pathway_)

def init_weights(self):
"""Initiate the parameters either from existing checkpoint or from
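Note: the `copy.deepcopy` calls matter because the constructor writes `speed_ratio`/`channel_ratio` keys into the pathway configs; without a copy it mutates the caller's dict, so reusing one config dict to build a second model silently changes behavior. A minimal sketch of the aliasing pitfall (hypothetical `build` function):

```python
import copy

cfg = dict(lateral=True)

def build(cfg, safe=True):
    if safe:
        cfg = copy.deepcopy(cfg)   # work on a private copy
    cfg['channel_ratio'] = 8       # constructor-style in-place edit
    return cfg

build(cfg, safe=True)
assert 'channel_ratio' not in cfg   # caller's dict untouched

build(cfg, safe=False)
assert cfg['channel_ratio'] == 8    # without the copy, the edit leaks out
```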
3 changes: 0 additions & 3 deletions mmaction/models/common/conv_audio.py
@@ -34,9 +34,6 @@ def __init__(self,
kernel_size = _pair(kernel_size)
stride = _pair(stride)
padding = _pair(padding)
-        padding = _pair(dilation)
-
-        assert len(kernel_size) == len(stride) == len(padding) == 2

self.in_channels = in_channels
self.out_channels = out_channels
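Note: `_pair` from `torch.nn.modules.utils` just duplicates scalars into 2-tuples, which is why the deleted `padding = _pair(dilation)` line silently clobbered the padding computed one line above it:

```python
from torch.nn.modules.utils import _pair

assert _pair(2) == (2, 2)        # scalar -> duplicated into a tuple
assert _pair((3, 1)) == (3, 1)   # tuples pass through unchanged
```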
8 changes: 4 additions & 4 deletions mmaction/models/heads/avslowfast_head.py
@@ -29,15 +29,15 @@ def forward(self, x):
Returns:
torch.Tensor: The classification scores for input samples.
"""
-        # ([N, channel_fast, T, H, W], [(N, channel_slow, T, H, W)])
+        # ([N, channel_fast, T, H, W], [(N, channel_slow, T, H, W)], [(N, channel_audio, T, F)])  # noqa:E501
x_slow, x_fast, x_audio = x
-        if len(x_audio.size()) == 4:
+        if x_audio.dim() == 4:
x_audio = x_audio.unsqueeze(4)
-        # ([N, channel_fast, 1, 1, 1], [N, channel_slow, 1, 1, 1])
+        # ([N, channel_fast, 1, 1, 1], [N, channel_slow, 1, 1, 1], [N, channel_audio, 1, 1, 1])  # noqa:E501
x_slow = self.avg_pool(x_slow)
x_fast = self.avg_pool(x_fast)
x_audio = self.avg_pool(x_audio)
-        # [N, channel_fast + channel_slow, 1, 1, 1]
+        # [N, channel_fast + channel_slow + channel_audio, 1, 1, 1]
x = torch.cat((x_slow, x_fast, x_audio), dim=1)

if self.dropout is not None:
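Note: the `x_audio.dim() == 4` check exists because audio features arrive as `[N, C, T, F]` with no width axis; adding a trailing singleton dimension lets the same 3-D average pool handle all three pathways. A small sketch, with shapes assumed from the backbone tests below:

```python
import torch
import torch.nn as nn

avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
x_audio = torch.rand(2, 1024, 8, 5)   # [N, C, T, F] -- only 4-D
if x_audio.dim() == 4:
    x_audio = x_audio.unsqueeze(4)    # -> [N, C, T, F, 1], now 5-D
assert avg_pool(x_audio).shape == torch.Size([2, 1024, 1, 1, 1])
```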
3 changes: 1 addition & 2 deletions mmaction/models/recognizers/audio_visual_recognizer.py
@@ -54,8 +54,7 @@ def forward_test(self, imgs, audios):
def forward(self, imgs, audios, label, return_loss=True):
if return_loss:
return self.forward_train(imgs, audios, label)
-        else:
-            return self.forward_test(imgs, audios)
+        return self.forward_test(imgs, audios)

def forward_gradcam(self, audios):
raise NotImplementedError
8 changes: 4 additions & 4 deletions tests/test_models/test_backbone.py
@@ -968,7 +968,7 @@ def test_resnet_audio_backbone():
audioonly.init_weights()
audioonly.train()
feat = audioonly(spec)
-    assert feat.shape == torch.size([1, 2048])
+    assert feat.shape == torch.Size([1, 1024, 8, 5])


def test_avslowfast_backbone():
@@ -983,9 +983,9 @@ def test_avslowfast_backbone():
avsf.train()
feat = avsf(imgs, spec)
assert isinstance(feat, tuple)
-    assert feat[0].shape == torch.size([1, 2048])
-    assert feat[1].shape == torch.size([1, 256])
-    assert feat[2].shape == torch.size([1, 1024])
+    assert feat[0].shape == torch.Size([1, 2048, 4, 1, 1])
+    assert feat[1].shape == torch.Size([1, 256, 32, 1, 1])
+    assert feat[2].shape == torch.Size([1, 1024, 8, 5])


@pytest.mark.skipif(
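Note: the old assertions could not even run — `torch.size` does not exist and raises `AttributeError`. `torch.Size` is the tuple subclass that `Tensor.shape` returns:

```python
import torch

t = torch.zeros(1, 1024, 8, 5)
assert t.shape == torch.Size([1, 1024, 8, 5])
assert t.shape == (1, 1024, 8, 5)   # torch.Size is a tuple subclass
```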
8 changes: 2 additions & 6 deletions tests/test_models/test_common_modules.py
@@ -23,13 +23,9 @@ def test_conv2plus1d():


def test_conv_audio():
-    with pytest.raises(AssertionError):
-        # Length of kernel size, stride and padding must be the same
-        ConvAudio(3, 8, (2, 2))
-
conv_audio = ConvAudio(3, 8, 2)
conv_audio.init_weights()

x = torch.rand(1, 3, 8, 8)
-    output = ConvAudio(x)
-    assert output.shape == torch.Size([1, 8, 8, 8])
+    output = conv_audio(x)
+    assert output.shape == torch.Size([1, 8, 9, 9])
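Note: the old line also called the class (`ConvAudio(x)`) rather than the instance (`conv_audio(x)`). The corrected `[1, 8, 9, 9]` shape follows from standard convolution arithmetic, assuming `ConvAudio` pads with `kernel_size // 2 = 1` for `kernel_size=2`:

```python
# out = floor((in + 2 * padding - kernel) / stride) + 1
in_size, padding, kernel, stride = 8, 1, 2, 1
out_size = (in_size + 2 * padding - kernel) // stride + 1
assert out_size == 9   # consistent with torch.Size([1, 8, 9, 9])
```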
12 changes: 7 additions & 5 deletions tests/test_models/test_head.py
@@ -221,7 +221,7 @@ def test_avslowfast_head():
avsf_head.init_weights()

assert avsf_head.num_classes == 4
-    assert avsf_head.dropout_ratio == 0.5
+    assert avsf_head.dropout_ratio == 0.8
assert avsf_head.in_channels == 5
assert avsf_head.init_std == 0.01
assert avsf_head.spatial_type == 'avg'
@@ -238,11 +238,13 @@ def test_avslowfast_head():

slow_shape = (8, 1, 4, 7, 7)
fast_shape = (8, 2, 4, 7, 7)
-    audio_shape = (8, 1, 7, 7)
+    audio_shape = (8, 2, 7, 7)

-    feat = tuple(
-        torch.rand(slow_shape), torch.rand(fast_shape),
-        torch.rand(audio_shape))
+    feat = tuple([
+        torch.rand(slow_shape),
+        torch.rand(fast_shape),
+        torch.rand(audio_shape)
+    ])

# tsn head inference
cls_scores = avsf_head(feat)
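Note: the original `feat = tuple(a, b, c)` raises `TypeError` because `tuple()` accepts at most one iterable argument; wrapping the tensors in a list (or using a tuple literal) fixes it:

```python
import torch

# tuple(torch.rand(1), torch.rand(2))  # TypeError: tuple expected at most 1 argument
feat = tuple([torch.rand(1), torch.rand(2)])   # list -> tuple: OK
feat = (torch.rand(1), torch.rand(2))          # tuple literal: simpler still
```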
20 changes: 13 additions & 7 deletions tests/test_models/test_recognizers.py
@@ -424,7 +424,7 @@ def test_tpn():

def test_audio_recognizer():
model, train_cfg, test_cfg = _get_audio_recognizer_cfg(
-        'resnet/tsn_resnet_r50_64x1x1_100e_kinetics400_audio_feature.py')
+        'resnet/tsn_r50_64x1x1_100e_kinetics400_audio_feature.py')
model['backbone']['pretrained'] = None

recognizer = build_recognizer(
@@ -454,11 +454,16 @@ def test_av_recognizer():
recognizer = build_recognizer(
model, train_cfg=train_cfg, test_cfg=test_cfg)

-    input_shape = (1, 3, 1, 128, 80)
-    demo_inputs_audio = generate_demo_inputs(input_shape, model_type='audio')
-    input_shape = (1, 3, 32, 16, 16)
-    demo_inputs_visual = generate_demo_inputs(input_shape)
-    demo_inputs = {**demo_inputs_audio, **demo_inputs_visual}
+    audio_shape = (1, 3, 1, 128, 80)
+    demo_inputs_audio = generate_demo_inputs(audio_shape, model_type='audio')
+    visual_shape = (1, 3, 3, 32, 16, 16)
+    demo_inputs_visual = generate_demo_inputs(visual_shape, model_type='3D')
+    assert torch.equal(demo_inputs_audio['gt_labels'],
+                       demo_inputs_visual['gt_labels'])
+    demo_inputs = dict(
+        imgs=demo_inputs_visual['imgs'],
+        audios=demo_inputs_audio['audios'],
+        gt_labels=demo_inputs_audio['gt_labels'])
imgs = demo_inputs['imgs']
audios = demo_inputs['audios']
gt_labels = demo_inputs['gt_labels']
@@ -530,6 +535,7 @@ def generate_demo_inputs(input_shape=(1, 3, 3, 224, 224), model_type='2D'):

inputs = {
'imgs': torch.FloatTensor(imgs),
-        'gt_labels': gt_labels
+        'gt_labels': gt_labels,
+        'audios': torch.FloatTensor(imgs),
}
return inputs
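Note: building `demo_inputs` explicitly via `dict(...)` (rather than `{**audio, **visual}`) avoids a silent-overwrite pitfall — both generated dicts carried `imgs` and `gt_labels` keys, and `**`-merging lets the right-hand dict win without warning:

```python
audio = dict(imgs='audio-shaped', gt_labels=1)
visual = dict(imgs='video-shaped', gt_labels=2)
merged = {**audio, **visual}
assert merged['gt_labels'] == 2   # audio's labels silently discarded
```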
