Fix code style according to review.
Minor.

Fix bugs in unit tests.
xusu committed Nov 25, 2020
1 parent 9ed8287 commit 8b4b529
Showing 12 changed files with 53 additions and 49 deletions.
2 changes: 1 addition & 1 deletion configs/recognition_audio/audioonly/README.md
@@ -16,7 +16,7 @@

|config | n_fft | gpus | backbone |pretrain| top1 acc/delta| top5 acc/delta | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
-|[audioonly_r50_64x1x1_100e_kinetics400_audio_feature](/configs/recognition_audio/avslowfast/audioonly_r50_64x1x1_100e_kinetics400_audio_feature.py)|1024|8| ResNet50 | None |20.37|37.37|x|6154|[ckpt]()|[log]()|[json]()|
+|[audioonly_r50_64x1x1_100e_kinetics400_audio_feature](/configs/recognition_audio/audioonly/audioonly_r50_64x1x1_100e_kinetics400_audio_feature.py)|1024|8| ResNet50 | None |20.37|37.37|x|6154|[ckpt]()|[log]()|[json]()|

Notes:

@@ -10,7 +10,7 @@
init_std=0.01))
# model training and testing settings
train_cfg = None
-test_cfg = dict(average_clips='score')
+test_cfg = dict(average_clips='prob')
# dataset settings
dataset_type = 'AudioFeatureRawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
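Note: `average_clips` controls how per-clip predictions are fused at test time. A minimal sketch of the assumed MMAction2 semantics — `'prob'` averages softmax probabilities across clips, `'score'` averages raw logits:

```python
import torch
import torch.nn.functional as F

def average_clips(scores, mode='prob'):
    """scores: [num_clips, num_classes] raw logits for one video."""
    if mode == 'prob':
        # softmax each clip first, then average the probabilities
        return F.softmax(scores, dim=1).mean(dim=0)
    # mode == 'score': average the raw logits directly
    return scores.mean(dim=0)

video_pred = average_clips(torch.randn(10, 400), mode='prob')  # -> [400]
```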
@@ -90,7 +90,7 @@
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
-work_dir = './work_dirs/tsn_resnet_r18_64x1x1_100e_kinetics400_audio_feature/'
+work_dir = './work_dirs/tsn_r18_64x1x1_100e_kinetics400_audio_feature/'
load_from = None
resume_from = None
workflow = [('train', 1)]
5 changes: 3 additions & 2 deletions mmaction/datasets/audio_feature_rawframe_dataset.py
@@ -6,12 +6,13 @@

@DATASETS.register_module
class AudioFeatureRawframeDataset(RawframeDataset):
"""Dataset that read both audio and visual, supporting both rawframes and
videos."""

def __init__(self, ann_file, pipeline, audio_prefix, **kwargs):
self.audio_prefix = audio_prefix
self.video_prefix = kwargs.pop('video_prefix', None)
-        super(AudioFeatureRawframeDataset,
-              self).__init__(ann_file, pipeline, **kwargs)
+        super().__init__(ann_file, pipeline, **kwargs)

def load_annotations(self):
video_infos = []
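Note: the two-argument `super(AudioFeatureRawframeDataset, self)` form is Python 2 legacy; in Python 3 the zero-argument form resolves the class and instance automatically. A small illustration:

```python
class Base:
    def __init__(self, ann_file, pipeline, **kwargs):
        self.ann_file = ann_file
        self.pipeline = pipeline

class Child(Base):
    def __init__(self, ann_file, pipeline, audio_prefix, **kwargs):
        self.audio_prefix = audio_prefix
        # Equivalent to super(Child, self).__init__(...), but it does not
        # repeat the class name, so renaming the class cannot break it.
        super().__init__(ann_file, pipeline, **kwargs)
```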
29 changes: 16 additions & 13 deletions mmaction/models/backbones/resnet3d_avslowfast.py
@@ -1,3 +1,4 @@
+import copy
import random

import torch
@@ -59,7 +60,7 @@ def __init__(self,
act_cfg=None)

self.lateral_connections = []
-        for i in range(len(self.stage_blocks)):
+        for i, _ in enumerate(self.stage_blocks):
planes = self.base_channels * 2**i
self.inplanes = planes * self.block.expansion

@@ -139,8 +140,7 @@ def build_pathway(cfg, *args, **kwargs):
pathway_type = cfg_.pop('type')
if pathway_type not in pathway_cfg:
raise KeyError(f'Unrecognized pathway type {pathway_type}')
-    else:
-        pathway_cls = pathway_cfg[pathway_type]
+    pathway_cls = pathway_cfg[pathway_type]
pathway = pathway_cls(*args, **kwargs, **cfg_)

return pathway
@@ -231,19 +231,22 @@ def __init__(self,
self.speed_ratio_audio = speed_ratio_audio
self.channel_ratio_audio = channel_ratio_audio
self.drop_out_ratio = drop_out_ratio

-        if slow_pathway['lateral']:
-            slow_pathway['speed_ratio'] = speed_ratio_fast
-            slow_pathway['channel_ratio'] = channel_ratio_fast
-        if audio_pathway['lateral']:
-            audio_pathway['speed_ratio'] = speed_ratio_audio
-            audio_pathway['channel_ratio'] = channel_ratio_audio
+        slow_pathway_ = copy.deepcopy(slow_pathway)
+        fast_pathway_ = copy.deepcopy(fast_pathway)
+        audio_pathway_ = copy.deepcopy(audio_pathway)
+
+        if slow_pathway_['lateral']:
+            slow_pathway_['speed_ratio'] = speed_ratio_fast
+            slow_pathway_['channel_ratio'] = channel_ratio_fast
+        if audio_pathway_['lateral']:
+            audio_pathway_['speed_ratio'] = speed_ratio_audio
+            audio_pathway_['channel_ratio'] = channel_ratio_audio
random.seed(100)
# set the random seed to avoid different
# graphs in distributed env
-        self.slow_path = build_pathway(slow_pathway)
-        self.fast_path = build_pathway(fast_pathway)
-        self.audio_path = build_pathway(audio_pathway)
+        self.slow_path = build_pathway(slow_pathway_)
+        self.fast_path = build_pathway(fast_pathway_)
+        self.audio_path = build_pathway(audio_pathway_)

def init_weights(self):
"""Initiate the parameters either from existing checkpoint or from
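Note: the `copy.deepcopy` calls matter because the constructor writes `speed_ratio`/`channel_ratio` keys into the pathway configs; without a copy it mutates the caller's dict, so reusing one config dict to build a second model silently changes behavior. A minimal sketch of the aliasing pitfall (hypothetical `build` function):

```python
import copy

cfg = dict(lateral=True)

def build(cfg, safe=True):
    if safe:
        cfg = copy.deepcopy(cfg)   # work on a private copy
    cfg['channel_ratio'] = 8       # constructor-style in-place edit
    return cfg

build(cfg, safe=True)
assert 'channel_ratio' not in cfg   # caller's dict untouched

build(cfg, safe=False)
assert cfg['channel_ratio'] == 8    # without the copy, the edit leaks out
```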
3 changes: 0 additions & 3 deletions mmaction/models/common/conv_audio.py
@@ -34,9 +34,6 @@ def __init__(self,
kernel_size = _pair(kernel_size)
stride = _pair(stride)
padding = _pair(padding)
-        padding = _pair(dilation)
-
-        assert len(kernel_size) == len(stride) == len(padding) == 2

self.in_channels = in_channels
self.out_channels = out_channels
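Note: `_pair` from `torch.nn.modules.utils` just duplicates scalars into 2-tuples, which is why the deleted `padding = _pair(dilation)` line silently clobbered the padding computed one line above it:

```python
from torch.nn.modules.utils import _pair

assert _pair(2) == (2, 2)        # scalar -> duplicated into a tuple
assert _pair((3, 1)) == (3, 1)   # tuples pass through unchanged
```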
8 changes: 4 additions & 4 deletions mmaction/models/heads/avslowfast_head.py
@@ -29,15 +29,15 @@ def forward(self, x):
Returns:
torch.Tensor: The classification scores for input samples.
"""
-        # ([N, channel_fast, T, H, W], [(N, channel_slow, T, H, W)])
+        # ([N, channel_fast, T, H, W], [(N, channel_slow, T, H, W)], [(N, channel_audio, T, F)])  # noqa:E501
x_slow, x_fast, x_audio = x
-        if len(x_audio.size()) == 4:
+        if x_audio.dim() == 4:
x_audio = x_audio.unsqueeze(4)
-        # ([N, channel_fast, 1, 1, 1], [N, channel_slow, 1, 1, 1])
+        # ([N, channel_fast, 1, 1, 1], [N, channel_slow, 1, 1, 1], [N, channel_audio, 1, 1, 1])  # noqa:E501
x_slow = self.avg_pool(x_slow)
x_fast = self.avg_pool(x_fast)
x_audio = self.avg_pool(x_audio)
-        # [N, channel_fast + channel_slow, 1, 1, 1]
+        # [N, channel_fast + channel_slow + channel_audio, 1, 1, 1]
x = torch.cat((x_slow, x_fast, x_audio), dim=1)

if self.dropout is not None:
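Note: the `x_audio.dim() == 4` check exists because audio features arrive as `[N, C, T, F]` with no width axis; adding a trailing singleton dimension lets the same 3-D average pool handle all three pathways. A small sketch, with shapes assumed from the backbone tests below:

```python
import torch
import torch.nn as nn

avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
x_audio = torch.rand(2, 1024, 8, 5)   # [N, C, T, F] -- only 4-D
if x_audio.dim() == 4:
    x_audio = x_audio.unsqueeze(4)    # -> [N, C, T, F, 1], now 5-D
assert avg_pool(x_audio).shape == torch.Size([2, 1024, 1, 1, 1])
```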
3 changes: 1 addition & 2 deletions mmaction/models/recognizers/audio_visual_recognizer.py
@@ -54,8 +54,7 @@ def forward_test(self, imgs, audios):
def forward(self, imgs, audios, label, return_loss=True):
if return_loss:
return self.forward_train(imgs, audios, label)
-        else:
-            return self.forward_test(imgs, audios)
+        return self.forward_test(imgs, audios)

def forward_gradcam(self, audios):
raise NotImplementedError
8 changes: 4 additions & 4 deletions tests/test_models/test_backbone.py
@@ -968,7 +968,7 @@ def test_resnet_audio_backbone():
audioonly.init_weights()
audioonly.train()
feat = audioonly(spec)
-    assert feat.shape == torch.size([1, 2048])
+    assert feat.shape == torch.Size([1, 1024, 8, 5])


def test_avslowfast_backbone():
@@ -983,9 +983,9 @@ def test_avslowfast_backbone():
avsf.train()
feat = avsf(imgs, spec)
assert isinstance(feat, tuple)
-    assert feat[0].shape == torch.size([1, 2048])
-    assert feat[1].shape == torch.size([1, 256])
-    assert feat[2].shape == torch.size([1, 1024])
+    assert feat[0].shape == torch.Size([1, 2048, 4, 1, 1])
+    assert feat[1].shape == torch.Size([1, 256, 32, 1, 1])
+    assert feat[2].shape == torch.Size([1, 1024, 8, 5])


@pytest.mark.skipif(
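Note: the old assertions could not even run — `torch.size` does not exist and raises `AttributeError`. `torch.Size` is the tuple subclass that `Tensor.shape` returns:

```python
import torch

t = torch.zeros(1, 1024, 8, 5)
assert t.shape == torch.Size([1, 1024, 8, 5])
assert t.shape == (1, 1024, 8, 5)   # torch.Size is a tuple subclass
```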
8 changes: 2 additions & 6 deletions tests/test_models/test_common_modules.py
@@ -23,13 +23,9 @@ def test_conv2plus1d():


def test_conv_audio():
-    with pytest.raises(AssertionError):
-        # Length of kernel size, stride and padding must be the same
-        ConvAudio(3, 8, (2, 2))
-
conv_audio = ConvAudio(3, 8, 2)
conv_audio.init_weights()

x = torch.rand(1, 3, 8, 8)
-    output = ConvAudio(x)
-    assert output.shape == torch.Size([1, 8, 8, 8])
+    output = conv_audio(x)
+    assert output.shape == torch.Size([1, 8, 9, 9])
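Note: the old line also called the class (`ConvAudio(x)`) rather than the instance (`conv_audio(x)`). The corrected `[1, 8, 9, 9]` shape follows from standard convolution arithmetic, assuming `ConvAudio` pads with `kernel_size // 2 = 1` for `kernel_size=2`:

```python
# out = floor((in + 2 * padding - kernel) / stride) + 1
in_size, padding, kernel, stride = 8, 1, 2, 1
out_size = (in_size + 2 * padding - kernel) // stride + 1
assert out_size == 9   # consistent with torch.Size([1, 8, 9, 9])
```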
12 changes: 7 additions & 5 deletions tests/test_models/test_head.py
@@ -221,7 +221,7 @@ def test_avslowfast_head():
avsf_head.init_weights()

assert avsf_head.num_classes == 4
-    assert avsf_head.dropout_ratio == 0.5
+    assert avsf_head.dropout_ratio == 0.8
assert avsf_head.in_channels == 5
assert avsf_head.init_std == 0.01
assert avsf_head.spatial_type == 'avg'
@@ -238,11 +238,13 @@ def test_avslowfast_head():

slow_shape = (8, 1, 4, 7, 7)
fast_shape = (8, 2, 4, 7, 7)
-    audio_shape = (8, 1, 7, 7)
+    audio_shape = (8, 2, 7, 7)

-    feat = tuple(
-        torch.rand(slow_shape), torch.rand(fast_shape),
-        torch.rand(audio_shape))
+    feat = tuple([
+        torch.rand(slow_shape),
+        torch.rand(fast_shape),
+        torch.rand(audio_shape)
+    ])

# tsn head inference
cls_scores = avsf_head(feat)
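Note: the original `feat = tuple(a, b, c)` raises `TypeError` because `tuple()` accepts at most one iterable argument; wrapping the tensors in a list (or using a tuple literal) fixes it:

```python
import torch

# tuple(torch.rand(1), torch.rand(2))  # TypeError: tuple expected at most 1 argument
feat = tuple([torch.rand(1), torch.rand(2)])   # list -> tuple: OK
feat = (torch.rand(1), torch.rand(2))          # tuple literal: simpler still
```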
20 changes: 13 additions & 7 deletions tests/test_models/test_recognizers.py
@@ -424,7 +424,7 @@ def test_tpn():

def test_audio_recognizer():
model, train_cfg, test_cfg = _get_audio_recognizer_cfg(
-        'resnet/tsn_resnet_r50_64x1x1_100e_kinetics400_audio_feature.py')
+        'resnet/tsn_r50_64x1x1_100e_kinetics400_audio_feature.py')
model['backbone']['pretrained'] = None

recognizer = build_recognizer(
@@ -454,11 +454,16 @@ def test_av_recognizer():
recognizer = build_recognizer(
model, train_cfg=train_cfg, test_cfg=test_cfg)

-    input_shape = (1, 3, 1, 128, 80)
-    demo_inputs_audio = generate_demo_inputs(input_shape, model_type='audio')
-    input_shape = (1, 3, 32, 16, 16)
-    demo_inputs_visual = generate_demo_inputs(input_shape)
-    demo_inputs = {**demo_inputs_audio, **demo_inputs_visual}
+    audio_shape = (1, 3, 1, 128, 80)
+    demo_inputs_audio = generate_demo_inputs(audio_shape, model_type='audio')
+    visual_shape = (1, 3, 3, 32, 16, 16)
+    demo_inputs_visual = generate_demo_inputs(visual_shape, model_type='3D')
+    assert torch.equal(demo_inputs_audio['gt_labels'],
+                       demo_inputs_visual['gt_labels'])
+    demo_inputs = dict(
+        imgs=demo_inputs_visual['imgs'],
+        audios=demo_inputs_audio['audios'],
+        gt_labels=demo_inputs_audio['gt_labels'])
imgs = demo_inputs['imgs']
audios = demo_inputs['audios']
gt_labels = demo_inputs['gt_labels']
@@ -530,6 +535,7 @@ def generate_demo_inputs(input_shape=(1, 3, 3, 224, 224), model_type='2D'):

inputs = {
'imgs': torch.FloatTensor(imgs),
-        'gt_labels': gt_labels
+        'gt_labels': gt_labels,
+        'audios': torch.FloatTensor(imgs),
}
return inputs
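Note: building `demo_inputs` explicitly via `dict(...)` (rather than `{**audio, **visual}`) avoids a silent-overwrite pitfall — both generated dicts carried `imgs` and `gt_labels` keys, and `**`-merging lets the right-hand dict win without warning:

```python
audio = dict(imgs='audio-shaped', gt_labels=1)
visual = dict(imgs='video-shaped', gt_labels=2)
merged = {**audio, **visual}
assert merged['gt_labels'] == 2   # audio's labels silently discarded
```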
