From c5f00a8d7091b46b0ecd175f722d98ea8bea485e Mon Sep 17 00:00:00 2001
From: HaodongDuan
Date: Fri, 16 Oct 2020 17:35:47 +0800
Subject: [PATCH 1/9] resolve comments

---
 tools/data/hvu/generate_sub_file_list.py | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 tools/data/hvu/generate_sub_file_list.py

diff --git a/tools/data/hvu/generate_sub_file_list.py b/tools/data/hvu/generate_sub_file_list.py
new file mode 100644
index 0000000000..77c7bed651
--- /dev/null
+++ b/tools/data/hvu/generate_sub_file_list.py
@@ -0,0 +1,49 @@
+import argparse
+import os.path as osp
+
+import mmcv
+
+
+def main(annotation_file, category):
+    assert category in [
+        'action', 'attribute', 'concept', 'event', 'object', 'scene'
+    ]
+
+    data = mmcv.load(annotation_file)
+    basename = osp.basename(annotation_file)
+    dirname = osp.dirname(annotation_file)
+    basename = basename.replace('hvu', f'hvu_{category}')
+
+    target_file = osp.join(dirname, basename)
+
+    def parse_item(item, category):
+        # keep the item only if it carries tags of the target category
+        label = item['label']
+        if category in label:
+            item['label'] = label[category]
+            return item
+        else:
+            return None
+
+    result = []
+    for item in data:
+        item = parse_item(item, category)
+        if item is not None:
+            result.append(item)
+
+    mmcv.dump(result, target_file)
+
+
+if __name__ == '__main__':
+    description = 'Helper script for generating HVU per-category file list.'
+    p = argparse.ArgumentParser(description=description)
+    p.add_argument(
+        'annotation_file',
+        type=str,
+        help='The annotation file which contains tags of all categories.')
+    p.add_argument(
+        'category',
+        type=str,
+        choices=['action', 'attribute', 'concept', 'event', 'object', 'scene'],
+        help='The tag category that you want to generate a file list for.')
+    main(**vars(p.parse_args()))

From 05575c18bf7dbba7e6e9f9a0f1c4f973668d2c44 Mon Sep 17 00:00:00 2001
From: HaodongDuan
Date: Fri, 16 Oct 2020 17:37:19 +0800
Subject: [PATCH 2/9] update changelog

---
 docs/changelog.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index d0fe20a249..3cff48664d 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -4,6 +4,7 @@
 
 **Improvements**
 - Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232))
+- Extend HVU data tools to generate an individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258))
 
 **Bug Fixes**
 - Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245))

From 295463f8534f2f6fa42ea2cadf40b926936c06e9 Mon Sep 17 00:00:00 2001
From: kenny
Date: Fri, 18 Dec 2020 22:08:13 +0800
Subject: [PATCH 3/9] + sparse demo

---
 demo/long_video_demo.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 62b91d0b91..7ff272c473 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -45,6 +45,13 @@ def parse_args():
         type=float,
         default=0.01,
         help='recognition score threshold')
+    parser.add_argument(
+        '--stride',
+        type=float,
+        default=0,
+        default=('the prediction stride equals to '
+                 'stride * sample_length, if set as 0, the '
+                 'prediction stride is 1'))
     args = parser.parse_args()
     return args
 
@@ -88,6 +95,11 @@ def show_results():
 
             ret, scores = inference()
 
+            if ret and stride > 0:
+                pred_stride = int(sample_length * stride)
+                for i in range(pred_stride):
+                    frame_queue.popleft()
+
             if ret:
                 num_selected_labels = min(len(label), 5)
                 scores_tuples = tuple(zip(label, scores))
@@ -142,13 +154,14 @@ def inference():
 
 def main():
     global frame_queue, threshold, sample_length, data, test_pipeline, model, \
-        out_file, video_path, device, input_step, label, result_queue
+        out_file, video_path, device, input_step, label, result_queue, stride
 
     args = parse_args()
     input_step = args.input_step
     threshold = args.threshold
     video_path = args.video
     out_file = args.out_file
+    stride = args.stride
 
     device = torch.device(args.device)
     model = init_recognizer(args.config, args.checkpoint, device=device)

From f8b85dfef60c2eb6b17559be537fb44b284c7f5c Mon Sep 17 00:00:00 2001
From: kenny
Date: Mon, 21 Dec 2020 11:09:04 +0800
Subject: [PATCH 4/9] fix bug

---
 demo/long_video_demo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 7ff272c473..1d8eb69c68 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -49,9 +49,9 @@ def parse_args():
         '--stride',
         type=float,
         default=0,
-        default=('the prediction stride equals to '
-                 'stride * sample_length, if set as 0, the '
-                 'prediction stride is 1'))
+        help=('the prediction stride equals to '
+              'stride * sample_length, if set as 0, the '
+              'prediction stride is 1'))
     args = parser.parse_args()
     return args

From 01e87a2705a44205d7aa9dbfec50e4f3c3a5f8db Mon Sep 17 00:00:00 2001
From: kenny
Date: Mon, 21 Dec 2020 11:19:46 +0800
Subject: [PATCH 5/9] add doc for the new arg

---
 demo/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/demo/README.md b/demo/README.md
index 854c8e24dc..389fc983af 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -225,6 +225,7 @@ Optional arguments:
 
 - `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1.
 - `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
 - `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with a score higher than the threshold will be shown. If not specified, it will be set to 0.01.
+- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames. For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. Default: 0.
 
 Examples:
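As an aside, the filtering that PATCH 1/9 adds is easy to see on toy data. The sketch below is a minimal, self-contained rendition of the same logic; the annotation items (including the `filename` key) are hypothetical, since the script itself only assumes each item carries a `label` dict keyed by tag category.

```python
# Minimal sketch of the per-category filtering in
# tools/data/hvu/generate_sub_file_list.py (hypothetical items; the
# script only relies on each item having a 'label' dict keyed by category).
annotations = [
    {'filename': 'a.mp4', 'label': {'action': [2, 5], 'scene': [11]}},
    {'filename': 'b.mp4', 'label': {'object': [7]}},
]

category = 'action'
result = []
for item in annotations:
    if category in item['label']:
        # keep the item, replacing the multi-category dict with the
        # tags of the requested category only
        result.append(dict(item, label=item['label'][category]))

print(result)
# [{'filename': 'a.mp4', 'label': [2, 5]}]
```

Running the real script as `python tools/data/hvu/generate_sub_file_list.py <annotation_file> action` writes the filtered list next to the input, with `hvu` replaced by `hvu_action` in the file name.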
From cd234be691edeaca3048d06a6f8980657260b5cd Mon Sep 17 00:00:00 2001
From: kenny
Date: Tue, 22 Dec 2020 19:24:41 +0800
Subject: [PATCH 6/9] update changelog

---
 docs/changelog.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index 3e56586252..9f66cba3b6 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -10,6 +10,7 @@
 
 **Improvements**
 
+- Add arg `stride` to long_video_demo.py to make inference faster ([#468](https://github.com/open-mmlab/mmaction2/pull/468))
 - Support training and testing for Spatio-Temporal Action Detection ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
 - Fix CI due to pip upgrade ([#454](https://github.com/open-mmlab/mmaction2/pull/454))
 - Add markdown lint in pre-commit hook ([#255](https://github.com/open-mmlab/mmaction2/pull/225))

From d01b674aab9b0513b8f1939cdf1a587fac8e3825 Mon Sep 17 00:00:00 2001
From: kenny
Date: Tue, 22 Dec 2020 21:50:38 +0800
Subject: [PATCH 7/9] reorg code in long demo

---
 demo/long_video_demo.py | 66 ++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 1d8eb69c68..4a21ed051b 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -30,7 +30,7 @@ def parse_args():
         description='MMAction2 predict different labels in a long video demo')
     parser.add_argument('config', help='test config file path')
     parser.add_argument('checkpoint', help='checkpoint file/url')
-    parser.add_argument('video', help='video file/url')
+    parser.add_argument('video_path', help='video file/url')
     parser.add_argument('label', help='label file')
     parser.add_argument('out_file', help='output filename')
     parser.add_argument(
@@ -56,8 +56,11 @@ def parse_args():
     return args
 
 
-def show_results():
-    cap = cv2.VideoCapture(video_path)
+def show_results(model, data, label, args):
+    frame_queue = deque(maxlen=args.sample_length)
+    result_queue = deque(maxlen=1)
+
+    cap = cv2.VideoCapture(args.video_path)
     num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
@@ -69,7 +72,7 @@ def show_results():
     frame_size = (frame_width, frame_height)
 
     ind = 0
-    video_writer = cv2.VideoWriter(out_file, fourcc, fps, frame_size)
+    video_writer = cv2.VideoWriter(args.out_file, fourcc, fps, frame_size)
     prog_bar = mmcv.ProgressBar(num_frames)
     backup_frames = []
 
@@ -81,24 +84,19 @@ def show_results():
             # drop it when encountering None
             continue
         backup_frames.append(np.array(frame)[:, :, ::-1])
-        if ind == sample_length:
+        if ind == args.sample_length:
             # provide a quick show at the beginning
             frame_queue.extend(backup_frames)
             backup_frames = []
-        elif ((len(backup_frames) == input_step and ind > sample_length)
-              or ind == num_frames):
+        elif ((len(backup_frames) == args.input_step
+               and ind > args.sample_length) or ind == num_frames):
            # pick a frame from the backup
            # when the backup is full or reach the last frame
            chosen_frame = random.choice(backup_frames)
            backup_frames = []
            frame_queue.append(chosen_frame)
 
-            ret, scores = inference()
-
-            if ret and stride > 0:
-                pred_stride = int(sample_length * stride)
-                for i in range(pred_stride):
-                    frame_queue.popleft()
+            ret, scores = inference(model, data, args, frame_queue)
 
             if ret:
                 num_selected_labels = min(len(label), 5)
                 scores_tuples = tuple(zip(label, scores))
@@ -113,7 +111,7 @@ def show_results():
                 results = result_queue.popleft()
                 for i, result in enumerate(results):
                     selected_label, score = result
-                    if score < threshold:
+                    if score < args.threshold:
                         break
                     location = (0, 40 + i * 20)
                     text = selected_label + ': ' + str(round(score, 2))
@@ -132,39 +130,37 @@ def show_results():
     cv2.destroyAllWindows()
 
 
-def inference():
-    if len(frame_queue) != sample_length:
+def inference(model, data, args, frame_queue):
+    if len(frame_queue) != args.sample_length:
         # do not run inference when there are not enough frames
         return False, None
 
     cur_windows = list(np.array(frame_queue))
-    img = frame_queue.popleft()
     if data['img_shape'] is None:
-        data['img_shape'] = img.shape[:2]
+        data['img_shape'] = frame_queue[0].shape[:2]
+
     cur_data = data.copy()
     cur_data['imgs'] = cur_windows
-    cur_data = test_pipeline(cur_data)
+    cur_data = args.test_pipeline(cur_data)
     cur_data = collate([cur_data], samples_per_gpu=1)
     if next(model.parameters()).is_cuda:
-        cur_data = scatter(cur_data, [device])[0]
+        cur_data = scatter(cur_data, [args.device])[0]
 
     with torch.no_grad():
         scores = model(return_loss=False, **cur_data)[0]
+
+    if args.stride > 0:
+        pred_stride = int(args.sample_length * args.stride)
+        for i in range(pred_stride):
+            frame_queue.popleft()
+
     return True, scores
 
 
 def main():
-    global frame_queue, threshold, sample_length, data, test_pipeline, model, \
-        out_file, video_path, device, input_step, label, result_queue, stride
-
-    args = parse_args()
-    input_step = args.input_step
-    threshold = args.threshold
-    video_path = args.video
-    out_file = args.out_file
-    stride = args.stride
-
-    device = torch.device(args.device)
-    model = init_recognizer(args.config, args.checkpoint, device=device)
+    args = parse_args()
+
+    args.device = torch.device(args.device)
+    model = init_recognizer(args.config, args.checkpoint, device=args.device)
     data = dict(img_shape=None, modality='RGB', label=-1)
     with open(args.label, 'r') as f:
         label = [line.strip() for line in f]
@@ -184,10 +180,12 @@ def main():
             # remove step to decode frames
             pipeline_.remove(step)
     test_pipeline = Compose(pipeline_)
+
     assert sample_length > 0
-    frame_queue = deque(maxlen=sample_length)
-    result_queue = deque(maxlen=1)
-    show_results()
+    args.sample_length = sample_length
+    args.test_pipeline = test_pipeline
+
+    show_results(model, data, label, args)
 
 
 if __name__ == '__main__':

From a8391650c035ce48b919cb418664714d50952d60 Mon Sep 17 00:00:00 2001
From: Jintao Lin <528557675@qq.com>
Date: Wed, 23 Dec 2020 11:30:20 +0800
Subject: [PATCH 8/9] add a comment

---
 demo/long_video_demo.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 4a21ed051b..78fd8e7598 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -153,6 +153,9 @@ def inference(model, data, args, frame_queue):
         for i in range(pred_stride):
             frame_queue.popleft()
 
+    # for the case ``args.stride == 0``: the bounded deque automatically
+    # pops one element (the oldest frame) from the left on each append
+
     return True, scores
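The interplay between `--stride` and the frame queue (the reorganized logic from PATCH 7/9 plus the comment from PATCH 8/9) can be checked with a toy run. The sizes below are made up, and integers stand in for frames:

```python
from collections import deque

# Toy run of the sliding-window logic: with stride=0.5 and
# sample_length=8, pred_stride=4, so a full window recurs (and a new
# prediction fires) every 4 appended frames. With stride=0 nothing is
# popped here and the bounded deque evicts exactly one frame per
# append, i.e. one prediction per frame.
sample_length, stride = 8, 0.5
pred_stride = int(sample_length * stride)

frame_queue = deque(maxlen=sample_length)
for frame_idx in range(24):
    frame_queue.append(frame_idx)
    if len(frame_queue) == sample_length:
        print(f'predict on frames {frame_queue[0]}..{frame_queue[-1]}')
        for _ in range(pred_stride):
            frame_queue.popleft()
# predict on frames 0..7
# predict on frames 4..11
# predict on frames 8..15
# predict on frames 12..19
# predict on frames 16..23
```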
From 2238681f457ea00d2dd26d92d3dc04ac90e7ae1e Mon Sep 17 00:00:00 2001
From: kenny
Date: Thu, 24 Dec 2020 21:53:21 +0800
Subject: [PATCH 9/9] resolve comments

---
 demo/README.md          | 2 +-
 demo/long_video_demo.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/demo/README.md b/demo/README.md
index 389fc983af..b31fb9d8fa 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -225,7 +225,7 @@ Optional arguments:
 
 - `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1.
 - `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
 - `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with a score higher than the threshold will be shown. If not specified, it will be set to 0.01.
-- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames. For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. Default: 0.
+- `STRIDE`: By default, the demo generates a prediction for every single frame, which can be slow. To speed up, you can set `STRIDE` so that the demo generates a prediction every `STRIDE x sample_length` frames (`sample_length` is the size of the temporal window from which frames are sampled, which equals `clip_len x frame_interval`). For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set to 0, predictions will be generated for every frame. The desired range of `STRIDE` is (0, 1]; values greater than 1 also work, but the generated predictions will be too sparse. Default: 0.
 
 Examples:

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 4a21ed051b..3902784c11 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -49,8 +49,10 @@ def parse_args():
         '--stride',
         type=float,
         default=0,
-        help=('the prediction stride equals to '
-              'stride * sample_length, if set as 0, the '
+        help=('the prediction stride equals stride * sample_length '
+              '(sample_length is the size of the temporal window from '
+              'which frames are sampled, which equals '
+              'clip_len x frame_interval); if set to 0, the '
               'prediction stride is 1'))
     args = parser.parse_args()
     return args
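Finally, a worked example of the `STRIDE` arithmetic documented in the README change above; the `clip_len` and `frame_interval` values are illustrative (in practice they come from the test pipeline in the model config):

```python
# STRIDE -> prediction frequency, using illustrative config values.
clip_len, frame_interval = 32, 2
sample_length = clip_len * frame_interval  # 64-frame temporal window

for stride in (0, 0.5, 1.0):
    pred_stride = int(sample_length * stride)
    # with stride 0 nothing is popped explicitly; the bounded deque
    # evicts one frame per append, so predictions fire on every frame
    frames_per_prediction = pred_stride if pred_stride > 0 else 1
    print(f'STRIDE={stride}: one prediction every '
          f'{frames_per_prediction} frame(s)')
# STRIDE=0: one prediction every 1 frame(s)
# STRIDE=0.5: one prediction every 32 frame(s)
# STRIDE=1.0: one prediction every 64 frame(s)
```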