From c5f00a8d7091b46b0ecd175f722d98ea8bea485e Mon Sep 17 00:00:00 2001
From: HaodongDuan
Date: Fri, 16 Oct 2020 17:35:47 +0800
Subject: [PATCH 1/9] resolve comments

---
 tools/data/hvu/generate_sub_file_list.py | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 tools/data/hvu/generate_sub_file_list.py

diff --git a/tools/data/hvu/generate_sub_file_list.py b/tools/data/hvu/generate_sub_file_list.py
new file mode 100644
index 0000000000..77c7bed651
--- /dev/null
+++ b/tools/data/hvu/generate_sub_file_list.py
@@ -0,0 +1,49 @@
+import argparse
+import os.path as osp
+
+import mmcv
+
+
+def main(annotation_file, category):
+    assert category in [
+        'action', 'attribute', 'concept', 'event', 'object', 'scene'
+    ]
+
+    data = mmcv.load(annotation_file)
+    basename = osp.basename(annotation_file)
+    dirname = osp.dirname(annotation_file)
+    basename = basename.replace('hvu', f'hvu_{category}')
+
+    target_file = osp.join(dirname, basename)
+
+    def parse_item(item, category):
+        # keep the item only if it carries tags of the target category
+        label = item['label']
+        if category in label:
+            item['label'] = label[category]
+            return item
+        else:
+            return None
+
+    result = []
+    for item in data:
+        item = parse_item(item, category)
+        if item is not None:
+            result.append(item)
+
+    mmcv.dump(result, target_file)
+
+
+if __name__ == '__main__':
+    description = 'Helper script for generating HVU per-category file list.'
+    p = argparse.ArgumentParser(description=description)
+    p.add_argument(
+        'annotation_file',
+        type=str,
+        help='The annotation file which contains tags of all categories.')
+    p.add_argument(
+        'category',
+        type=str,
+        choices=['action', 'attribute', 'concept', 'event', 'object', 'scene'],
+        help='The tag category that you want to generate a file list for.')
+    main(**vars(p.parse_args()))

From 05575c18bf7dbba7e6e9f9a0f1c4f973668d2c44 Mon Sep 17 00:00:00 2001
From: HaodongDuan
Date: Fri, 16 Oct 2020 17:37:19 +0800
Subject: [PATCH 2/9] update changelog

---
 docs/changelog.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index d0fe20a249..3cff48664d 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -4,6 +4,7 @@
 
 **Improvements**
 - Set default values of 'average_clips' in each config file so that there is no need to set it explicitly during testing in most cases ([#232](https://github.com/open-mmlab/mmaction2/pull/232))
+- Extend HVU data tools to generate an individual file list for each tag category ([#258](https://github.com/open-mmlab/mmaction2/pull/258))
 
 **Bug Fixes**
 - Fix the potential bug for default value in dataset_setting ([#245](https://github.com/open-mmlab/mmaction2/pull/245))

From 295463f8534f2f6fa42ea2cadf40b926936c06e9 Mon Sep 17 00:00:00 2001
From: kenny
Date: Fri, 18 Dec 2020 22:08:13 +0800
Subject: [PATCH 3/9] + sparse demo

---
 demo/long_video_demo.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 62b91d0b91..7ff272c473 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -45,6 +45,13 @@ def parse_args():
         type=float,
         default=0.01,
         help='recognition score threshold')
+    parser.add_argument(
+        '--stride',
+        type=float,
+        default=0,
+        default=('the prediction stride equals to '
+                 'stride * sample_length, if set as 0, the '
+                 'prediction stride is 1'))
     args = parser.parse_args()
     return args
 
@@ -88,6 +95,11 @@ def show_results():
 
             ret, scores = inference()
 
+            if ret and stride > 0:
+                pred_stride = int(sample_length * stride)
+                for i in range(pred_stride):
+                    frame_queue.popleft()
+
             if ret:
                 num_selected_labels = min(len(label), 5)
                 scores_tuples = tuple(zip(label, scores))
@@ -142,13 +154,14 @@ def inference():
 
 def main():
     global frame_queue, threshold, sample_length, data, test_pipeline, model, \
-        out_file, video_path, device, input_step, label, result_queue
+        out_file, video_path, device, input_step, label, result_queue, stride
 
     args = parse_args()
     input_step = args.input_step
     threshold = args.threshold
     video_path = args.video
     out_file = args.out_file
+    stride = args.stride
 
     device = torch.device(args.device)
     model = init_recognizer(args.config, args.checkpoint, device=device)

From f8b85dfef60c2eb6b17559be537fb44b284c7f5c Mon Sep 17 00:00:00 2001
From: kenny
Date: Mon, 21 Dec 2020 11:09:04 +0800
Subject: [PATCH 4/9] fix bug

---
 demo/long_video_demo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 7ff272c473..1d8eb69c68 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -49,9 +49,9 @@ def parse_args():
         '--stride',
         type=float,
         default=0,
-        default=('the prediction stride equals to '
-                 'stride * sample_length, if set as 0, the '
-                 'prediction stride is 1'))
+        help=('the prediction stride equals to '
+              'stride * sample_length, if set as 0, the '
+              'prediction stride is 1'))
     args = parser.parse_args()
     return args

From 01e87a2705a44205d7aa9dbfec50e4f3c3a5f8db Mon Sep 17 00:00:00 2001
From: kenny
Date: Mon, 21 Dec 2020 11:19:46 +0800
Subject: [PATCH 5/9] add doc for the new arg

---
 demo/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/demo/README.md b/demo/README.md
index 854c8e24dc..389fc983af 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -225,6 +225,7 @@ Optional arguments:
 
 - `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1.
 - `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
 - `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with a score higher than the threshold will be shown. If not specified, it will be set to 0.01.
+- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames. For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. Default: 0.
 
 Examples:
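As an aside, the filtering that PATCH 1/9 adds is easy to see on toy data. The sketch below is a minimal, self-contained rendition of the same logic; the annotation items (including the `filename` key) are hypothetical, since the script itself only assumes each item carries a `label` dict keyed by tag category.

```python
# Minimal sketch of the per-category filtering in
# tools/data/hvu/generate_sub_file_list.py (hypothetical items; the
# script only relies on each item having a 'label' dict keyed by category).
annotations = [
    {'filename': 'a.mp4', 'label': {'action': [2, 5], 'scene': [11]}},
    {'filename': 'b.mp4', 'label': {'object': [7]}},
]

category = 'action'
result = []
for item in annotations:
    if category in item['label']:
        # keep the item, replacing the multi-category dict with the
        # tags of the requested category only
        result.append(dict(item, label=item['label'][category]))

print(result)
# [{'filename': 'a.mp4', 'label': [2, 5]}]
```

Running the real script as `python tools/data/hvu/generate_sub_file_list.py <annotation_file> action` writes the filtered list next to the input, with `hvu` replaced by `hvu_action` in the file name.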
From cd234be691edeaca3048d06a6f8980657260b5cd Mon Sep 17 00:00:00 2001
From: kenny
Date: Tue, 22 Dec 2020 19:24:41 +0800
Subject: [PATCH 6/9] update changelog

---
 docs/changelog.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index 3e56586252..9f66cba3b6 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -10,6 +10,7 @@
 
 **Improvements**
 
+- Add arg `stride` to long_video_demo.py to make inference faster ([#468](https://github.com/open-mmlab/mmaction2/pull/468))
 - Support training and testing for Spatio-Temporal Action Detection ([#351](https://github.com/open-mmlab/mmaction2/pull/351))
 - Fix CI due to pip upgrade ([#454](https://github.com/open-mmlab/mmaction2/pull/454))
 - Add markdown lint in pre-commit hook ([#255](https://github.com/open-mmlab/mmaction2/pull/225))

From d01b674aab9b0513b8f1939cdf1a587fac8e3825 Mon Sep 17 00:00:00 2001
From: kenny
Date: Tue, 22 Dec 2020 21:50:38 +0800
Subject: [PATCH 7/9] reorg code in long demo

---
 demo/long_video_demo.py | 66 ++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 1d8eb69c68..4a21ed051b 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -30,7 +30,7 @@ def parse_args():
         description='MMAction2 predict different labels in a long video demo')
     parser.add_argument('config', help='test config file path')
     parser.add_argument('checkpoint', help='checkpoint file/url')
-    parser.add_argument('video', help='video file/url')
+    parser.add_argument('video_path', help='video file/url')
     parser.add_argument('label', help='label file')
     parser.add_argument('out_file', help='output filename')
     parser.add_argument(
@@ -56,8 +56,11 @@ def parse_args():
     return args
 
 
-def show_results():
-    cap = cv2.VideoCapture(video_path)
+def show_results(model, data, label, args):
+    frame_queue = deque(maxlen=args.sample_length)
+    result_queue = deque(maxlen=1)
+
+    cap = cv2.VideoCapture(args.video_path)
     num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
@@ -69,7 +72,7 @@ def show_results():
     frame_size = (frame_width, frame_height)
 
     ind = 0
-    video_writer = cv2.VideoWriter(out_file, fourcc, fps, frame_size)
+    video_writer = cv2.VideoWriter(args.out_file, fourcc, fps, frame_size)
     prog_bar = mmcv.ProgressBar(num_frames)
     backup_frames = []
 
@@ -81,24 +84,19 @@ def show_results():
             # drop it when encountering None
             continue
         backup_frames.append(np.array(frame)[:, :, ::-1])
-        if ind == sample_length:
+        if ind == args.sample_length:
             # provide a quick show at the beginning
             frame_queue.extend(backup_frames)
             backup_frames = []
-        elif ((len(backup_frames) == input_step and ind > sample_length)
-              or ind == num_frames):
+        elif ((len(backup_frames) == args.input_step
+               and ind > args.sample_length) or ind == num_frames):
            # pick a frame from the backup
            # when the backup is full or reach the last frame
            chosen_frame = random.choice(backup_frames)
            backup_frames = []
            frame_queue.append(chosen_frame)
 
-            ret, scores = inference()
-
-            if ret and stride > 0:
-                pred_stride = int(sample_length * stride)
-                for i in range(pred_stride):
-                    frame_queue.popleft()
+            ret, scores = inference(model, data, args, frame_queue)
 
             if ret:
                 num_selected_labels = min(len(label), 5)
                 scores_tuples = tuple(zip(label, scores))
@@ -113,7 +111,7 @@ def show_results():
                 results = result_queue.popleft()
                 for i, result in enumerate(results):
                     selected_label, score = result
-                    if score < threshold:
+                    if score < args.threshold:
                         break
                     location = (0, 40 + i * 20)
                     text = selected_label + ': ' + str(round(score, 2))
@@ -132,39 +130,37 @@ def show_results():
     cv2.destroyAllWindows()
 
 
-def inference():
-    if len(frame_queue) != sample_length:
+def inference(model, data, args, frame_queue):
+    if len(frame_queue) != args.sample_length:
         # do not run inference when there are not enough frames
         return False, None
 
     cur_windows = list(np.array(frame_queue))
-    img = frame_queue.popleft()
     if data['img_shape'] is None:
-        data['img_shape'] = img.shape[:2]
+        data['img_shape'] = frame_queue[0].shape[:2]
+
     cur_data = data.copy()
     cur_data['imgs'] = cur_windows
-    cur_data = test_pipeline(cur_data)
+    cur_data = args.test_pipeline(cur_data)
     cur_data = collate([cur_data], samples_per_gpu=1)
     if next(model.parameters()).is_cuda:
-        cur_data = scatter(cur_data, [device])[0]
+        cur_data = scatter(cur_data, [args.device])[0]
 
     with torch.no_grad():
         scores = model(return_loss=False, **cur_data)[0]
+
+    if args.stride > 0:
+        pred_stride = int(args.sample_length * args.stride)
+        for i in range(pred_stride):
+            frame_queue.popleft()
+
     return True, scores
 
 
 def main():
-    global frame_queue, threshold, sample_length, data, test_pipeline, model, \
-        out_file, video_path, device, input_step, label, result_queue, stride
-
-    args = parse_args()
-    input_step = args.input_step
-    threshold = args.threshold
-    video_path = args.video
-    out_file = args.out_file
-    stride = args.stride
-
-    device = torch.device(args.device)
-    model = init_recognizer(args.config, args.checkpoint, device=device)
+    args = parse_args()
+
+    args.device = torch.device(args.device)
+    model = init_recognizer(args.config, args.checkpoint, device=args.device)
     data = dict(img_shape=None, modality='RGB', label=-1)
     with open(args.label, 'r') as f:
         label = [line.strip() for line in f]
@@ -184,10 +180,12 @@ def main():
             # remove step to decode frames
             pipeline_.remove(step)
     test_pipeline = Compose(pipeline_)
+
     assert sample_length > 0
-    frame_queue = deque(maxlen=sample_length)
-    result_queue = deque(maxlen=1)
-    show_results()
+    args.sample_length = sample_length
+    args.test_pipeline = test_pipeline
+
+    show_results(model, data, label, args)
 
 
 if __name__ == '__main__':

From a8391650c035ce48b919cb418664714d50952d60 Mon Sep 17 00:00:00 2001
From: Jintao Lin <528557675@qq.com>
Date: Wed, 23 Dec 2020 11:30:20 +0800
Subject: [PATCH 8/9] add a comment

---
 demo/long_video_demo.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 4a21ed051b..78fd8e7598 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -153,6 +153,9 @@ def inference(model, data, args, frame_queue):
         for i in range(pred_stride):
             frame_queue.popleft()
 
+    # for the case ``args.stride == 0``: the bounded deque automatically
+    # pops one element (the oldest frame) from the left on each append
+
     return True, scores
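The interplay between `--stride` and the frame queue (the reorganized logic from PATCH 7/9 plus the comment from PATCH 8/9) can be checked with a toy run. The sizes below are made up, and integers stand in for frames:

```python
from collections import deque

# Toy run of the sliding-window logic: with stride=0.5 and
# sample_length=8, pred_stride=4, so a full window recurs (and a new
# prediction fires) every 4 appended frames. With stride=0 nothing is
# popped here and the bounded deque evicts exactly one frame per
# append, i.e. one prediction per frame.
sample_length, stride = 8, 0.5
pred_stride = int(sample_length * stride)

frame_queue = deque(maxlen=sample_length)
for frame_idx in range(24):
    frame_queue.append(frame_idx)
    if len(frame_queue) == sample_length:
        print(f'predict on frames {frame_queue[0]}..{frame_queue[-1]}')
        for _ in range(pred_stride):
            frame_queue.popleft()
# predict on frames 0..7
# predict on frames 4..11
# predict on frames 8..15
# predict on frames 12..19
# predict on frames 16..23
```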
From 2238681f457ea00d2dd26d92d3dc04ac90e7ae1e Mon Sep 17 00:00:00 2001
From: kenny
Date: Thu, 24 Dec 2020 21:53:21 +0800
Subject: [PATCH 9/9] resolve comments

---
 demo/README.md          | 2 +-
 demo/long_video_demo.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/demo/README.md b/demo/README.md
index 389fc983af..b31fb9d8fa 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -225,7 +225,7 @@ Optional arguments:
 
 - `INPUT_STEP`: Input step for sampling frames, which can help to get more sparse input. If not specified, it will be set to 1.
 - `DEVICE_TYPE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. If not specified, it will be set to `cuda:0`.
 - `THRESHOLD`: Threshold of prediction score for action recognition. Only labels with a score higher than the threshold will be shown. If not specified, it will be set to 0.01.
-- `STRIDE`: By default, the demo generates a prediction for each single frame, which might cost lots of time. To speed up, you can set the argument `STRIDE` and then the demo will generate a prediction every `STRIDE x sample_length` frames. For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set as 0, predictions will be generated for each frame. Default: 0.
+- `STRIDE`: By default, the demo generates a prediction for every single frame, which can be slow. To speed up, you can set `STRIDE` so that the demo generates a prediction every `STRIDE x sample_length` frames (`sample_length` is the size of the temporal window from which frames are sampled, which equals `clip_len x frame_interval`). For example, if the sample_length is 64 frames and you set `STRIDE` to 0.5, predictions will be generated every 32 frames. If set to 0, predictions will be generated for every frame. The desired range of `STRIDE` is (0, 1]; values greater than 1 also work, but the generated predictions will be too sparse. Default: 0.
 
 Examples:

diff --git a/demo/long_video_demo.py b/demo/long_video_demo.py
index 4a21ed051b..3902784c11 100644
--- a/demo/long_video_demo.py
+++ b/demo/long_video_demo.py
@@ -49,8 +49,10 @@ def parse_args():
         '--stride',
         type=float,
         default=0,
-        help=('the prediction stride equals to '
-              'stride * sample_length, if set as 0, the '
+        help=('the prediction stride equals stride * sample_length '
+              '(sample_length is the size of the temporal window from '
+              'which frames are sampled, which equals '
+              'clip_len x frame_interval); if set to 0, the '
               'prediction stride is 1'))
     args = parser.parse_args()
     return args
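Finally, a worked example of the `STRIDE` arithmetic documented in the README change above; the `clip_len` and `frame_interval` values are illustrative (in practice they come from the test pipeline in the model config):

```python
# STRIDE -> prediction frequency, using illustrative config values.
clip_len, frame_interval = 32, 2
sample_length = clip_len * frame_interval  # 64-frame temporal window

for stride in (0, 0.5, 1.0):
    pred_stride = int(sample_length * stride)
    # with stride 0 nothing is popped explicitly; the bounded deque
    # evicts one frame per append, so predictions fire on every frame
    frames_per_prediction = pred_stride if pred_stride > 0 else 1
    print(f'STRIDE={stride}: one prediction every '
          f'{frames_per_prediction} frame(s)')
# STRIDE=0: one prediction every 1 frame(s)
# STRIDE=0.5: one prediction every 32 frame(s)
# STRIDE=1.0: one prediction every 64 frame(s)
```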