diff --git a/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow.py b/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow.py index eb03c7dec2..e0461fa49d 100644 --- a/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow.py +++ b/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_clip_flow.py @@ -80,6 +80,7 @@ filename_tmpl='flow_{}_{:05d}.jpg', with_offset=True, modality='Flow', + start_index=0, pipeline=train_pipeline), val=dict( type=dataset_type, @@ -88,6 +89,7 @@ filename_tmpl='flow_{}_{:05d}.jpg', with_offset=True, modality='Flow', + start_index=0, pipeline=val_pipeline), test=dict( type=dataset_type, @@ -96,6 +98,7 @@ filename_tmpl='flow_{}_{:05d}.jpg', with_offset=True, modality='Flow', + start_index=0, pipeline=test_pipeline)) # optimizer optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001) diff --git a/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow.py b/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow.py index 65254ed380..8eb1044664 100644 --- a/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow.py +++ b/configs/recognition/tsn/tsn_r50_320p_1x1x8_150e_activitynet_video_flow.py @@ -80,6 +80,7 @@ filename_tmpl='flow_{}_{:05d}.jpg', with_offset=True, modality='Flow', + start_index=0, pipeline=train_pipeline), val=dict( type=dataset_type, @@ -88,6 +89,7 @@ filename_tmpl='flow_{}_{:05d}.jpg', with_offset=True, modality='Flow', + start_index=0, pipeline=val_pipeline), test=dict( type=dataset_type, @@ -96,6 +98,7 @@ filename_tmpl='flow_{}_{:05d}.jpg', with_offset=True, modality='Flow', + start_index=0, pipeline=test_pipeline)) # optimizer optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001) diff --git a/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb.py b/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb.py index 86f010f1c1..af0169ab11 100644 --- a/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb.py +++ b/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_clip_rgb.py @@ -79,6 +79,7 @@ data_prefix=data_root, pipeline=train_pipeline, with_offset=True, + start_index=0, filename_tmpl='image_{:05d}.jpg'), val=dict( type=dataset_type, @@ -86,6 +87,7 @@ data_prefix=data_root_val, pipeline=val_pipeline, with_offset=True, + start_index=0, filename_tmpl='image_{:05d}.jpg'), test=dict( type=dataset_type, @@ -93,6 +95,7 @@ data_prefix=data_root_val, pipeline=test_pipeline, with_offset=True, + start_index=0, filename_tmpl='image_{:05d}.jpg')) # optimizer optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001) diff --git a/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb.py b/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb.py index bc56560886..16c43b797d 100644 --- a/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb.py +++ b/configs/recognition/tsn/tsn_r50_320p_1x1x8_50e_activitynet_video_rgb.py @@ -79,6 +79,7 @@ data_prefix=data_root, pipeline=train_pipeline, with_offset=True, + start_index=0, filename_tmpl='image_{:05d}.jpg'), val=dict( type=dataset_type, @@ -86,6 +87,7 @@ data_prefix=data_root_val, pipeline=val_pipeline, with_offset=True, + start_index=0, filename_tmpl='image_{:05d}.jpg'), test=dict( type=dataset_type, @@ -93,6 +95,7 @@ data_prefix=data_root_val, pipeline=test_pipeline, with_offset=True, + start_index=0, filename_tmpl='image_{:05d}.jpg')) # optimizer optimizer = 
dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001) diff --git a/mmaction/datasets/ssn_dataset.py b/mmaction/datasets/ssn_dataset.py index 0ba32ef673..32daa88742 100644 --- a/mmaction/datasets/ssn_dataset.py +++ b/mmaction/datasets/ssn_dataset.py @@ -455,8 +455,8 @@ def evaluate(self, plain_detections = {} for class_idx in range(len(detections)): detection_list = [] - for vid, dets in detections[class_idx].items(): - detection_list.extend([[vid, class_idx] + x[:3] + for video, dets in detections[class_idx].items(): + detection_list.extend([[video, class_idx] + x[:3] for x in dets.tolist()]) plain_detections[class_idx] = detection_list @@ -500,7 +500,7 @@ def get_all_gts(self): """Fetch groundtruth instances of the entire dataset.""" gts = {} for video_info in self.video_infos: - vid = video_info['video_id'] + video = video_info['video_id'] for gt in video_info['gts']: class_idx = gt.label - 1 # gt_info: [relative_start, relative_end] @@ -508,7 +508,7 @@ def get_all_gts(self): gt.start_frame / video_info['total_frames'], gt.end_frame / video_info['total_frames'] ] - gts.setdefault(class_idx, {}).setdefault(vid, + gts.setdefault(class_idx, {}).setdefault(video, []).append(gt_info) return gts diff --git a/tools/data/activitynet/action_name.csv b/tools/data/activitynet/action_name.csv new file mode 100644 index 0000000000..5f5fe1d9c9 --- /dev/null +++ b/tools/data/activitynet/action_name.csv @@ -0,0 +1,201 @@ +action +Applying sunscreen +Arm wrestling +Assembling bicycle +BMX +Baking cookies +Baton twirling +Beach soccer +Beer pong +Blow-drying hair +Blowing leaves +Playing ten pins +Braiding hair +Building sandcastles +Bullfighting +Calf roping +Camel ride +Canoeing +Capoeira +Carving jack-o-lanterns +Changing car wheel +Cleaning sink +Clipping cat claws +Croquet +Curling +Cutting the grass +Decorating the Christmas tree +Disc dog +Doing a powerbomb +Doing crunches +Drum corps +Elliptical trainer +Doing fencing +Fixing the roof +Fun sliding down +Futsal +Gargling mouthwash +Grooming dog +Hand car wash +Hanging wallpaper +Having an ice cream +Hitting a pinata +Hula hoop +Hurling +Ice fishing +Installing carpet +Kite flying +Kneeling +Knitting +Laying tile +Longboarding +Making a cake +Making a lemonade +Making an omelette +Mooping floor +Painting fence +Painting furniture +Peeling potatoes +Plastering +Playing beach volleyball +Playing blackjack +Playing congas +Playing drums +Playing ice hockey +Playing pool +Playing rubik cube +Powerbocking +Putting in contact lenses +Putting on shoes +Rafting +Raking leaves +Removing ice from car +Riding bumper cars +River tubing +Rock-paper-scissors +Rollerblading +Roof shingle removal +Rope skipping +Running a marathon +Scuba diving +Sharpening knives +Shuffleboard +Skiing +Slacklining +Snow tubing +Snowboarding +Spread mulch +Sumo +Surfing +Swimming +Swinging at the playground +Table soccer +Throwing darts +Trimming branches or hedges +Tug of war +Using the monkey bar +Using the rowing machine +Wakeboarding +Waterskiing +Waxing skis +Welding +Drinking coffee +Zumba +Doing kickboxing +Doing karate +Tango +Putting on makeup +High jump +Playing bagpipes +Cheerleading +Wrapping presents +Cricket +Clean and jerk +Preparing pasta +Bathing dog +Discus throw +Playing field hockey +Grooming horse +Preparing salad +Playing harmonica +Playing saxophone +Chopping wood +Washing face +Using the pommel horse +Javelin throw +Spinning +Ping-pong +Making a sandwich +Brushing hair +Playing guitarra +Doing step aerobics +Drinking beer +Playing polo +Snatch 
+Paintball +Long jump +Cleaning windows +Brushing teeth +Playing flauta +Tennis serve with ball bouncing +Bungee jumping +Triple jump +Horseback riding +Layup drill in basketball +Vacuuming floor +Cleaning shoes +Doing nails +Shot put +Fixing bicycle +Washing hands +Ironing clothes +Using the balance beam +Shoveling snow +Tumbling +Using parallel bars +Getting a tattoo +Rock climbing +Smoking hookah +Shaving +Getting a piercing +Springboard diving +Playing squash +Playing piano +Dodgeball +Smoking a cigarette +Sailing +Getting a haircut +Playing lacrosse +Cumbia +Tai chi +Painting +Mowing the lawn +Shaving legs +Walking the dog +Hammer throw +Skateboarding +Polishing shoes +Ballet +Hand washing clothes +Plataform diving +Playing violin +Breakdancing +Windsurfing +Hopscotch +Doing motocross +Mixing drinks +Starting a campfire +Belly dance +Removing curlers +Archery +Volleyball +Playing water polo +Playing racquetball +Kayaking +Polishing forniture +Playing kickball +Using uneven bars +Washing dishes +Pole vault +Playing accordion +Playing badminton diff --git a/tools/data/activitynet/activitynet_feature_extraction.py b/tools/data/activitynet/activitynet_feature_postprocessing.py similarity index 95% rename from tools/data/activitynet/activitynet_feature_extraction.py rename to tools/data/activitynet/activitynet_feature_postprocessing.py index ca49030b1f..6b11e2b6d3 100644 --- a/tools/data/activitynet/activitynet_feature_extraction.py +++ b/tools/data/activitynet/activitynet_feature_postprocessing.py @@ -15,7 +15,7 @@ def parse_args(): parser.add_argument('--rgb', default='', help='rgb feature root') parser.add_argument('--flow', default='', help='flow feature root') parser.add_argument('--dest', default='', help='dest root') - parser.add_argument('--output-format', default='pkl', help='clip length') + parser.add_argument('--output-format', default='csv', help='output format') args = parser.parse_args() return args @@ -64,12 +64,13 @@ def pool_feature(data, num_proposals=100, num_sample_bins=3, pool_type='mean'): def merge_feat(name): # concatenate rgb feat and flow feat for a single sample - global args rgb_feat = load(osp.join(args.rgb, name)) flow_feat = load(osp.join(args.flow, name)) rgb_feat = pool_feature(rgb_feat) flow_feat = pool_feature(flow_feat) feat = np.concatenate([rgb_feat, flow_feat], axis=-1) + if not osp.exists(args.dest): + os.system(f'mkdir -p {args.dest}') if args.output_format == 'pkl': dump(feat, osp.join(args.dest, name)) elif args.output_format == 'csv': diff --git a/tools/data/activitynet/download.py b/tools/data/activitynet/download.py new file mode 100644 index 0000000000..e4d9ba7902 --- /dev/null +++ b/tools/data/activitynet/download.py @@ -0,0 +1,125 @@ +# This script is copied from +# https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501 +# The code is licensed under the MIT licence. +import os +import subprocess + +import mmcv + +import ssl # isort:skip + +from joblib import Parallel, delayed # isort:skip + +ssl._create_default_https_context = ssl._create_unverified_context +data_file = '../../../data/ActivityNet' +video_list = f'{data_file}/video_info_new.csv' +anno_file = f'{data_file}/anet_anno_action.json' +output_dir = f'{data_file}/videos' + + +def download_clip(video_identifier, + output_filename, + num_attempts=5, + url_base='https://www.youtube.com/watch?v='): + """Download a video from YouTube if it exists and is not blocked.
+ arguments: + --------- + video_identifier: str + Unique YouTube video identifier (11 characters) + output_filename: str + File path where the video will be stored. + """ + # Defensive argument checking. + assert isinstance(video_identifier, str), 'video_identifier must be string' + assert isinstance(output_filename, str), 'output_filename must be string' + assert len(video_identifier) == 11, 'video_identifier must have length 11' + + status = False + + if not os.path.exists(output_filename): + command = [ + 'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate', + '-f', 'mp4', '-o', + '"%s"' % output_filename, + '"%s"' % (url_base + video_identifier) + ] + command = ' '.join(command) + print(command) + attempts = 0 + while True: + try: + subprocess.check_output( + command, shell=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + attempts += 1 + if attempts == num_attempts: + return status, 'Fail' + else: + break + # Check if the video was successfully saved. + status = os.path.exists(output_filename) + return status, 'Downloaded' + + +def download_clip_wrapper(youtube_id, output_dir): + """Wrapper for parallel processing purposes.""" + # we do this to align with names in annotations + output_filename = os.path.join(output_dir, 'v_' + youtube_id + '.mp4') + if os.path.exists(output_filename): + status = tuple(['v_' + youtube_id, True, 'Exists']) + return status + + downloaded, log = download_clip(youtube_id, output_filename) + status = tuple(['v_' + youtube_id, downloaded, log]) + return status + + +def parse_activitynet_annotations(input_csv): + """Returns a list of YoutubeID. + arguments: + --------- + input_csv: str + Path to CSV file containing the following columns: + 'video,numFrame,seconds,fps,rfps,subset,featureFrame' + returns: + ------- + youtube_ids: list + List of all YoutubeIDs in ActivityNet. + + """ + lines = open(input_csv).readlines() + lines = lines[1:] + # YoutubeIDs do not have prefix `v_` + youtube_ids = [x.split(',')[0][2:] for x in lines] + return youtube_ids + + +def main(input_csv, output_dir, anno_file, num_jobs=24): + # Reading and parsing ActivityNet. + youtube_ids = parse_activitynet_annotations(input_csv) + + # Creates folders where videos will be saved later. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Download all clips. + if num_jobs == 1: + status_list = [] + for index in youtube_ids: + status_list.append(download_clip_wrapper(index, output_dir)) + else: + status_list = Parallel(n_jobs=num_jobs)( + delayed(download_clip_wrapper)(index, output_dir) + for index in youtube_ids) + + # Save download report. 
+ mmcv.dump(status_list, 'download_report.json') + annotation = mmcv.load(anno_file) + downloaded = {status[0]: status[1] for status in status_list} + annotation = {k: v for k, v in annotation.items() if downloaded[k]} + anno_file_bak = anno_file.replace('.json', '_bak.json') + os.system(f'mv {anno_file} {anno_file_bak}') + mmcv.dump(annotation, anno_file) + + +if __name__ == '__main__': + main(video_list, output_dir, anno_file, 24) diff --git a/tools/data/activitynet/download_videos.sh b/tools/data/activitynet/download_videos.sh new file mode 100644 index 0000000000..602cf48fcd --- /dev/null +++ b/tools/data/activitynet/download_videos.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +# set up environment +conda env create -f environment.yml +source activate activitynet +pip install --upgrade youtube-dl + +DATA_DIR="../../../data/ActivityNet" +python download.py + +source deactivate +conda remove -n activitynet --all diff --git a/tools/data/activitynet/environment.yml b/tools/data/activitynet/environment.yml new file mode 100644 index 0000000000..fe4c3ffb57 --- /dev/null +++ b/tools/data/activitynet/environment.yml @@ -0,0 +1,36 @@ +name: activitynet +channels: + - anaconda + - menpo + - conda-forge + - defaults +dependencies: + - ca-certificates=2020.1.1 + - certifi=2020.4.5.1 + - ffmpeg=2.8.6 + - libcxx=10.0.0 + - libedit=3.1.20181209 + - libffi=3.3 + - ncurses=6.2 + - openssl=1.1.1g + - pip=20.0.2 + - python=3.7.7 + - readline=8.0 + - setuptools=46.4.0 + - sqlite=3.31.1 + - tk=8.6.8 + - wheel=0.34.2 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - decorator==4.4.2 + - intel-openmp==2019.0 + - joblib==0.15.1 + - mkl==2019.0 + - numpy==1.18.4 + - olefile==0.46 + - pandas==1.0.3 + - python-dateutil==2.8.1 + - pytz==2020.1 + - six==1.14.0 + - youtube-dl==2020.5.8 diff --git a/tools/data/activitynet/extract_frames.sh b/tools/data/activitynet/extract_frames.sh new file mode 100644 index 0000000000..32565aebc1 --- /dev/null +++ b/tools/data/activitynet/extract_frames.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +cd ../ +python build_rawframes.py ../../data/ActivityNet/videos/ ../../data/ActivityNet/rawframes/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-width 340 --new-height 256 +echo "Raw frames (RGB and TV-L1 flow) generated" + +cd activitynet/ diff --git a/tools/data/activitynet/generate_rawframes_filelist.py b/tools/data/activitynet/generate_rawframes_filelist.py new file mode 100644 index 0000000000..0c126588ba --- /dev/null +++ b/tools/data/activitynet/generate_rawframes_filelist.py @@ -0,0 +1,105 @@ +import os +import os.path as osp + +import mmcv + +data_file = '../../../data/ActivityNet' +video_list = f'{data_file}/video_info_new.csv' +anno_file = f'{data_file}/anet_anno_action.json' +rawframe_dir = f'{data_file}/rawframes' +action_name_list = 'action_name.csv' + + +def generate_rawframes_filelist(): + anet_annotations = mmcv.load(anno_file) + + videos = open(video_list).readlines() + videos = [x.strip().split(',') for x in videos] + attr_names = videos[0][1:] + # the first line is 'video,numFrame,seconds,fps,rfps,subset,featureFrame' + attr_names = [x.lower() for x in attr_names] + attr_types = [int, float, float, float, str, int] + + video_annos = {} + for line in videos[1:]: + name = line[0] + data = {} + for attr_name, attr_type, attr_val in zip(attr_names, attr_types, + line[1:]): + data[attr_name] = attr_type(attr_val) + video_annos[name] = data + + # only keep downloaded videos + video_annos = { + k: v + for k, v in video_annos.items() if k in anet_annotations +
} + # update numframe + for video in video_annos: + pth = osp.join(rawframe_dir, video) + num_imgs = len(os.listdir(pth)) + # there is one more RGB image than x- or y-flow images, + # so (num_imgs - 1) is divisible by 3 + assert (num_imgs - 1) % 3 == 0 + num_frames = (num_imgs - 1) // 3 + video_annos[video]['numframe'] = num_frames + + anet_labels = open(action_name_list).readlines() + anet_labels = [x.strip() for x in anet_labels[1:]] + + train_videos, val_videos = {}, {} + for k, video in video_annos.items(): + if video['subset'] == 'training': + train_videos[k] = video + elif video['subset'] == 'validation': + val_videos[k] = video + + def simple_label(video_idx): + anno = anet_annotations[video_idx] + label = anno['annotations'][0]['label'] + return anet_labels.index(label) + + train_lines = [ + k + ' ' + str(train_videos[k]['numframe']) + ' ' + + str(simple_label(k)) for k in train_videos + ] + val_lines = [ + k + ' ' + str(val_videos[k]['numframe']) + ' ' + str(simple_label(k)) + for k in val_videos + ] + + with open(osp.join(data_file, 'anet_train_video.txt'), 'w') as fout: + fout.write('\n'.join(train_lines)) + with open(osp.join(data_file, 'anet_val_video.txt'), 'w') as fout: + fout.write('\n'.join(val_lines)) + + def clip_list(k, anno, vidanno): + num_seconds = anno['duration_second'] + num_frames = vidanno['numframe'] + fps = num_frames / num_seconds + segs = anno['annotations'] + lines = [] + for seg in segs: + segment = seg['segment'] + label = seg['label'] + label = anet_labels.index(label) + start, end = int(segment[0] * fps), int(segment[1] * fps) + if end > num_frames - 1: + end = num_frames - 1 + newline = f'{k} {start} {end - start + 1} {label}' + lines.append(newline) + return lines + + train_clips, val_clips = [], [] + for k in train_videos: + train_clips.extend(clip_list(k, anet_annotations[k], train_videos[k])) + for k in val_videos: + val_clips.extend(clip_list(k, anet_annotations[k], val_videos[k])) + + with open(osp.join(data_file, 'anet_train_clip.txt'), 'w') as fout: + fout.write('\n'.join(train_clips)) + with open(osp.join(data_file, 'anet_val_clip.txt'), 'w') as fout: + fout.write('\n'.join(val_clips)) + + +if __name__ == '__main__': + generate_rawframes_filelist() diff --git a/tools/data/activitynet/preparing_activitynet.md b/tools/data/activitynet/preparing_activitynet.md index 0b9ab21d89..83a7d6ef16 100644 --- a/tools/data/activitynet/preparing_activitynet.md +++ b/tools/data/activitynet/preparing_activitynet.md @@ -1,7 +1,8 @@ # Preparing ActivityNet For basic dataset information, please refer to the official [website](http://activity-net.org/). -Here, we use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation). +For action detection, you can either use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) or extract features with MMAction2 (which gives better performance). +We release both pipelines. Before we start, please make sure that the current working directory is `$MMACTION2/tools/data/activitynet/`. ## Step 1. Download Annotations First of all, you can run the following script to download annotation files. @@ -10,13 +11,15 @@ First of all, you can run the following script to download annotation files. bash download_annotations.sh ``` -## Step 2. Prepare Videos Features +## Option 1: Use the ActivityNet rescaled feature provided in this [repo](https://github.com/wzmsltw/BSN-boundary-sensitive-network#code-and-data-preparation) + +### Step 2.
Prepare Videos Features Then, you can run the following script to download the ActivityNet features. ```shell bash download_features.sh ``` -## Step 3. Process Annotation Files +### Step 3. Process Annotation Files Next, you can run the following script to process the downloaded annotation files for training and testing. It first merges the two annotation files together and then separates the annotations by `train`, `val` and `test`. @@ -24,7 +27,62 @@ It first merges the two annotation files together and then seperates the annoati python process_annotations.py ``` -## Step 4. Check Directory Structure +## Option 2: Extract ActivityNet features using MMAction2 + +### Step 2. Prepare Videos +Then, you can run the following script to prepare videos. +The code is adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. +Some videos in the ActivityNet dataset may no longer be available on YouTube, so after downloading, the script updates the annotation file to make sure every video in it exists. + +```shell +bash download_videos.sh +``` + +### Step 3. Extract RGB and Flow +Before extracting, please refer to [install.md](/docs/install.md) for installing [denseflow](https://github.com/open-mmlab/denseflow). + +Use the following script to extract both RGB and Flow frames. + +```shell +bash extract_frames.sh +``` + +The command above generates images of size 340x256. If you want to generate images whose short edge is 320 (320p), +you can change the args `--new-width 340 --new-height 256` to `--new-short 320`. +More details can be found in [data_preparation](/docs/data_preparation.md). + +### Step 4. Generate File List for ActivityNet Finetuning +With extracted frames, you can generate video-level or clip-level lists of rawframes, which can be used for ActivityNet finetuning. + +```shell +python generate_rawframes_filelist.py +``` + +### Step 5. Finetune TSN models on ActivityNet +You can use the ActivityNet configs in `configs/recognition/tsn` to finetune TSN models on ActivityNet. +You need to use Kinetics-pretrained models for initialization. +Both RGB models and Flow models are supported. + +### Step 6. Extract ActivityNet Features with Finetuned Checkpoints +After finetuning TSN on ActivityNet, you can use the finetuned checkpoints to extract both RGB and Flow features. + +```shell +python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_train_video.txt --output-prefix ../../../data/ActivityNet/rgb_feat --modality RGB --ckpt /path/to/rgb_checkpoint.pth + +python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_val_video.txt --output-prefix ../../../data/ActivityNet/rgb_feat --modality RGB --ckpt /path/to/rgb_checkpoint.pth + +python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_train_video.txt --output-prefix ../../../data/ActivityNet/flow_feat --modality Flow --ckpt /path/to/flow_checkpoint.pth + +python tsn_feature_extraction.py --data-prefix ../../../data/ActivityNet/rawframes --data-list ../../../data/ActivityNet/anet_val_video.txt --output-prefix ../../../data/ActivityNet/flow_feat --modality Flow --ckpt /path/to/flow_checkpoint.pth +``` + +After feature extraction, you can use the post-processing script to concatenate the RGB and Flow features and generate the 100-t x 400-d features used for action detection.
+ +```shell +python activitynet_feature_postprocessing.py --rgb ../../../data/ActivityNet/rgb_feat --flow ../../../data/ActivityNet/flow_feat --dest ../../../data/ActivityNet/mmaction_feat +``` + +## Final Step. Check Directory Structure After the whole data pipeline for ActivityNet preparation, you will get the features and annotation files. diff --git a/tools/data/activitynet/tsn_feature_extraction.py b/tools/data/activitynet/tsn_feature_extraction.py index 377c6cd088..353befc1e6 100644 --- a/tools/data/activitynet/tsn_feature_extraction.py +++ b/tools/data/activitynet/tsn_feature_extraction.py @@ -1,4 +1,5 @@ import argparse +import os import os.path as osp import pickle @@ -23,8 +24,7 @@ def parse_args(): type=int, default=16, help='the sampling frequency of frame in the untrimed video') - parser.add_argument('--clip-len', type=int, default=1, help='clip length') - parser.add_argument('--modality', default='RGB') + parser.add_argument('--modality', default='RGB', choices=['RGB', 'Flow']) parser.add_argument('--ckpt', help='checkpoint for feature extraction') parser.add_argument( '--part', @@ -40,6 +40,7 @@ def parse_args(): def main(): args = parse_args() args.is_rgb = args.modality == 'RGB' + args.clip_len = 1 if args.is_rgb else 5 args.input_format = 'NCHW' if args.is_rgb else 'NCHW_Flow' rgb_norm_cfg = dict( mean=[123.675, 116.28, 103.53], @@ -47,7 +48,7 @@ def main(): to_bgr=False) flow_norm_cfg = dict(mean=[128, 128], std=[128, 128]) args.img_norm_cfg = rgb_norm_cfg if args.is_rgb else flow_norm_cfg - args.f_tmpl = 'image_{:05d}.jpg' if args.is_rgb else 'flow_{}_{:05d}.jpg' + args.f_tmpl = 'img_{:05d}.jpg' if args.is_rgb else 'flow_{}_{:05d}.jpg' args.in_channels = args.clip_len * (3 if args.is_rgb else 2) # max batch_size for one forward args.batch_size = 200 @@ -95,8 +96,12 @@ def main(): # enumerate Untrimmed videos, extract feature from each of them prog_bar = mmcv.ProgressBar(len(data)) + if not osp.exists(args.output_prefix): + os.system(f'mkdir -p {args.output_prefix}') + for item in data: - frame_dir, length, output_file = item.split() + frame_dir, length, label = item.split() + output_file = osp.basename(frame_dir) + '.pkl' frame_dir = osp.join(args.data_prefix, frame_dir) output_file = osp.join(args.output_prefix, output_file) assert output_file.endswith('.pkl') @@ -107,6 +112,7 @@ def main(): frame_dir=frame_dir, total_frames=length, filename_tmpl=args.f_tmpl, + start_index=0, modality=args.modality) sample = data_pipeline(tmpl) imgs = sample['imgs'] diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py index 194a285bf4..1deb94baf3 100644 --- a/tools/data/build_file_list.py +++ b/tools/data/build_file_list.py @@ -127,7 +127,7 @@ def build_list(split): else: raise ValueError( 'frame_info should be ' + - '[`vid`(str), `label`(int)|`labels(list[int])`') + '[`video`(str), `label`(int)|`labels`(list[int])]') else: # videos if isinstance(item[1], int): @@ -144,7 +144,7 @@ def build_list(split): else: raise ValueError( 'frame_info should be ' + - '[`vid`(str), `label`(int)|`labels(list[int])`') + '[`video`(str), `label`(int)|`labels`(list[int])]') if shuffle: random.shuffle(rgb_list) random.shuffle(flow_list) diff --git a/tools/data/kinetics400/download.py b/tools/data/kinetics400/download.py index 2a87f92868..417fc96703 100755 --- a/tools/data/kinetics400/download.py +++ b/tools/data/kinetics400/download.py @@ -1,5 +1,6 @@ # This scripts is copied from # https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501 +# The
code is licensed under the MIT licence. import argparse import glob import json @@ -183,12 +184,12 @@ def main(input_csv, # Download all clips. if num_jobs == 1: - status_lst = [] + status_list = [] for i, row in dataset.iterrows(): - status_lst.append( + status_list.append( download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)) else: - status_lst = Parallel( + status_list = Parallel( n_jobs=num_jobs)(delayed(download_clip_wrapper)( row, label_to_dir, trim_format, tmp_dir) for i, row in dataset.iterrows()) @@ -198,7 +199,7 @@ def main(input_csv, # Save download report. with open('download_report.json', 'w') as fobj: - fobj.write(json.dumps(status_lst)) + fobj.write(json.dumps(status_list)) if __name__ == '__main__': diff --git a/tools/data/parse_file_list.py b/tools/data/parse_file_list.py index de4abc369b..994d382df1 100644 --- a/tools/data/parse_file_list.py +++ b/tools/data/parse_file_list.py @@ -100,24 +100,25 @@ def parse_ucf101_splits(level): class_mapping = {x[1]: int(x[0]) - 1 for x in class_index} def line_to_map(line): - """A function to map line string to vid and label. + """A function to map line string to video and label. Args: line (str): A long directory path, which is a text path. Returns: - tuple[str, str]: (vid, label), vid is the video id, + tuple[str, str]: (video, label), video is the video id, label is the video label. """ items = line.strip().split() - vid = osp.splitext(items[0])[0] + video = osp.splitext(items[0])[0] if level == 1: - vid = osp.basename(vid) + video = osp.basename(video) label = items[0] elif level == 2: - vid = osp.join(osp.basename(osp.dirname(vid)), osp.basename(vid)) + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) label = class_mapping[osp.dirname(items[0])] - return vid, label + return video, label splits = [] for i in range(1, 4): @@ -155,16 +156,17 @@ def parse_sthv1_splits(level): def line_to_map(line, test_mode=False): items = line.strip().split(';') - vid = items[0] + video = items[0] if level == 1: - vid = osp.basename(vid) + video = osp.basename(video) elif level == 2: - vid = osp.join(osp.basename(osp.dirname(vid)), osp.basename(vid)) + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) if test_mode: - return vid + return video else: label = class_mapping[items[1]] - return vid, label + return video, label with open(train_file, 'r') as fin: train_list = [line_to_map(x) for x in fin] @@ -201,18 +203,19 @@ def parse_sthv2_splits(level): class_mapping = json.loads(fin.read()) def line_to_map(item, test_mode=False): - vid = item['id'] + video = item['id'] if level == 1: - vid = osp.basename(vid) + video = osp.basename(video) elif level == 2: - vid = osp.join(osp.basename(osp.dirname(vid)), osp.basename(vid)) + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) if test_mode: - return vid + return video else: template = item['template'].replace('[', '') template = template.replace(']', '') label = class_mapping[template] - return vid, label + return video, label with open(train_file, 'r') as fin: items = json.loads(fin.read()) @@ -239,9 +242,9 @@ def parse_mmit_splits(): # Read the annotations def line_to_map(x): - vid = osp.splitext(x[0])[0] + video = osp.splitext(x[0])[0] labels = [int(digit) for digit in x[1:]] - return vid, labels + return video, labels csv_reader = csv.reader(open('data/mmit/annotations/trainingSet.csv')) train_list = [line_to_map(x) for x in csv_reader] @@ -284,7 +287,7 @@ def convert_label(s, keep_whitespaces=False): return 
s.replace('"', '') def line_to_map(x, test=False): - """A function to map line string to vid and label. + """A function to map line string to video and label. Args: x (str): A single line from Kinetics-400 csv file. @@ -292,22 +295,22 @@ def line_to_map(x, test=False): annotation file. Returns: - tuple[str, str]: (vid, label), vid is the video id, + tuple[str, str]: (video, label), video is the video id, label is the video label. """ if test: - # vid = f'{x[0]}_{int(x[1]):06d}_{int(x[2]):06d}' - vid = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' + # video = f'{x[0]}_{int(x[1]):06d}_{int(x[2]):06d}' + video = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' label = -1 # label unknown - return vid, label + return video, label else: - vid = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' + video = f'{x[1]}_{int(float(x[2])):06d}_{int(float(x[3])):06d}' if level == 2: - vid = f'{convert_label(x[0])}/{vid}' + video = f'{convert_label(x[0])}/{video}' else: assert level == 1 label = class_mapping[convert_label(x[0])] - return vid, label + return video, label train_file = 'data/kinetics400/annotations/kinetics_train.csv' val_file = 'data/kinetics400/annotations/kinetics_val.csv' @@ -350,9 +353,9 @@ def parse_mit_splits(): class_mapping[cat] = int(digit) def line_to_map(x): - vid = osp.splitext(x[0])[0] + video = osp.splitext(x[0])[0] label = class_mapping[osp.dirname(x[0])] - return vid, label + return video, label csv_reader = csv.reader(open('data/mit/annotations/trainingSet.csv')) train_list = [line_to_map(x) for x in csv_reader] @@ -427,13 +430,14 @@ def generate_class_index_file(): def line_to_map(line): items = line.strip().split() - vid = osp.splitext(items[0])[0] + video = osp.splitext(items[0])[0] if level == 1: - vid = osp.basename(vid) + video = osp.basename(video) elif level == 2: - vid = osp.join(osp.basename(osp.dirname(vid)), osp.basename(vid)) + video = osp.join( + osp.basename(osp.dirname(video)), osp.basename(video)) label = class_mapping[osp.dirname(items[0])] - return vid, label + return video, label splits = [] for i in range(1, 4): diff --git a/tools/flow_extraction.py b/tools/flow_extraction.py index 1c702ccf02..d0026711b4 100644 --- a/tools/flow_extraction.py +++ b/tools/flow_extraction.py @@ -86,11 +86,11 @@ def extract_dense_flow(path, frames = [] assert osp.exists(path) - vid = cv2.VideoCapture(path) - flag, f = vid.read() + video = cv2.VideoCapture(path) + flag, f = video.read() while flag: frames.append(f) - flag, f = vid.read() + flag, f = video.read() flow = generate_flow(frames, method=method)