forked from raraz15/ddsp_simplified
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_extraction.py
97 lines (75 loc) · 4.19 KB
/
feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from typing import List, Dict
import numpy as np
from ddsp_simplified.utils.heuristic_audio_features_generator import HeuristicAudioFeaturesGenerator
from dsp_utils.spectral_ops import compute_loudness, compute_f0, compute_mfcc, compute_logmel
from feature_names import INPUT_FEATURE_LOUDNESS_DB, INPUT_FEATURE_F0_HZ, INPUT_FEATURE_MFCC, INPUT_FEATURE_LOG_MEL
from misc_constants import AUDIO_SYNTH
from utilities import concat_dct, frame_generator
## -------------------------------------------------- Feature Extraction ---------------------------------------
def feature_extractor(audio, sample_rate=16000, model=None, frame_rate=250,
                    f0=True, loudness=True, mfcc=False, log_mel=False,
                    mfcc_nfft=1024, l_nfft=2048, logmel_nfft=2048,
                    conf_threshold=0.0):
    """Extract a dict of frame-level features from a mono audio array.

    Args:
        audio: 1-D audio samples.
        sample_rate: audio sample rate in Hz.
        model: optional synthesis model; if it has `add_reverb`, its reverb is
            applied to the audio before loudness extraction.
        frame_rate: feature frames per second for f0 and loudness.
        f0, loudness, mfcc, log_mel: flags selecting which features to compute.
        mfcc_nfft: FFT size for MFCC; should be determined by preprocessing
            timesteps.
        l_nfft, logmel_nfft: FFT sizes for loudness and log-mel, used as in
            the library.
        conf_threshold: CREPE confidence below which f0 is zeroed out.

    Returns:
        Dict mapping feature names (and 'audio') to their arrays.
    """
    features = {'audio': audio}
    if f0:
        # Use a distinct local name instead of rebinding the `f0` flag
        # parameter, so the flag and the extracted track stay separate.
        f0_hz, confidence = compute_f0(audio, sample_rate, frame_rate, viterbi=True)
        features[INPUT_FEATURE_F0_HZ] = confidence_filter(f0_hz, confidence, conf_threshold)
    if mfcc:
        # overlap and fft_size taken from the code
        # overlap is the same except for frame size 63
        features[INPUT_FEATURE_MFCC] = compute_mfcc(audio,
                                    fft_size=mfcc_nfft,
                                    overlap=0.75,
                                    mel_bins=128,
                                    mfcc_bins=30)
    if log_mel:
        features[INPUT_FEATURE_LOG_MEL] = compute_logmel(audio,
                                        bins=229, #64
                                        fft_size=logmel_nfft,
                                        overlap=0.75,
                                        pad_end=True,
                                        sample_rate=sample_rate)
    if loudness:
        # apply reverb before l extraction to match
        # room acoustics for timbre transfer
        # (features['audio'] already holds the dry signal, so rebinding
        # `audio` here only affects the loudness computation)
        if model is not None and model.add_reverb:
            audio = model.reverb({AUDIO_SYNTH: audio[np.newaxis,:]})[0]
        features[INPUT_FEATURE_LOUDNESS_DB] = compute_loudness(audio,
                                                sample_rate=sample_rate,
                                                frame_rate=frame_rate,
                                                n_fft=l_nfft,
                                                use_tf=False)
    return features
def extract_features_from_frames(frames, **kwargs):
    """Run the feature extractor on each frame and merge the results.

    Keyword arguments are forwarded unchanged to `feature_extractor`.
    """
    per_frame_features = []
    for frame in frames:
        per_frame_features.append(feature_extractor(frame, **kwargs))
    return concat_dct(per_frame_features)
def extract_features_from_audio_frames_using_heuristic_generator(
        audio_frames: List[np.ndarray],
        # Fixed annotation: this is iterated in lockstep with `audio_frames`
        # and each element is indexed with ['midi'], so it is a list of
        # dicts, not a single dict.
        midi_frames: List[Dict[str, np.ndarray]]
) -> Dict[str, np.ndarray]:
    """Build f0/loudness features heuristically from MIDI instead of audio.

    Args:
        audio_frames: audio frames aligned one-to-one with `midi_frames`.
        midi_frames: per-frame dicts; the 'midi' entry is fed to the
            heuristic generator.

    Returns:
        Concatenated dict with 'audio', f0, and loudness features.
    """
    generator = HeuristicAudioFeaturesGenerator()
    res = []
    for (audio_frame, midi_frame) in zip(audio_frames, midi_frames):
        heuristic_audio_features = generator.generate(midi_frame['midi'])
        res.append({
            'audio': audio_frame,
            INPUT_FEATURE_F0_HZ: heuristic_audio_features[INPUT_FEATURE_F0_HZ],
            INPUT_FEATURE_LOUDNESS_DB: heuristic_audio_features[INPUT_FEATURE_LOUDNESS_DB]
        })
    return concat_dct(res)
def process_track(track, sample_rate=16000, audio_length=60, frame_size=64000, **kwargs):
    """Split a track into frames and extract features for each one.

    The track is trimmed from the end to at most `audio_length` seconds;
    extra keyword arguments are forwarded to the feature extractor.
    """
    max_samples = sample_rate * audio_length
    # Slicing is a no-op when the track is already short enough.
    trimmed = track[:max_samples]
    frames = frame_generator(trimmed, frame_size)  # large chunks of audio
    return extract_features_from_frames(frames, **kwargs)
def confidence_filter(F0, confidence, threshold):
    """
    Silences the time instants where the model confidence is below the given threshold.
    """
    filtered = []
    for freq, conf in zip(F0, confidence):
        filtered.append(freq if conf >= threshold else 0.0)
    return filtered