from typing import Dict, Tuple

import numpy as np
from tensorflow.keras import layers as tfkl

from dsp_utils import spectral_ops
from dsp_utils.core import resample, midi_to_hz, hz_to_midi
from feature_names import FEATURE_F0_HZ, FEATURE_F0_MIDI_SCALED, FEATURE_LD_SCALED, INPUT_FEATURE_F0_HZ, \
    INPUT_FEATURE_LOUDNESS_DB
from utilities import at_least_3d

F0_RANGE = spectral_ops.F0_RANGE
LD_RANGE = spectral_ops.LD_RANGE
CC_RANGE = 128
VELOCITY_RANGE = 128
PITCH_RANGE = 128


class F0LoudnessAndMidiFeaturesPreprocessor(tfkl.Layer):
    """Resamples and scales the 'f0_hz' and 'loudness_db' features. Used in the supervised setting."""

    def __init__(self, timesteps=250, **kwargs):
        super().__init__(**kwargs)
        self.timesteps = timesteps

    def call(self, inputs: Dict[str, np.ndarray], *args, **kwargs):
        preprocessed_audio_features = self._preprocess_audio_features(inputs={
            INPUT_FEATURE_F0_HZ: inputs[INPUT_FEATURE_F0_HZ],
            INPUT_FEATURE_LOUDNESS_DB: inputs[INPUT_FEATURE_LOUDNESS_DB]
        })

        # All remaining inputs are treated as MIDI-derived features.
        inputs_with_midi_features_only = dict(inputs)  # create a copy
        del inputs_with_midi_features_only[INPUT_FEATURE_F0_HZ]
        del inputs_with_midi_features_only[INPUT_FEATURE_LOUDNESS_DB]
        preprocessed_midi_features = self._preprocess_midi_features(
            inputs=inputs_with_midi_features_only
        )

        return {
            **preprocessed_audio_features,
            **preprocessed_midi_features
        }

    def _preprocess_audio_features(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        # Downsample the features to self.timesteps, but do not overwrite them in the input dict.
        f0_hz = self.resample(inputs[INPUT_FEATURE_F0_HZ])
        loudness_db = self.resample(inputs[INPUT_FEATURE_LOUDNESS_DB])

        # For NN training, scale frequency and loudness to the range [0, 1]:
        # f0 is log-scaled (Hz -> MIDI) and divided by F0_RANGE,
        # loudness is mapped from [-LD_RANGE, 0] dB to [0, 1].
        f0_midi_scaled = hz_to_midi(f0_hz) / F0_RANGE
        ld_scaled = (loudness_db / LD_RANGE) + 1.0

        return {FEATURE_F0_HZ: at_least_3d(inputs[INPUT_FEATURE_F0_HZ]),  # kept at the original rate for the harmonic synth, converted to 3D here
                FEATURE_F0_MIDI_SCALED: f0_midi_scaled,  # used by the decoder in this form
                FEATURE_LD_SCALED: ld_scaled}  # used by the decoder, like f0_midi_scaled
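
    # A rough worked example of the scaling above, assuming the DDSP-style constants
    # F0_RANGE = 127 (semitones) and LD_RANGE = 120 (dB); the actual values come from
    # dsp_utils.spectral_ops:
    #   f0_hz = 440.0       -> hz_to_midi(440.0) = 69.0 -> f0_midi_scaled ~ 69 / 127 ~ 0.543
    #   loudness_db = -60.0 -> ld_scaled = -60 / 120 + 1.0 = 0.5
    # Both scaled features land in [0, 1], which is the range the decoder expects.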

    def _preprocess_midi_features(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        res = {}
        for feature_name, data in inputs.items():
            resampled = self.resample(data)
            min_value, max_value = self._get_range_of_midi_feature(feature_name)
            scaled = (resampled - min_value) / (max_value - min_value)
            res[feature_name] = scaled
        return res

    def _get_range_of_midi_feature(self, feature_name: str) -> Tuple[float, float]:
        # Pitch, velocity and CC features all share the same MIDI value range
        # (see PITCH_RANGE, VELOCITY_RANGE and CC_RANGE above), so a single range is returned.
        return 0.0, 128.0

    def resample(self, x):
        x = at_least_3d(x)
        return resample(x, self.timesteps, method="linear")


# TODO: Fix Encoder_f to downsample f0 and loudness with
#       F0LoudnessAndMidiFeaturesPreprocessor, then remove this class.
class LoudnessPreprocessor(tfkl.Layer):
    """Resamples and scales the 'loudness_db' feature only."""

    def __init__(self, timesteps=250, **kwargs):
        super().__init__(**kwargs)
        self.timesteps = timesteps

    def call(self, inputs):
        loudness_db = inputs["loudness_db"]

        # Resample the feature to self.timesteps.
        loudness_db = self.resample(loudness_db)

        # For NN training, scale loudness from [-LD_RANGE, 0] dB to the range [0, 1].
        ld_scaled = (loudness_db / LD_RANGE) + 1.0

        return {"ld_scaled": ld_scaled}

    def resample(self, x):
        x = at_least_3d(x)
        return resample(x, self.timesteps, method="linear")


# TODO: delete??
class MidiF0LoudnessPreprocessor(tfkl.Layer):
    """Scales the loudness, converts scaled MIDI back to Hz and resamples. Used in the unsupervised setting."""

    def __init__(self, timesteps=1000, **kwargs):
        super().__init__(**kwargs)
        self.timesteps = timesteps

    def call(self, inputs):
        loudness_db, f0_scaled = inputs["loudness_db"], inputs["f0_midi_scaled"]

        # Resample the features to self.timesteps.
        f0_scaled = resample(f0_scaled, self.timesteps)
        loudness_db = resample(loudness_db, self.timesteps)

        # For NN training, scale loudness from [-LD_RANGE, 0] dB to the range [0, 1].
        ld_scaled = (loudness_db / LD_RANGE) + 1.0

        # TODO: verify this conversion.
        # Convert the scaled MIDI pitch back to Hz for the synthesizer
        # (note: resampled to a fixed 1000 frames here, not self.timesteps).
        f0_hz = midi_to_hz(f0_scaled * F0_RANGE)
        f0_hz = resample(at_least_3d(f0_hz), 1000)

        return {"f0_hz": f0_hz, "loudness_db": loudness_db, "f0_midi_scaled": f0_scaled, "ld_scaled": ld_scaled}

    def resample(self, x):
        x = at_least_3d(x)
        return resample(x, self.timesteps, method="linear")
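

# A minimal smoke-test sketch of how F0LoudnessAndMidiFeaturesPreprocessor might be called.
# The f0/loudness keys come from feature_names.py; the extra "note_velocity" entry below is
# purely hypothetical and stands in for whatever MIDI-derived features the dataset provides.
if __name__ == "__main__":
    n_frames = 1000  # arbitrary frame count of the un-resampled input features
    dummy_inputs = {
        INPUT_FEATURE_F0_HZ: np.full((1, n_frames), 440.0, dtype=np.float32),        # constant A4
        INPUT_FEATURE_LOUDNESS_DB: np.full((1, n_frames), -60.0, dtype=np.float32),  # constant -60 dB
        "note_velocity": np.full((1, n_frames), 64.0, dtype=np.float32),             # hypothetical MIDI feature
    }

    preprocessor = F0LoudnessAndMidiFeaturesPreprocessor(timesteps=250)
    outputs = preprocessor(dummy_inputs)
    for name, value in outputs.items():
        # FEATURE_F0_HZ keeps its original frame count; the other features are resampled to `timesteps`.
        print(name, value.shape)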