IemocapDataset.py
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
import torch.nn.functional as F


class IemocapDataset(object):
    """
    Create a Dataset for IEMOCAP. Each item is a dict with the keys:
    'path', 'waveform', 'sample_rate', 'emotion', 'activation', 'valence', 'dominance'.
    """
    _ext_audio = '.wav'
    _emotions = {'ang': 0, 'hap': 1, 'exc': 1, 'sad': 3, 'fru': 4, 'fea': 5, 'sur': 6, 'neu': 7, 'xxx': 8}

    def __init__(self,
                 root='IEMOCAP_full_release',
                 emotions=['ang', 'hap', 'exc', 'sad', 'neu'],
                 sessions=[1, 2, 3, 4, 5],
                 script_impro=['script', 'impro'],
                 genders=['M', 'F']):
        """
        Args:
            root (string): Directory containing the Session folders.
        """
        self.root = root

        # Iterate through all 5 sessions
        data = []
        for i in range(1, 6):
            # Define path to evaluation files of this session
            path = os.path.join(root, 'Session' + str(i), 'dialog', 'EmoEvaluation')

            # Get list of evaluation files
            files = [file for file in os.listdir(path) if file.endswith('.txt')]

            # Iterate through evaluation files to get utterance-level data
            for file in files:
                # Open file and get the lines containing utterance-level data.
                # Trim and split each line into individual string elements.
                with open(os.path.join(path, file), 'r') as f:
                    data += [line.strip()
                                 .replace('[', '')
                                 .replace(']', '')
                                 .replace(' - ', '\t')
                                 .replace(', ', '\t')
                                 .split('\t')
                             for line in f if line.startswith('[')]

        # Get session number, script/impro, speaker gender, utterance number
        data = [d + [d[2][4], d[2].split('_')[1], d[2][-4], d[2][-3:]] for d in data]

        # Create pandas dataframe
        self.df = pd.DataFrame(data, columns=['start', 'end', 'file', 'emotion', 'activation', 'valence', 'dominance', 'session', 'script_impro', 'gender', 'utterance'])

        # Cast the numeric columns explicitly (passing dtype=np.float32 to the constructor
        # fails on the string columns with recent pandas versions)
        numeric_columns = ['start', 'end', 'activation', 'valence', 'dominance', 'session']
        self.df[numeric_columns] = self.df[numeric_columns].astype(np.float32)

        # Filter by emotions
        filtered_emotions = self.df['emotion'].isin(emotions)
        self.df = self.df[filtered_emotions]

        # Filter by sessions
        filtered_sessions = self.df['session'].isin(sessions)
        self.df = self.df[filtered_sessions]

        # Filter by script_impro
        filtered_script_impro = self.df['script_impro'].str.contains('|'.join(script_impro))
        self.df = self.df[filtered_script_impro]

        # Filter by gender
        filtered_genders = self.df['gender'].isin(genders)
        self.df = self.df[filtered_genders]

        # Reset indices
        self.df = self.df.reset_index()

        # Map emotion labels to numeric values
        self.df['emotion'] = self.df['emotion'].map(self._emotions).astype(np.float32)

        # Map file to correct path w.r.t. root
        self.df['file'] = [os.path.join('Session' + file[4], 'sentences', 'wav', file[:-5], file + self._ext_audio) for file in self.df['file']]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_name = os.path.join(self.root, self.df.loc[idx, 'file'])
        waveform, sample_rate = torchaudio.load(audio_name)
        emotion = self.df.loc[idx, 'emotion']
        activation = self.df.loc[idx, 'activation']
        valence = self.df.loc[idx, 'valence']
        dominance = self.df.loc[idx, 'dominance']

        sample = {
            'path': audio_name,
            'waveform': waveform,
            'sample_rate': sample_rate,
            'emotion': emotion,
            'activation': activation,
            'valence': valence,
            'dominance': dominance
        }
        return sample


def collate_fn_vgg(batch):
    # Clip or pad each utterance to 4.020 seconds of audio.
    sample_rate = 16000
    n_channels = 1
    frame_length = int(4.020 * sample_rate)

    # Initialize output
    waveforms = torch.zeros(0, n_channels, frame_length)
    emotions = torch.zeros(0)

    for item in batch:
        waveform = item['waveform']
        original_waveform_length = waveform.shape[1]
        padded_waveform = F.pad(waveform, (0, frame_length - original_waveform_length)) if original_waveform_length < frame_length else waveform[:, :frame_length]
        waveforms = torch.cat((waveforms, padded_waveform.unsqueeze(0)))
        emotions = torch.cat((emotions, torch.tensor([item['emotion']])), 0)

    return waveforms, emotions


def collate_fn_segments(batch):
    # Segment each sample into 264 ms segments with a 25 ms step (hop) between segment starts.
    sample_rate = 16000
    segment_length = int(0.264 * sample_rate)
    step_length = int(0.025 * sample_rate)

    # Initialize output
    segments = torch.zeros(0, segment_length)
    n_segments = torch.zeros(0)
    emotions = torch.zeros(0)
    filenames = []

    # Iterate through samples in batch
    for item in batch:
        waveform = item['waveform']
        original_waveform_length = waveform.shape[1]

        # Compute number of segments given input waveform, segment, and step lengths
        # (at least one segment, even for waveforms shorter than a single segment)
        item_n_segments = max(1, int(np.ceil((original_waveform_length - segment_length) / step_length) + 1))

        # Compute and apply padding to waveform
        padding_length = segment_length - original_waveform_length if original_waveform_length < segment_length else (segment_length + (item_n_segments - 1) * step_length - original_waveform_length)
        padded_waveform = F.pad(waveform, (0, padding_length))
        padded_waveform = padded_waveform.view(-1)

        # Construct tensor of segments
        item_segments = torch.zeros(item_n_segments, segment_length)
        for i in range(item_n_segments):
            item_segments[i] = padded_waveform[i*step_length:i*step_length+segment_length]
        segments = torch.cat((segments, item_segments), 0)

        # Construct tensor of emotion labels (one label per segment)
        emotion = torch.tensor([item['emotion']])
        emotions = torch.cat((emotions, emotion.repeat(item_n_segments)), 0)

        # Construct list of filenames (one entry per segment)
        filenames += [os.path.basename(item['path'])] * item_n_segments

        # Construct tensor of n_segments (number of segments per item)
        item_n_segments = torch.tensor([float(item_n_segments)])
        n_segments = torch.cat((n_segments, item_n_segments), 0)

    return segments, emotions, n_segments, filenames


def collate_fn(batch):
    # Frame each sample into 25 ms frames with a 10 ms step (hop) between frame starts.
    # For a 16 kHz signal the frame length is 0.025 * 16000 = 400 samples and the
    # frame step is 0.01 * 16000 = 160 samples, so consecutive frames overlap:
    # the first 400-sample frame starts at sample 0, the next at sample 160, and so on
    # until the end of the speech file is reached. If the speech file does not divide
    # into a whole number of frames, it is padded with zeros so that it does.
    sample_rate = 16000
    # n_channels = 1
    frame_length = int(0.025 * sample_rate)
    step_length = int(0.01 * sample_rate)

    # Initialize output
    # frames = torch.zeros(0, n_channels, frame_length)
    frames = torch.zeros(0, frame_length)
    n_frames = torch.zeros(0)
    emotions = torch.zeros(0)

    for item in batch:
        waveform = item['waveform']
        original_waveform_length = waveform.shape[1]

        # Compute number of frames given input waveform, frame and step lengths
        # (at least one frame, even for waveforms shorter than a single frame)
        item_n_frames = max(1, int(np.ceil((original_waveform_length - frame_length) / step_length) + 1))

        # Compute and apply padding to waveform
        padding_length = frame_length - original_waveform_length if original_waveform_length < frame_length else (frame_length + (item_n_frames - 1) * step_length - original_waveform_length)
        padded_waveform = F.pad(waveform, (0, padding_length))
        padded_waveform = padded_waveform.view(-1)

        # Construct tensor of frames
        # item_frames = torch.zeros(n_frames, n_channels, frame_length)
        item_frames = torch.zeros(item_n_frames, frame_length)
        for i in range(item_n_frames):
            item_frames[i] = padded_waveform[i*step_length:i*step_length+frame_length]
            # item_frames[i] = padded_waveform[:, i*step_length:i*step_length+frame_length]
        frames = torch.cat((frames, item_frames), 0)

        # Construct tensor of emotion labels (one label per frame)
        emotion = torch.tensor([item['emotion']])
        emotions = torch.cat((emotions, emotion.repeat(item_n_frames)), 0)

        # Construct tensor of n_frames (number of frames per item)
        item_n_frames = torch.tensor([float(item_n_frames)])
        n_frames = torch.cat((n_frames, item_n_frames), 0)

    return frames, emotions, n_frames


# Example: Load Iemocap dataset
# iemocap_dataset = IemocapDataset('/home/alanwuha/Documents/Projects/datasets/iemocap/IEMOCAP_full_release')
# Example: Iterate through samples
# for i in range(len(iemocap_dataset)):
# sample = iemocap_dataset[i]
# print(i, sample)
# Number of audio files by duration
# dataset_duration = np.ceil(iemocap_dataset.df['end'] - iemocap_dataset.df['start'])
# idx = np.where(dataset_duration == 35)
# durations = np.unique(dataset_duration)
# durations_count = [np.sum(dataset_duration == i) for i in durations]
# print('End')
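
# Example (sketch): batching with a DataLoader.
# The collate functions above are intended to be passed to torch.utils.data.DataLoader;
# the batch size and the choice of collate_fn below are illustrative, not part of this repo.
# from torch.utils.data import DataLoader
# loader = DataLoader(iemocap_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_vgg)
# for waveforms, emotions in loader:
#     # waveforms: (batch, 1, 64320) clipped/padded audio; emotions: (batch,) labels
#     print(waveforms.shape, emotions.shape)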