-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathmultiscale_kit.py
42 lines (35 loc) · 1.4 KB
/
multiscale_kit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import random
from nltk.tokenize import sent_tokenize
########### Augmentation-related helper functions
interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '...']
def single_multi_scale_augment(data, min_length=50, aug_mode='word'):
# format e.g. eda_deletion-0.2
# from nltk.tokenize import sent_tokenize
lines = sent_tokenize(data)
if len(lines) <= 1:
return data
if 'sentence_deletion' in aug_mode:
# sentence random deletion, similar to word deletion
# sentence_deletion-p
name, strp = aug_mode.split('-')
p = float(strp)
new_sentences = []
for sentence in lines:
r = random.uniform(0, 1)
if r > p:
new_sentences.append(sentence)
connect_ch = '' if 'zh' in aug_mode else ' '
if len(new_sentences) < 1: # if empty
return data
out = connect_ch.join(new_sentences)
return out
else:
raise NotImplementedError(f'Multiscaling mode {aug_mode} not implemented!')
def multi_scale_augment(data, min_length=50, aug_mode='word'):
new_data = data
if type(aug_mode) == list: # multiple
for aug in aug_mode:
new_data = single_multi_scale_augment(new_data, min_length, aug)
return new_data
else: # single
return single_multi_scale_augment(data, min_length, aug_mode)