-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathdont_patronize_me.py
122 lines (111 loc) · 3.66 KB
/
dont_patronize_me.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer
class DontPatronizeMe:
    """Loader for the tab-separated "Don't Patronize Me!" dataset files.

    The ``load_*`` methods do not return anything; each one parses a file and
    stores the resulting pandas dataframe on the instance:

    * ``load_task1`` -> ``self.train_task1_df``
    * ``load_task2`` -> ``self.train_task2_df``
    * ``load_test``  -> ``self.test_set_df``

    Args:
        train_path: Directory containing ``dontpatronizeme_pcl.tsv`` and
            ``dontpatronizeme_categories.tsv``.
        test_path: Path of the test-set TSV file.
    """

    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path
        # Populated lazily by load_task1 / load_task2 / load_test.
        self.train_task1_df = None
        self.train_task2_df = None
        self.test_set_df = None

    @staticmethod
    def _read_fields(path, skip=0):
        """Yield the tab-separated fields of each non-blank line of *path*.

        The first *skip* lines are ignored. Every remaining line is stripped
        of surrounding whitespace before splitting; lines that are empty after
        stripping (e.g. a trailing newline at the end of the file) are skipped
        instead of producing a one-element row.
        """
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(path, encoding='utf-8') as f:
            for line_no, line in enumerate(f):
                if line_no < skip:
                    continue
                stripped = line.strip()
                if stripped:
                    yield stripped.split('\t')

    def load_task1(self):
        """
        Load task 1 training set and convert the tags into binary labels.
        Paragraphs with original labels of 0 or 1 are considered to be negative examples of PCL and will have the label 0 = negative.
        Paragraphs with original labels of 2, 3 or 4 are considered to be positive examples of PCL and will have the label 1 = positive.
        The resulting pandas dataframe (columns par_id, art_id, keyword, country,
        text, label, orig_label) is stored in ``self.train_task1_df``; the method
        itself returns None.
        """
        rows = []
        path = os.path.join(self.train_path, 'dontpatronizeme_pcl.tsv')
        # The first four lines of the file are not data rows and are skipped.
        for fields in self._read_fields(path, skip=4):
            orig_label = fields[-1]  # the label is always the last column
            rows.append(
                {'par_id': fields[0],
                 'art_id': fields[1],
                 'keyword': fields[2],
                 'country': fields[3],
                 'text': fields[4],
                 # Anything other than '0' / '1' counts as positive.
                 'label': 0 if orig_label in ('0', '1') else 1,
                 'orig_label': orig_label,
                 }
            )
        df = pd.DataFrame(rows, columns=['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label'])
        self.train_task1_df = df

    def load_task2(self, return_one_hot=True):
        """
        Read the data for task 2 and present it as paragraphs with their category labels.

        Rows of the categories file that share the same (par_id, art_id, text,
        keyword, country) are merged into one paragraph whose label is the list
        of distinct category ids, in order of first appearance.

        Args:
            return_one_hot: When True (default) each label is binarized into a
                vector with seven positions, "activated or not (1 or 0)",
                depending on whether the category is present in the paragraph.
                When False the plain list of category ids is kept.

        The resulting pandas dataframe (columns par_id, art_id, text, keyword,
        country, label) is stored in ``self.train_task2_df``; the method itself
        returns None.

        Raises:
            KeyError: If a row carries a category name that is not in the map.
        """
        tag2id = {
            'Unbalanced_power_relations': 0,
            'Shallow_solution': 1,
            'Presupposition': 2,
            'Authority_voice': 3,
            'Metaphors': 4,
            'Compassion': 5,
            'The_poorer_the_merrier': 6
        }
        print('Map of label to numerical label:')
        print(tag2id)
        data = defaultdict(list)
        path = os.path.join(self.train_path, 'dontpatronizeme_categories.tsv')
        # The first four lines of the file are not data rows and are skipped.
        for fields in self._read_fields(path, skip=4):
            # Column order in this file: par_id, art_id, text, keyword, country, ...
            key = (fields[0], fields[1], fields[2], fields[3], fields[4])
            # The category name is the second-to-last column.
            labelid = tag2id[fields[-2]]
            if labelid not in data[key]:
                data[key].append(labelid)
        labels = list(data.values())
        if return_one_hot:
            # Pin the class list so the output always has one column per known
            # category, even when a category never occurs in the loaded data.
            labels = MultiLabelBinarizer(classes=sorted(tag2id.values())).fit_transform(labels)
        # dict preserves insertion order, so keys and labels stay aligned.
        records = [key + (label,) for key, label in zip(data, labels)]
        df = pd.DataFrame(records, columns=['par_id',
                                            'art_id',
                                            'text',
                                            'keyword',
                                            'country',
                                            'label',
                                            ])
        self.train_task2_df = df

    def load_test(self):
        """
        Load the test set from ``self.test_path`` (no lines are skipped) and
        store it in ``self.test_set_df`` with the columns par_id, art_id,
        keyword, country and text. Blank lines are ignored. Returns None.
        """
        rows = list(self._read_fields(self.test_path))
        self.test_set_df = pd.DataFrame(rows, columns="par_id art_id keyword country text".split())