-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathdelexicalizer.py
91 lines (72 loc) · 2.86 KB
/
delexicalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
__author__='thiagocastroferreira'
import copy
import nltk
import os
import parser_original as parser
from entry import *
def get_entitymap(triples):
    """Assign AGENT/BRIDGE/PATIENT tags to the entities of a triple set.

    An entity first seen as a triple subject becomes an agent; first seen
    as an object it becomes a patient.  An entity that later turns up in
    the opposite role is promoted to a bridge.  Returns a list of
    TagEntity instances, agents first, then bridges, then patients,
    each numbered within its group (e.g. 'AGENT-1', 'AGENT-2', ...).
    """
    agents, bridges, patients, seen = [], [], [], []
    for t in triples:
        # Subject side: new entity -> agent; known patient -> bridge.
        if t.subject not in seen:
            seen.append(t.subject)
            agents.append(t.subject)
        elif t.subject in patients and t.subject not in bridges:
            patients.remove(t.subject)
            bridges.append(t.subject)
        # Object side: new entity -> patient; known agent -> bridge.
        if t.object not in seen:
            seen.append(t.object)
            patients.append(t.object)
        elif t.object in agents and t.object not in bridges:
            agents.remove(t.object)
            bridges.append(t.object)

    entitymap = []
    for label, group in (('AGENT', agents), ('BRIDGE', bridges), ('PATIENT', patients)):
        for i, entity in enumerate(group):
            entitymap.append(TagEntity(tag=label + '-' + str(i + 1), entity=entity))
    return entitymap
def delexicalize(entry):
    """Replace entity mentions in each lexical entry with their map tags.

    Tokenizes each entry's surface string with NLTK, then substitutes every
    entity of the entry's entitymap (underscores read as spaces) with its
    tag, storing the result in ``lexEntry.template``.  Returns the same
    (mutated) entry.

    Entities are replaced longest-first: with the previous arbitrary dict
    order, an entity that is a substring of another (e.g. 'New York' vs.
    'New York City') could be tagged inside the longer mention and corrupt
    it.  The entitymap lookup is also hoisted out of the per-entry loop,
    since it does not depend on the lexical entry.
    """
    entitymap = entry.entitymap_to_dict()
    # Longest entity string first so shorter entities cannot clobber
    # longer mentions that contain them.
    ordered_tags = sorted(entitymap, key=lambda tag: len(entitymap[tag]), reverse=True)
    for lexEntry in entry.lexEntries:
        template = ' '.join(nltk.word_tokenize(lexEntry.substring))
        for tag in ordered_tags:
            refex = entitymap[tag].replace('_', ' ')
            template = template.replace(refex, tag)
        lexEntry.template = template
    return entry
if __name__ == '__main__':
    # Parse the lexicalized test corpus, tag and delexicalize every entry,
    # then write the results grouped by triple-set size and category.
    path = 'corpus/original/test/testdata_with_lex.xml'
    entryset = parser.parse(path)

    for i in range(len(entryset)):
        entryset[i].entitymap = get_entitymap(entryset[i].modifiedtripleset)
        entryset[i] = delexicalize(entryset[i])
        # Echo each surface string next to its delexicalized template.
        for lexEntry in entryset[i].lexEntries:
            print(lexEntry.substring)
            print(lexEntry.template)
            print('-' * 10)

    path = 'corpus/delexicalized/v1.2/test'
    if not os.path.exists(path):
        os.mkdir(path)

    sizes = {entry.size for entry in entryset}
    categories = {entry.category for entry in entryset}
    for size in sizes:
        # One subdirectory per triple-set size, one XML file per category.
        ppath = os.path.join(path, str(size) + 'triples')
        if not os.path.exists(ppath):
            os.mkdir(ppath)
        size_entries = [entry for entry in entryset if entry.size == size]
        for category in categories:
            category_entries = [entry for entry in size_entries if entry.category == category]
            if category_entries:
                fname = category + '.xml'
                parser.generate(entries=category_entries, fname=os.path.join(ppath, fname))