-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathm2_to_ixml.py
160 lines (140 loc) · 4.69 KB
/
m2_to_ixml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- coding: utf-8 -*-
#!/usr/bin/python
# Copyright (c) 2015 Mariano Felice
#
# Converts an M^2 Scorer .m2 file to the I-measure XML format.
#
# This script is part of the I-measure package and is covered by the MIT License.
#
from elementtree.SimpleXMLWriter import XMLWriter
from itertools import groupby
import nltk # For tokenisation of corrections
import sys
# Python 2 idiom: reload `sys` to get `setdefaultencoding` back, then make
# UTF-8 the default encoding for implicit str/unicode conversions.
# NOTE(review): `reload` is not a builtin on Python 3, so this script is
# Python 2 only as written.
reload(sys)
sys.setdefaultencoding('utf-8')
# Usage text; it is printed when no "-in:" argument is supplied.
help_str = \
'''Usage: python ''' + sys.argv[0] + ''' -in:<file> [-out:<file>]
\t -in: Input .m2 file.
\t -out: Output file. Default is input filename .ieval.xml.
'''
# Globals
in_file = None   # Path of the input .m2 file (set from "-in:<file>")
out_file = None  # Path of the output XML file (set from "-out:<file>")
### FUNCTIONS ###
def cluster_has_overlap(c, e):
    """Tell whether edit `e` overlaps at least one edit in cluster `c`.

    `c` is an iterable of edits and `e` is a single edit; each pair is
    compared with edit_has_overlap(e, member). An empty cluster never
    overlaps anything.
    """
    for member in c:
        if edit_has_overlap(e, member):
            return True
    return False
def edit_has_overlap(e1, e2):
    """Return True if two edits cover overlapping token ranges.

    Each edit is a sequence whose first two items are token offsets:
      [0]: start offset
      [1]: end offset
    Any further items are ignored. Spans are expected to be well formed
    (start <= end).

    Rules:
      * Identical spans always overlap (this includes two insertions at
        the same position).
      * An insertion (start == end) overlaps another edit only when it
        lies strictly inside that edit's span, not on its boundary.
      * Two non-empty spans overlap when they share at least one token.

    The result does not depend on the argument order; in particular a
    span that strictly contains the other one is reported as overlapping
    whichever of the two is passed first.
    """
    start1, end1 = e1[0], e1[1]
    start2, end2 = e2[0], e2[1]
    if start1 == start2 and end1 == end2:
        return True
    if start1 == end1:
        # e1 is an insertion point: it must fall strictly inside e2.
        return start2 < start1 < end2
    if start2 == end2:
        # e2 is an insertion point: it must fall strictly inside e1.
        return start1 < start2 < end1
    # Standard half-open interval intersection test, symmetric in e1/e2.
    return start1 < end2 and start2 < end1
def group_by_alternatives(cluster):
    """Split a cluster of edits into one list of edits per annotator.

    The annotator id is the last item of every edit. The result is a list
    of groups ordered by annotator id; inside a group the edits keep the
    relative order they had in `cluster`.

    The input list is left untouched: a sorted copy is grouped instead of
    sorting `cluster` in place.
    """
    def annotator_of(edit):
        return edit[-1]

    # groupby() only merges adjacent items, so order by annotator first.
    # sorted() is stable, which preserves the per-annotator edit order.
    ordered = sorted(cluster, key=annotator_of)
    return [list(group) for _, group in groupby(ordered, annotator_of)]
def get_type(cluster):
    """Return the error type label of a cluster.

    The type of each edit is its item [2]. Distinct types found in the
    cluster are joined with '/'. They are sorted first so that the label
    is the same on every run instead of following set iteration order.
    An empty cluster yields an empty string.
    """
    distinct_types = set(edit[2] for edit in cluster)
    return '/'.join(sorted(distinct_types))
### MAIN ###
# Parse "-in:<file>" and "-out:<file>" from the command line. Unknown
# arguments are ignored; if an option is repeated, the last one wins.
for arg in sys.argv[1:]:
    if arg.startswith("-in:"):
        in_file = arg[4:]
    if arg.startswith("-out:"):
        out_file = arg[5:]
# Do we have what we need?
if not in_file:
    # Single-argument print(...) behaves the same with or without the
    # print function, and sys.exit does not depend on the `site` module
    # the way the bare `exit` helper does. Exit status stays 0.
    print(help_str)
    sys.exit(0)
# Read gold standard annotations
src_sents = []   # Source tokens of each sentence
annotators = []  # Set of annotator ids seen for each sentence
ref_annot = []   # Edits per sentence: [start, end, type, correction, annotator]
s = -1           # Index of the sentence currently being read
# The context manager closes the file even if a malformed line raises.
with open(in_file, "r") as f_in:
    for line in f_in:
        if line[0] == "S":
            s += 1
            # Get and save original sentence (drop the leading "S" marker)
            src_sents.append(line.split()[1:])
            annotators.append(set())
            ref_annot.append([])
        elif line[0] == "A":
            # Save annotations. Fields are separated by "|||"; the first
            # one holds the "A" marker followed by start and end offsets.
            tokens = line.split("|||")
            coords = tokens[0].split()
            c_start = int(coords[1])
            c_end = int(coords[2])
            etype = tokens[1]
            # Uses only the first correction
            correction = tokens[2].split("||")[0]
            # Tokenise it just in case!
            correction = ' '.join(nltk.word_tokenize(correction))
            annotator = int(tokens[5])
            # The annotator is recorded even for a noop, so that the
            # sentence's annotator count includes them.
            annotators[s].add(annotator)
            is_noop = (c_start == -1 and c_end == -1 and
                       etype.lower() == "noop")
            if not is_noop:
                # Noop --> empty set of edits (source is right), so only
                # real edits are stored.
                ref_annot[s].append(
                    [c_start, c_end, etype, correction, annotator])
# Create the output XML
if not out_file:
    out_file = in_file + ".ieval.xml"
# Open the file ourselves and hand the file object to XMLWriter, so that
# the output is closed (and therefore flushed) deterministically, even if
# an error interrupts the conversion.
xml_file = open(out_file, "w")
try:
    f_out = XMLWriter(xml_file, "UTF-8")
    f_out.declaration()
    f_out.start("scripts")
    f_out.start("script", id="1") # Assume only one script
    # Do clustering
    for s, edits in enumerate(ref_annot):
        sys.stdout.write("\rSentence %s..." % (s+1))
        sys.stdout.flush()
        clusters = []
        # Sort edits from longest to shortest range, so each cluster is
        # seeded by its longest edit and shorter ones are merged into it.
        edits.sort(key=lambda x: x[0] - x[1])
        for e in edits: # Go through each edit
            for c in clusters:
                if cluster_has_overlap(c, e):
                    # If the edit overlaps with an existing cluster, merge
                    # it into the first such cluster.
                    c.append(e)
                    break
            else:
                # No break: the edit couldn't be merged, create a new
                # cluster for it.
                clusters.append([e])
        # Sort clusters by start and end offsets of their first edit
        clusters.sort(key=lambda x: (x[0][0], x[0][1]))
        # Write to XML
        num_annotators = len(annotators[s])
        f_out.start("sentence", id=str(s+1), numann=str(num_annotators))
        f_out.element("text", ' '.join(src_sents[s]))
        f_out.start("error-list")
        # Clusters
        for i, cluster in enumerate(clusters):
            alternatives = group_by_alternatives(cluster)
            # An error is required only if every annotator of the
            # sentence contributed an alternative to this cluster.
            required = 'yes' if len(alternatives) == num_annotators else 'no'
            f_out.start("error", id=str(i+1), type=get_type(cluster),
                        req=required)
            # Alternatives (one per annotator)
            for alternative in alternatives:
                f_out.start("alt", ann=str(alternative[0][4]))
                # Corrections
                for edit in alternative:
                    f_out.element("c", edit[3],
                                  start=str(edit[0]), end=str(edit[1]))
                f_out.end("alt")
            f_out.end("error")
        f_out.end("error-list")
        f_out.end("sentence")
    f_out.end("script")
    f_out.end("scripts")
finally:
    xml_file.close()
# Terminate the "\rSentence N..." progress line.
print("")