-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathproduce.ini
241 lines (205 loc) · 9.43 KB
/
produce.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
### UTILITIES #################################################################
[]
prelude =
import errno
import os
import subprocess
def makedirs(path):
    """
    Creates the directory path together with any missing parent directories.
    Does nothing if the directory already exists. Raises FileExistsError if
    path exists but is not a directory; any other OSError propagates.
    """
    # exist_ok=True tolerates an existing directory (including one created
    # concurrently) but still reports a non-directory standing in the way,
    # instead of letting a later open() fail with a less obvious error.
    os.makedirs(path, exist_ok=True)
def sync_bows(lang, sentence_id, user, layer):
    """
    Brings the file out/LANG/PR/SID/USER.LAYER.bows up to date, where PR is
    the first two characters of the sentence ID. The new contents are the
    output of src/python/bows_tsv.py. The file is only rewritten when its
    contents differ, so its timestamp is left alone otherwise. Returns True
    if the file was (re)written, False if it was already up to date.
    This is an ugly trick to simulate a dependency into the
    database. Produce 3.0 will have a feature making this trick
    superfluous.
    """
    target_dir = os.path.join('out', lang, sentence_id[:2], sentence_id)
    makedirs(target_dir)
    target_path = os.path.join(target_dir, '{}.{}.bows'.format(user, layer))
    command = ['python3', './src/python/bows_tsv.py', lang, sentence_id, user, layer]
    fresh = subprocess.check_output(command).decode('UTF-8')
    # A missing file counts as "changed": it has to be created below.
    try:
        with open(target_path, encoding='UTF-8') as f:
            up_to_date = f.read() == fresh
    except FileNotFoundError:
        up_to_date = False
    if up_to_date:
        return False
    with open(target_path, 'w', encoding='UTF-8') as f:
        f.write(fresh)
    return True
[dummy]
# Dummy task. Targets depending on this will be reproduced on every invocation.
# Rules in this file add it to their deps only when sync_bows() reports that
# the corresponding .bows file has changed.
type = task
### RAW #######################################################################
# For working from the command line, it is convenient to have a rule to create
# .raw files. This only works if the sentence is in the database though. The
# Web interface generates .raw files for sentences whether they are in the
# database or not.
[raw/%{lang}/%{pr}/%{sid}.raw]
# Runs src/python/raw.py with the language and the sentence ID as arguments
# and captures its standard output as the .raw file.
dep.raw = src/python/raw.py
recipe =
set -e
mkdir -p raw/%{lang}/%{pr}
python3 %{raw} %{lang} %{sid} > %{target}
### DERIVATION PROJECTION #####################################################
[proj-%{lang}-%{part}]
# Task (not a file) for the parts 'train' and 'traindevtest'. It has no recipe
# of its own; its dependencies are listed in the generated depfile.
cond = %{part in ('train', 'traindevtest')}
type = task
depfile = proj/proj-%{lang}-%{part}.d
[proj/proj-%{lang}-%{part}.d]
# Generates the depfile used by the proj-LANG-PART task: the standard output
# of src/python/data_proj.py, called with the language and the part.
cond = %{part in ('train', 'traindevtest')}
dep.data_proj = src/python/data_proj.py
recipe =
set -e
mkdir -p proj
python3 %{data_proj} %{lang} %{part} > %{target}
[out/%{lang}/%{pr}/%{tid}/proj/%{sid}/proj-%{part}.wordalign]
# Captures the standard output of src/python/wordalign.py, called with the
# language, tid, sid and the part (tid/sid are presumably the target and
# source sentence IDs -- confirm).
# NOTE(review): this recipe calls `python`, while the raw and depfile rules
# call `python3` -- confirm which interpreter is intended.
cond = %{part in ('train', 'traindevtest')}
dep.wordalign = src/python/wordalign.py
recipe =
set -e
mkdir -p out/%{lang}/%{pr}/%{tid}/proj/%{sid}
python %{wordalign} %{lang} %{tid} %{sid} %{part} > %{target}
[out/%{lang}/%{pr}/%{tid}/proj/%{sid}/auto.sentalign]
# Captures the standard output of src/python/sentalign.py, called with the
# English token offset file (engoff) first and the one for the other language
# (foroff) second.
dep.sentalign = src/python/sentalign.py
dep.foroff = out/%{lang}/%{pr}/%{tid}/auto.tok.off
dep.engoff = out/eng/%{sid[:2]}/%{sid}/auto.tok.off
recipe =
set -e
mkdir -p out/%{lang}/%{pr}/%{tid}/proj/%{sid}
python %{sentalign} %{engoff} %{foroff} > %{target}
[out/%{lang}/%{pr}/%{tid}/proj/%{sid}/proj-%{part}.parse.tags]
# Runs the goal main of src/prolog/derproj.pl under swipl. Its arguments are
# the English incomplete derivation, the word alignment, the sentence
# alignment and the two token offset files (toff, then soff).
# The Prolog files listed under deps are not passed on the command line; they
# are presumably loaded by derproj.pl and listed so that changes to them
# trigger a rebuild.
cond = %{part in ('train', 'traindevtest')}
dep.derproj = src/prolog/derproj.pl
deps = src/prolog/der.pl src/prolog/util.pl src/prolog/catobj.pl src/prolog/sr.pl src/prolog/slashes.pl src/prolog/ccg.pl
dep.der = out/eng/%{sid[:2]}/%{sid}/auto.der.incomplete
dep.wordalign = out/%{lang}/%{pr}/%{tid}/proj/%{sid}/proj-%{part}.wordalign
dep.sentalign = out/%{lang}/%{pr}/%{tid}/proj/%{sid}/auto.sentalign
dep.toff = out/%{lang}/%{pr}/%{tid}/auto.tok.off
dep.soff = out/eng/%{sid[:2]}/%{sid}/auto.tok.off
recipe = swipl -l %{derproj} -g main %{der} %{wordalign} %{sentalign} %{toff} %{soff} > %{target}
### TOKENIZATION ##############################################################
# We use the automatic tokenization everywhere. In the rare cases where there
# are tokenization mistakes, we have to fix them by manually inserting auto BOWs
# into the database.
[out/%{lang}/%{pr}/%{sid}/%{user}.tok.iob.noncorr]
# Tokenization for user 'auto' in all languages except deu, eng, ita and nld:
# pipes the .raw file through src/python/tokenize_quick.py.
# NOTE(review): a second rule with the same target pattern follows; presumably
# Produce uses the first rule whose cond holds, so order matters -- confirm.
cond = %{user == 'auto' and lang not in ('deu', 'eng', 'ita', 'nld')}
dep.tokenize_quick = src/python/tokenize_quick.py
dep.raw = raw/%{lang}/%{pr}/%{sid}.raw
recipe =
set -e
set -o pipefail
mkdir -p `dirname %{target}`
cat %{raw} | python3 %{tokenize_quick} > %{target}
[out/%{lang}/%{pr}/%{sid}/%{user}.tok.iob.noncorr]
# Tokenization for user 'auto' with Elephant and the per-language model
# models/tok.iob/LANG.model. The .raw file is converted by
# src/python/txt2iob.py, sent to elephant (run through ext/viasock/viasock,
# which logs to log/elephant/LANG.log), and finally the first tab on each
# output line is replaced by a space.
# This rule has the same target pattern as the tokenize_quick.py rule but a
# weaker cond.
cond = %{user == 'auto'}
dep.raw = raw/%{lang}/%{pr}/%{sid}.raw
dep.txt2iob = src/python/txt2iob.py
dep.elephant = ext/elephant/src/elephant
dep.model = models/tok.iob/%{lang}.model
recipe =
set -e
set -o pipefail
mkdir -p `dirname %{target}`
export PATH=ext/elephant/ext:$PATH
cat %{raw} | python3 %{txt2iob} | ./ext/viasock/viasock run --process-timeout 60 --server-timeout 86400 -t '^$' -T '^$' --log log/elephant/%{lang}.log %{elephant} -m %{model} -f iob -F iob | sed -e 's/\t/ /' > %{target}
[out/%{lang}/%{pr}/%{sid}/%{user}.tok.iob]
# Pipes the .tok.iob.noncorr file through src/python/bows_tok.py, which gets
# the .tok.bows file as its argument.
# The deps expression calls sync_bows() from the prelude. That call refreshes
# the .tok.bows file and, if the file changed, makes this target depend on
# the dummy task so that it is rebuilt.
cond = %{user == 'auto'}
dep.noncorr = out/%{lang}/%{pr}/%{sid}/%{user}.tok.iob.noncorr
dep.bows = src/python/bows_tok.py
dep.bowfile = out/%{lang}/%{pr}/%{sid}/%{user}.tok.bows
deps = %{'dummy' if sync_bows(lang, sid, user, 'tok') else ''}
recipe = cat %{noncorr} | python3 %{bows} %{bowfile} > %{target}
[out/%{lang}/%{pr}/%{sid}/%{user}.tok.off]
# Pipes the .tok.iob file through src/python/iob2off.py to produce the
# .tok.off file. Only defined for user 'auto'.
cond = %{user == 'auto'}
dep.iob = out/%{lang}/%{pr}/%{sid}/%{user}.tok.iob
dep.iob2off = src/python/iob2off.py
recipe = cat %{iob} | python %{iob2off} > %{target}
[out/%{lang}/%{pr}/%{sid}/%{user}.tok]
# Pipes the .tok.off file through src/python/off2tok.py to produce the .tok
# file. Only defined for user 'auto'.
cond = %{user == 'auto'}
dep.off = out/%{lang}/%{pr}/%{sid}/%{user}.tok.off
dep.off2tok = src/python/off2tok.py
recipe = cat %{off} | python %{off2tok} > %{target}
### SUPERTAGGING ##############################################################
# We leave the supertagging mainly to the parser, but if there is a BOW, we
# constrain it to use that supertag.
[out/%{lang}/%{pr}/%{sid}/%{user}.super]
# Runs src/python/bows_super.py on the user's .super.bows file and the
# automatic token offset file (always auto.tok.off, whatever the user).
# The deps expression calls sync_bows() from the prelude. That call refreshes
# the .super.bows file and, if the file changed, makes this target depend on
# the dummy task so that it is rebuilt.
dep.off = out/%{lang}/%{pr}/%{sid}/auto.tok.off
dep.bows = ./src/python/bows_super.py
dep.bowfile = out/%{lang}/%{pr}/%{sid}/%{user}.super.bows
deps = %{'dummy' if sync_bows(lang, sid, user, 'super') else ''}
recipe = python3 %{bows} %{bowfile} %{off} > %{target}
### SPANS #####################################################################
# We leave finding spans mainly to the parser (obviously), but if there is a
# BOW, we constrain it to make that span a constituent.
[out/%{lang}/%{pr}/%{sid}/%{user}.span]
# Runs src/python/bows_span.py on the user's .span.bows file and the
# automatic token offset file (always auto.tok.off, whatever the user).
# The deps expression calls sync_bows() from the prelude. That call refreshes
# the .span.bows file and, if the file changed, makes this target depend on
# the dummy task so that it is rebuilt.
dep.off = out/%{lang}/%{pr}/%{sid}/auto.tok.off
dep.bows = ./src/python/bows_span.py
dep.bowfile = out/%{lang}/%{pr}/%{sid}/%{user}.span.bows
deps = %{'dummy' if sync_bows(lang, sid, user, 'span') else ''}
recipe = python %{bows} %{bowfile} %{off} > %{target}
### PARSING ###################################################################
[out/%{lang}/%{pr}/%{sid}/%{user}.constrained]
# Builds the parser input: the automatic .tok file is piped first through
# src/python/add_supertag_constraints.py (given the user's .super file) and
# then through src/python/add_span_constraints.py (given the user's .span
# file).
dep.tok = out/%{lang}/%{pr}/%{sid}/auto.tok
dep.super = out/%{lang}/%{pr}/%{sid}/%{user}.super
dep.add_supertag_constraints = src/python/add_supertag_constraints.py
dep.span = out/%{lang}/%{pr}/%{sid}/%{user}.span
dep.add_span_constraints = src/python/add_span_constraints.py
recipe =
set -e
set -o pipefail
cat %{tok} | python %{add_supertag_constraints} %{super} | python %{add_span_constraints} %{span} > %{target}
[out/%{lang}/%{pr}/%{sid}/%{user}.parse]
# Parses the .constrained file with ext/easyccg/easyccg.jar (run through
# ext/viasock/viasock, which logs to log/easyccg/LANG.log) using the
# per-language model models/parse/LANG.model, then pipes the parser output
# through src/python/cac_renumber.py.
# generic is appended directly to the --model argument in the recipe; it is
# currently empty, so nothing is added.
dep.constrained = out/%{lang}/%{pr}/%{sid}/%{user}.constrained
dep.cac_renumber = src/python/cac_renumber.py
dep.easyccg = ext/easyccg/easyccg.jar
#generic = %{'' if lang == 'en' else ' -g'}
generic =
dep.model = models/parse/%{lang}.model
# TODO use CCGrebank-based model!
recipe =
set -e
set -o pipefail
cat %{constrained} | ./ext/viasock/viasock run --process-timeout 60 --server-timeout 86400 --log log/easyccg/%{lang}.log --output-terminator '^$' --output-prelude 1 java -jar %{easyccg} --inputFormat constrained --model %{model}%{generic} --unrestrictedRules --rootCategories S[dcl] S[wq] S[q] NP S[b]\\NP S[b] S[intj] --outputFormat boxer | python %{cac_renumber} > %{target}
#cat %{constrained} | java -jar %{easyccg} --inputFormat constrained --model %{model}%{generic} --unrestrictedRules --rootCategories S[dcl] S[wq] S[q] NP S[b]\\NP S[intj] --outputFormat boxer 2> %{target}.log > %{target}
# Add tags from other layers to parse
[out/%{lang}/%{pr}/%{sid}/%{user}.parse.tags]
# Pipes the .parse file through src/python/cac_addtags.py. The script gets the
# token offset file, then the tag name 'from' with column 1 of that file and
# the tag name 'to' with column 2 (columns are space-separated).
# The script is invoked directly, without naming an interpreter, so it must be
# executable. The recipe uses process substitution, which needs a shell that
# supports it, such as bash.
# src/python/caclib.py is listed under deps but not used on the command line;
# presumably cac_addtags.py imports it.
dep.parse = out/%{lang}/%{pr}/%{sid}/%{user}.parse
dep.insert = ./src/python/cac_addtags.py
deps = src/python/caclib.py
dep.off = out/%{lang}/%{pr}/%{sid}/auto.tok.off
recipe =
set -e
set -o pipefail
cat %{parse} | %{insert} %{off} from <(cut -d ' ' -f 1 %{off}) to <(cut -d ' ' -f 2 %{off}) > %{target}
# Convert derivations to XML
[out/%{lang}/%{pr}/%{sid}/%{user}.der.xml.incomplete]
# Runs the goal main of src/prolog/parse2xml.pl under swipl on the
# .parse.tags file and captures its standard output.
# The Prolog files under deps are not passed on the command line; presumably
# parse2xml.pl loads them.
dep.parse = out/%{lang}/%{pr}/%{sid}/%{user}.parse.tags
dep.parse2xml = src/prolog/parse2xml.pl
deps = src/prolog/util.pl src/prolog/slashes.pl
recipe = swipl -l %{parse2xml} -g main %{parse} > %{target}
# Convert derivations to Boxer Prolog format
[out/%{lang}/%{pr}/%{sid}/%{user}.der.incomplete]
# Runs the goal main of src/prolog/parse2boxer.pl under swipl on the
# .parse.tags file and captures its standard output.
# The Prolog files under deps are not passed on the command line; presumably
# parse2boxer.pl loads them.
dep.parse = out/%{lang}/%{pr}/%{sid}/%{user}.parse.tags
dep.parse2boxer = src/prolog/parse2boxer.pl
deps = src/prolog/util.pl src/prolog/slashes.pl
recipe = swipl -l %{parse2boxer} -g main %{parse} > %{target}
# Create an XML file containing all tags, used for adding missing derivation placeholders
[out/%{lang}/%{pr}/%{sid}/%{user}.lex.xml]
# Uses paste to join, in this order, columns 3, 4 onwards, 1 and 2 of the
# space-separated token offset file with the user's .super file. The result
# is piped to src/python/cols2xml.py with the arguments from, to and super.
# The recipe uses process substitution, which needs a shell that supports it,
# such as bash.
dep.off = out/%{lang}/%{pr}/%{sid}/auto.tok.off
dep.cols2xml = src/python/cols2xml.py
dep.super = out/%{lang}/%{pr}/%{sid}/%{user}.super
recipe = paste <(cut -d ' ' -f 3 %{off}) <(cut -d ' ' -f 4- %{off}) <(cut -d ' ' -f 1 %{off}) <(cut -d ' ' -f 2 %{off}) %{super} | python %{cols2xml} from to super > %{target}
# Add placeholders for sentences missing derivations to parser output
[out/%{lang}/%{pr}/%{sid}/%{user}.der.xml]
# Runs src/python/dermerge.py on the .der.xml.incomplete file and the
# .lex.xml file, in that order, and captures its standard output.
dep.incomplete = out/%{lang}/%{pr}/%{sid}/%{user}.der.xml.incomplete
dep.lexxml = out/%{lang}/%{pr}/%{sid}/%{user}.lex.xml
dep.dermerge = ./src/python/dermerge.py
recipe = python %{dermerge} %{incomplete} %{lexxml} > %{target}