
Commit c2ff011

move bert pre-training code to sbb_ner
1 parent 9cc644e commit c2ff011

File tree

5 files changed: +834 -1 lines changed


qurator/sbb_ner/models/corpus.py

+163
@@ -0,0 +1,163 @@
import re
import pandas as pd
from tqdm import tqdm as tqdm
import click
import codecs
import os
import sqlite3

from qurator.utils.parallel import run as prun


class ChunkTask:
    """
    Callable task that reformats one chunk of documents.
    The page selection table is shared class state, set up once per worker
    process via initialize().
    """

    selection = None

    def __init__(self, chunk, min_line_len):

        self._chunk = chunk
        self._min_line_len = min_line_len

    def __call__(self, *args, **kwargs):

        return ChunkTask.reformat_chunk(self._chunk, self._min_line_len)

    @staticmethod
    def reformat_chunk(chunk, min_line_len):
        """
        Process a chunk of documents.

        :param chunk: pandas DataFrame that contains one document per row.
        :param min_line_len: Break the document text up into lines that have this minimum length.
        :return: One big text where the documents are separated by an empty line.
        """

        text = ''

        for i, r in chunk.iterrows():

            if type(r.text) != str:
                continue

            ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + r.ppn

            filename = str(r['file name'])

            # Skip pages that have not been marked as selected.
            if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]:
                continue

            for se in sentence_split(str(r.text), min_line_len):

                text += se

            text += '\n\n'

        return text

    @staticmethod
    def initialize(selection_file):

        ChunkTask.selection = \
            pd.read_pickle(selection_file).\
            reset_index().\
            set_index(['ppn', 'filename']).\
            sort_index()


def get_csv_chunks(alto_csv_file, chunksize):
    # Stream the fulltext CSV in chunks of <chunksize> rows.

    for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)):

        yield ch


def get_sqlite_chunks(alto_sqlite_file, chunksize):
    # Stream the 'text' table of the SQLITE3 file in chunks of <chunksize> rows.

    yield pd.DataFrame()

    with sqlite3.connect(alto_sqlite_file) as conn:

        conn.execute('pragma journal_mode=wal')

        total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize)

        for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total):

            yield ch


def get_chunk_tasks(chunks, min_line_len):
    # Wrap each non-empty chunk in a ChunkTask so it can be processed in parallel.

    for chunk in chunks:

        if len(chunk) == 0:
            continue

        yield ChunkTask(chunk, min_line_len)


def sentence_split(s, min_len):
    """
    Reformat the text of an entire document such that each line has at least length min_len.

    :param s: str
    :param min_len: minimum line length
    :return: reformatted text
    """

    parts = s.split(' ')

    se = ''
    for p in parts:

        se += ' ' + p

        if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p):
            yield se + '\n'
            se = ''

    yield se + '\n'


@click.command()
@click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('corpus-file', type=click.Path(), required=True, nargs=1)
@click.option('--chunksize', default=10**4, help="Process the corpus in chunks of <chunksize>. default: 10**4")
@click.option('--processes', default=6, help="Number of parallel processes. default: 6")
@click.option('--min-line-len', default=80, help="Lower bound of line length in the output file. default: 80")
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
    """
    Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and writes it to one big text file.

    FULLTEXT_FILE: The CSV or SQLITE3 file to read from.

    SELECTION_FILE: Consider only the subset of all pages that is defined by the DataFrame
    stored in <selection_file>.

    CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata.
    """
    os.makedirs(os.path.dirname(corpus_file), exist_ok=True)

    print('Open {}.'.format(corpus_file))
    corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8')
    corpus_fh.write(u'\ufeff')

    if fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(fulltext_file, chunksize)
    elif fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes, initializer=ChunkTask.initialize,
                     initargs=(selection_file,)):

        corpus_fh.write(text)

    corpus_fh.close()

    return


if __name__ == '__main__':
    collect()
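For orientation, here is a minimal sketch of the two inputs that collect expects, inferred only from the column accesses above (r.ppn, r['file name'] and r.text on the fulltext table; ppn, filename and selected on the selection DataFrame). The file names and values are hypothetical; the exact schema of the production selection file is not part of this commit.

import pandas as pd

# Fulltext table: one page/document per row, e.g. as produced by altotool.
fulltext = pd.DataFrame({
    'ppn':       ['PPN123456789', 'PPN123456789'],
    'file name': ['00000001.xml', '00000002.xml'],
    'text':      ['Erste Seite. Etwas Text hier.', 'Zweite Seite. Noch mehr Text.'],
})
fulltext.to_csv('toy_fulltext.csv', index=False)

# Page selection: a pickled DataFrame with ppn, filename and a boolean
# 'selected' column; ChunkTask.initialize() re-indexes it on (ppn, filename).
# Note: reformat_chunk reads selection.loc[(ppn, filename)].selected.iloc[0],
# so the real table may hold several rows per (ppn, filename) pair -- this toy
# frame only illustrates the columns involved.
selection = pd.DataFrame({
    'ppn':      ['PPN123456789', 'PPN123456789'],
    'filename': ['00000001.xml', '00000002.xml'],
    'selected': [True, False],
})
selection.to_pickle('toy_selection.pkl')

Given inputs of this shape, the command takes the three positional arguments and options defined above, along the lines of python -m qurator.sbb_ner.models.corpus <fulltext-file> <selection-file> <corpus-file> --chunksize 10000 --processes 6 --min-line-len 80; how (or whether) it is exposed as an installed console script is not shown in this commit.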

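The line format of the corpus file itself is determined by sentence_split: whitespace-separated tokens are accumulated until the running line exceeds min_len characters and the current token has more than two characters and ends in a period not preceded by a digit; the line is then flushed and a new one started. A small sketch of this behaviour, assuming the qurator.sbb_ner package from this repository is importable:

# Illustrates the line format that collect() writes into the corpus file.
from qurator.sbb_ner.models.corpus import sentence_split

doc = 'Dies ist ein Satz. Und hier noch einer. Danach folgt der Rest.'

# Each yielded string is one corpus line ending in '\n'; the leftover tail of
# the document is yielded as the final (possibly short or empty) line.
for line in sentence_split(doc, 20):
    print(repr(line))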