import codecs
import os
import re
import sqlite3

import click
import pandas as pd
from tqdm import tqdm

from qurator.utils.parallel import run as prun
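# The exact contract of qurator.utils.parallel.run is inferred from its usage
# below: it maps an iterable of callables over a pool of worker processes and
# supports initializer/initargs in the style of multiprocessing.Pool.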


class ChunkTask:

    # Page selection shared per worker process; populated in each pool worker
    # via ChunkTask.initialize().
    selection = None

    def __init__(self, chunk, min_line_len):
        self._chunk = chunk
        self._min_line_len = min_line_len

    def __call__(self, *args, **kwargs):
        return ChunkTask.reformat_chunk(self._chunk, self._min_line_len)

    @staticmethod
    def reformat_chunk(chunk, min_line_len):
        """
        Process a chunk of documents.

        :param chunk: pandas DataFrame that contains one document per row.
        :param min_line_len: Break the document text up into lines that have this minimum length.
        :return: One big text in which the documents are separated by an empty line.
        """
        text = ''

        for i, r in chunk.iterrows():

            # Skip rows whose text is missing (e.g. NaN).
            if not isinstance(r.text, str):
                continue

            # Normalize the document identifier so that it always carries the 'PPN' prefix.
            ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + str(r.ppn)

            filename = str(r['file name'])

            # Skip pages that are not marked as selected in the selection DataFrame.
            if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]:
                continue

            for se in sentence_split(str(r.text), min_line_len):
                text += se

            text += '\n\n'

        return text

    @staticmethod
    def initialize(selection_file):
        # Load the page selection once per worker process and index it for
        # fast (ppn, filename) lookups.
        ChunkTask.selection = \
            pd.read_pickle(selection_file).\
            reset_index().\
            set_index(['ppn', 'filename']).\
            sort_index()
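
# Assumption (inferred from the lookups in reformat_chunk, not stated in the
# original): the pickled selection DataFrame provides 'ppn' and 'filename'
# columns plus a boolean 'selected' flag per page, e.g.:
#
#   ppn            filename        selected
#   PPN123456789   00000001.xml    True
#   PPN123456789   00000002.xml    False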

def get_csv_chunks(alto_csv_file, chunksize):
    # Stream the CSV in chunks so that arbitrarily large corpora fit into memory.
    for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)):
        yield ch


def get_sqlite_chunks(alto_sqlite_file, chunksize):
    # Yield an empty frame up front; empty chunks are filtered out again in
    # get_chunk_tasks.
    yield pd.DataFrame()

    with sqlite3.connect(alto_sqlite_file) as conn:

        # WAL journal mode allows reading while other processes write to the database.
        conn.execute('pragma journal_mode=wal')

        total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize)

        for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total):
            yield ch


def get_chunk_tasks(chunks, min_line_len):
    for chunk in chunks:

        # Skip the empty frames that get_sqlite_chunks may emit.
        if len(chunk) == 0:
            continue

        yield ChunkTask(chunk, min_line_len)


def sentence_split(s, min_len):
    """
    Reformat the text of an entire document such that each line has at least length min_len.

    :param s: str
    :param min_len: minimum line length
    :return: reformatted text
    """
    parts = s.split(' ')

    se = ''
    for p in parts:

        se += ' ' + p

        # Start a new line once it is long enough and the current token looks
        # like a sentence end: more than two characters, ending in a period
        # that is not preceded by a digit (avoids splitting on numbers like '3.').
        if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p):
            yield se + '\n'
            se = ''

    yield se + '\n'

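# For illustration (this example is not part of the original code): with
# min_len=10,
#
#   list(sentence_split('Erster Satz. Zweiter Satz.', 10))
#
# yields [' Erster Satz.\n', ' Zweiter Satz.\n', '\n'], i.e. each emitted line
# keeps a leading space, and a trailing remainder line is always produced.
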
@click.command()
@click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('corpus-file', type=click.Path(), required=True, nargs=1)
@click.option('--chunksize', default=10**4, help="Process the corpus in chunks of <chunksize>. default: 10**4")
@click.option('--processes', default=6, help="Number of parallel processes. default: 6")
@click.option('--min-line-len', default=80, help="Lower bound of line length in the output file. default: 80")
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
    """
    Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and writes it to one big text file.

    FULLTEXT_FILE: The CSV or SQLITE3 file to read from.

    SELECTION_FILE: Consider only the subset of all pages that is defined by the DataFrame
    stored in <selection_file>.

    CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata.
    """
    # Create the output directory if the corpus file lives in a subdirectory.
    if os.path.dirname(corpus_file):
        os.makedirs(os.path.dirname(corpus_file), exist_ok=True)

    print('Open {}.'.format(corpus_file))
    corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8')
    # Write a BOM so that downstream tools can reliably detect the UTF-8 encoding.
    corpus_fh.write(u'\ufeff')

    if fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(fulltext_file, chunksize)
    elif fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    # Process the chunks in parallel; each worker process loads the page
    # selection once via ChunkTask.initialize.
    for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes, initializer=ChunkTask.initialize,
                     initargs=(selection_file,)):

        corpus_fh.write(text)

    corpus_fh.close()


if __name__ == '__main__':
    collect()
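
# Example invocation (illustrative; the file names and the module name are
# assumptions, not part of the original):
#
#   python collect_corpus.py fulltext.sqlite3 selection.pkl data/corpus.txt \
#       --chunksize 10000 --processes 8 --min-line-len 80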