import codecs
import os
import re
import sqlite3

import click
import pandas as pd
from tqdm import tqdm

from qurator.utils.parallel import run as prun
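# The exact contract of qurator.utils.parallel.run is inferred from its usage
# below: it maps an iterable of callables over a pool of worker processes and
# supports initializer/initargs in the style of multiprocessing.Pool.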


class ChunkTask:

    # Page selection shared per worker process; populated in each pool worker
    # via ChunkTask.initialize().
    selection = None

    def __init__(self, chunk, min_line_len):
        self._chunk = chunk
        self._min_line_len = min_line_len

    def __call__(self, *args, **kwargs):
        return ChunkTask.reformat_chunk(self._chunk, self._min_line_len)

    @staticmethod
    def reformat_chunk(chunk, min_line_len):
        """
        Process a chunk of documents.

        :param chunk: pandas DataFrame that contains one document per row.
        :param min_line_len: Break the document text up into lines that have this minimum length.
        :return: One big text in which the documents are separated by an empty line.
        """
        text = ''

        for i, r in chunk.iterrows():

            # Skip rows whose text is missing (e.g. NaN).
            if not isinstance(r.text, str):
                continue

            # Normalize the document identifier so that it always carries the 'PPN' prefix.
            ppn = r.ppn if str(r.ppn).startswith('PPN') else 'PPN' + str(r.ppn)

            filename = str(r['file name'])

            # Skip pages that are not marked as selected in the selection DataFrame.
            if not ChunkTask.selection.loc[(ppn, filename)].selected.iloc[0]:
                continue

            for se in sentence_split(str(r.text), min_line_len):
                text += se

            text += '\n\n'

        return text

    @staticmethod
    def initialize(selection_file):
        # Load the page selection once per worker process and index it for
        # fast (ppn, filename) lookups.
        ChunkTask.selection = \
            pd.read_pickle(selection_file).\
            reset_index().\
            set_index(['ppn', 'filename']).\
            sort_index()
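
# Assumption (inferred from the lookups in reformat_chunk, not stated in the
# original): the pickled selection DataFrame provides 'ppn' and 'filename'
# columns plus a boolean 'selected' flag per page, e.g.:
#
#   ppn            filename        selected
#   PPN123456789   00000001.xml    True
#   PPN123456789   00000002.xml    False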

def get_csv_chunks(alto_csv_file, chunksize):
    # Stream the CSV in chunks so that arbitrarily large corpora fit into memory.
    for ch in tqdm(pd.read_csv(alto_csv_file, chunksize=chunksize)):
        yield ch


def get_sqlite_chunks(alto_sqlite_file, chunksize):
    # Yield an empty frame up front; empty chunks are filtered out again in
    # get_chunk_tasks.
    yield pd.DataFrame()

    with sqlite3.connect(alto_sqlite_file) as conn:

        # WAL journal mode allows reading while other processes write to the database.
        conn.execute('pragma journal_mode=wal')

        total = int(conn.execute('select count(*) from text;').fetchone()[0] / chunksize)

        for ch in tqdm(pd.read_sql('select * from text', conn, chunksize=chunksize), total=total):
            yield ch


def get_chunk_tasks(chunks, min_line_len):
    for chunk in chunks:

        # Skip the empty frames that get_sqlite_chunks may emit.
        if len(chunk) == 0:
            continue

        yield ChunkTask(chunk, min_line_len)


def sentence_split(s, min_len):
    """
    Reformat the text of an entire document such that each line has at least length min_len.

    :param s: str
    :param min_len: minimum line length
    :return: reformatted text
    """
    parts = s.split(' ')

    se = ''
    for p in parts:

        se += ' ' + p

        # Start a new line once it is long enough and the current token looks
        # like a sentence end: more than two characters, ending in a period
        # that is not preceded by a digit (avoids splitting on numbers like '3.').
        if len(se) > min_len and len(p) > 2 and re.match(r'.*([^0-9])[.]$', p):
            yield se + '\n'
            se = ''

    yield se + '\n'

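# For illustration (this example is not part of the original code): with
# min_len=10,
#
#   list(sentence_split('Erster Satz. Zweiter Satz.', 10))
#
# yields [' Erster Satz.\n', ' Zweiter Satz.\n', '\n'], i.e. each emitted line
# keeps a leading space, and a trailing remainder line is always produced.
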
@click.command()
@click.argument('fulltext-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('selection-file', type=click.Path(exists=True), required=True, nargs=1)
@click.argument('corpus-file', type=click.Path(), required=True, nargs=1)
@click.option('--chunksize', default=10**4, help="Process the corpus in chunks of <chunksize>. default: 10**4")
@click.option('--processes', default=6, help="Number of parallel processes. default: 6")
@click.option('--min-line-len', default=80, help="Lower bound of line length in the output file. default: 80")
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
    """
    Reads the fulltext from a CSV or SQLITE3 file (see also altotool) and writes it to one big text file.

    FULLTEXT_FILE: The CSV or SQLITE3 file to read from.

    SELECTION_FILE: Consider only the subset of all pages that is defined by the DataFrame
    stored in <selection_file>.

    CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata.
    """
    # Create the output directory if the corpus file lives in a subdirectory.
    if os.path.dirname(corpus_file):
        os.makedirs(os.path.dirname(corpus_file), exist_ok=True)

    print('Open {}.'.format(corpus_file))
    corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8')
    # Write a BOM so that downstream tools can reliably detect the UTF-8 encoding.
    corpus_fh.write(u'\ufeff')

    if fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(fulltext_file, chunksize)
    elif fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    # Process the chunks in parallel; each worker process loads the page
    # selection once via ChunkTask.initialize.
    for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes, initializer=ChunkTask.initialize,
                     initargs=(selection_file,)):

        corpus_fh.write(text)

    corpus_fh.close()


if __name__ == '__main__':
    collect()
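
# Example invocation (illustrative; the file names and the module name are
# assumptions, not part of the original):
#
#   python collect_corpus.py fulltext.sqlite3 selection.pkl data/corpus.txt \
#       --chunksize 10000 --processes 8 --min-line-len 80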