diff --git a/row.py b/row.py
index c963156..b001b7d 100644
--- a/row.py
+++ b/row.py
@@ -6,6 +6,7 @@
 """
 import logging
 import math
+import re
 from datetime import datetime
 from io import BytesIO
 from itertools import islice
@@ -859,6 +860,77 @@ def download_ocr_results(bucket_name, run_name, out_dir):
         Path(ocr_file).unlink()

     return out_dir
+
+
+def filter_ocr_results(original_results_file, out_dir):
+    """filter and clean up combined OCR results and save them as a CSV file
+
+    Args:
+        original_results_file (str): path to the parquet file with the original combined results (path_to_file.gz)
+        out_dir (str): where to save the CSV file results
+
+    Returns:
+        str: the directory where the output CSV file is saved
+    """
+    #: silence pandas SettingWithCopyWarning
+    pd.options.mode.chained_assignment = None
+
+    out_dir = Path(out_dir)
+
+    if not out_dir.exists():
+        out_dir.mkdir(parents=True)
+
+    results_df = pd.read_parquet(original_results_file)
+
+    orig_length = len(results_df.index)
+    logging.info("number of rows before cleanup: %i", orig_length)
+
+    #: Add column for the original UDOT filename
+    results_df["udot_file_name"] = results_df.apply(lambda r: r["file_name"].split("/mosaics/", 1)[1].strip(), axis=1)
+
+    #: Remove spaces and newline characters adjacent to colons
+    results_df["text"] = results_df.apply(lambda r: r["text"].replace(":\n", ":").strip(), axis=1)
+    results_df["text"] = results_df.apply(lambda r: r["text"].replace("\n:", ":").strip(), axis=1)
+    results_df["text"] = results_df.apply(lambda r: r["text"].replace(": ", ":").strip(), axis=1)
+    results_df["text"] = results_df.apply(lambda r: r["text"].replace(" :", ":").strip(), axis=1)
+    #: Then remove newline characters and replace with spaces
+    results_df["text"] = results_df.apply(lambda r: r["text"].replace("\n", " ").strip(), axis=1)
+
+    #: Remove special characters except for colons with a regular expression
+    regex = r"[^a-zA-Z0-9 ](?<!:)"
+    results_df["text"] = results_df.apply(lambda r: re.sub(regex, "", r["text"]), axis=1)
+
+    #: Split the text into lists of words
+    results_df["text"] = results_df["text"].str.split()
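+    #: a row whose text contained only special characters is an empty list at
+    #: this point; drop those rows before joining the words back together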
+    results_df = results_df[results_df["text"].str.len() > 0]
+
+    #: Convert list column to string
+    results_df["text"] = results_df.apply(lambda r: " ".join(r["text"]), axis=1)
+
+    intermediate_length = len(results_df.index)
+    logging.info("number of rows before de-duplicating: %i", intermediate_length)
+    results_df.drop_duplicates(inplace=True, ignore_index=True)
+
+    final_length = len(results_df.index)
+    diff = intermediate_length - final_length
+    logging.info("number of rows after removing duplicates: %i", final_length)
+    logging.info("removed %i duplicate rows", diff)
+
+    out_file = out_dir / "filtered_ocr_results.csv"
+    results_df.to_csv(out_file)
+    logging.info("saved filtered ocr results to %s", out_file)
+
+    return out_dir


 def summarize_run(folder, run_name):
     """summarize the results of a run
diff --git a/row_cli.py b/row_cli.py
index 34da7bc..99c6599 100644
--- a/row_cli.py
+++ b/row_cli.py
@@ -15,6 +15,7 @@
     row_cli.py results download (--from=location)
     row_cli.py results summarize (--from=location)
     row_cli.py ocr-results download (--from=location --save-to=location)
+    row_cli.py ocr-results filter <file_name> (--save-to=location)

 Options:
     --from=location     The bucket or directory to operate on
@@ -31,6 +32,7 @@
     python row_cli.py process circles --job=test --from=./test-data --save-to=./.ephemeral --index=./test-data --task-index=0 --file-count=1 --instances=1 --project=123456789 --processor=123456789
     python row_cli.py results download bobcat --from=bucket-name
     python row_cli.py ocr-results download alligator --from=bucket-name --save-to=./data
+    python row_cli.py ocr-results filter ./data/alligator/combined_ocr_results --save-to=./data
 """

 import logging
@@ -171,6 +173,11 @@ def main():

         print(f"files downloaded to {location}")

+    if args["ocr-results"] and args["filter"]:
+        location = row.filter_ocr_results(args["<file_name>"], args["--save-to"])
+
+        print(f"filtered results saved to {location}")
+
     if args["index"] and args["filter"]:
         index = Path(args["<file_name>"])
         total_lines = 0
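A note on the colon-preserving pattern added in row.py: the negated character
class [^a-zA-Z0-9 ] matches any special character, and the negative lookbehind
(?<!:) then rejects the match when the character just consumed was a colon, so
colons survive while every other special character is stripped. A minimal
sketch of the cleanup chain follows; the sample string is illustrative and not
from the repository:

    import re

    #: remove any character that is not alphanumeric or a space, unless the
    #: matched character is a colon (the negative lookbehind spares it)
    REGEX = r"[^a-zA-Z0-9 ](?<!:)"

    sample = "Parcel No:\n 123-45 (Sheet #2)"

    #: normalize whitespace around colons, then drop remaining newlines
    text = sample.replace(":\n", ":").replace("\n:", ":")
    text = text.replace(": ", ":").replace(" :", ":")
    text = text.replace("\n", " ").strip()

    print(re.sub(REGEX, "", text))  #: Parcel No:12345 Sheet 2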