From 55ae5ab49245543250f46875820b1bc4d5b377b9 Mon Sep 17 00:00:00 2001 From: Erik Neemann Date: Thu, 6 Apr 2023 16:29:18 -0600 Subject: [PATCH] feat: drop duplicate results --- row.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/row.py b/row.py index de315a6..9bf7d7a 100644 --- a/row.py +++ b/row.py @@ -1145,6 +1145,16 @@ def filter_results(previous_results_file, out_dir): working_df.loc[mask, "keep"] = "no" working_df.loc[mask, "zero"] = "fail" logging.info("Number of parcels equal to 0 flagged: %i", mask.value_counts()[1]) + + #: drop duplicates on the 'udot_file_name' and 'text' fields + before_length = len(working_df.index) + logging.info("Number of rows before final de-duplication: %i", before_length) + working_no_duplicates = working_df.drop_duplicates(["udot_file_name", "text"], inplace=False, ignore_index=True) + + after_length = len(working_no_duplicates.index) + duplicate_diff = before_length - after_length + logging.info("Number of rows after removing duplicates: %i", after_length) + logging.info("removed %i duplicate rows", duplicate_diff) #: save all results #: save results to CSV out_file_all = out_dir / f"final-all-ocr-results-{datetime.now().strftime('%Y-%m-%d-%H-%M')}.csv"