Skip to content

Commit

Permalink
feat: filter out deed documents
Browse files: browse the repository at this point in the history
  • Loading branch information
steveoh committed Feb 14, 2023
1 parent 54519b2 commit b5f6f5a
Showing 1 changed file with 20 additions and 0 deletions.
20 changes: 20 additions & 0 deletions row_cli.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
Usage:
row_cli.py storage generate-index (--from=location) [--prefix=prefix --save-to=location]
row_cli.py storage generate-remaining-index (--full-index=location --processed-index=location) [--save-to=location]
row_cli.py index filter <file_name>
row_cli.py storage pick-range (--from=location --task-index=index --file-count=count --instances=size)
row_cli.py images process --job=name --from=location --save-to=location --index=location --task-index=index --file-count=count --instances=size
row_cli.py image convert <file_name> (--save-to=location)
@@ -140,6 +141,25 @@ def main():
if args["results"] and args["summarize"]:
row.summarize_run(args["--from"], args["<run_name>"])

if args["index"] and args["filter"]:
    # `index filter <file_name>`: copy the index to "filtered_index.txt" in
    # the same directory, dropping every line that mentions "deed", then
    # report how many lines were read and how many were dropped.
    index = Path(args["<file_name>"])
    filtered_index = index.with_name("filtered_index.txt")

    # Opening the output with mode="w" truncates it immediately. If the
    # input *is* that file, the index would be wiped before a single line
    # was read, so refuse to run instead of silently destroying it.
    if filtered_index.exists() and index.samefile(filtered_index):
        raise SystemExit(
            f"{index} is also the output file (filtered_index.txt) and would be "
            "overwritten; rename it before filtering"
        )

    total_lines = 0
    filtered_lines = 0

    # newline="" on both handles disables newline translation, so the kept
    # lines are written back with their original line endings.
    with index.open(mode="r", encoding="utf8", newline="") as index_file, filtered_index.open(
        mode="w", encoding="utf8", newline=""
    ) as filtered_index_file:
        for line in index_file:
            total_lines += 1

            # Case-insensitive substring match: any line containing "deed"
            # in any letter case is counted and skipped.
            if "deed" in line.casefold():
                filtered_lines += 1

                continue

            filtered_index_file.write(line)

    print(f"total lines: {total_lines} filtered lines: {filtered_lines}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit b5f6f5a

Please sign in to comment.