Commit 2cf6a77

Merge pull request #307 from tarun-menta/textract
Add Textract OCR Benchmark
2 parents 40302e8 + d2ead6a commit 2cf6a77

2 files changed (+63 / -5 lines)

benchmark/recognition.py (+34 / -5)
@@ -10,6 +10,7 @@
 from surya.settings import settings
 from surya.recognition.languages import CODE_TO_LANGUAGE
 from benchmark.utils.tesseract import tesseract_ocr_parallel, surya_lang_to_tesseract, TESS_CODE_TO_LANGUAGE
+from benchmark.utils.textract import textract_ocr_parallel
 import os
 import datasets
 import json
@@ -22,22 +23,24 @@
 @click.option("--results_dir", type=str, help="Path to JSON file with OCR results.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
 @click.option("--max_rows", type=int, help="Maximum number of pdf pages to OCR.", default=None)
 @click.option("--debug", is_flag=True, help="Enable debug mode.", default=False)
-@click.option("--tesseract", is_flag=True, help="Run tesseract instead of surya.", default=False)
+@click.option("--tesseract", is_flag=True, help="Run benchmarks on tesseract.", default=False)
+@click.option("--textract", is_flag=True, help="Run benchmarks on textract.", default=False)
 @click.option("--langs", type=str, help="Specify certain languages to benchmark.", default=None)
 @click.option("--tess_cpus", type=int, help="Number of CPUs to use for tesseract.", default=28)
+@click.option("--textract_cpus", type=int, help="Number of CPUs to use for textract.", default=28)
 @click.option("--specify_language", is_flag=True, help="Pass language codes into the model.", default=False)
-def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, langs: str, tess_cpus: int, specify_language: bool):
+def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, textract: bool, langs: str, tess_cpus: int, textract_cpus:int, specify_language: bool):
     rec_predictor = RecognitionPredictor()
 
     split = "train"
-    if max_rows:
-        split = f"train[:{max_rows}]"
-
     dataset = datasets.load_dataset(settings.RECOGNITION_BENCH_DATASET_NAME, split=split)
 
     if langs:
         langs = langs.split(",")
         dataset = dataset.filter(lambda x: x["language"] in langs, num_proc=4)
+
+    if max_rows and max_rows<len(dataset):
+        dataset = dataset.shuffle().select(range(max_rows))
 
     images = list(dataset["image"])
     images = convert_if_not_rgb(images)
@@ -121,6 +124,28 @@ def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, langs: s
         with open(os.path.join(result_path, "tesseract_scores.json"), "w+") as f:
             json.dump(tess_scores, f)
 
+    if textract:
+        start = time.time()
+        textract_predictions = textract_ocr_parallel(images, cpus=textract_cpus)
+        textract_time = time.time()-start
+
+        textract_scores = defaultdict(list)
+        for idx, (pred, ref_text, lang) in enumerate(zip(textract_predictions, line_text, lang_list)):
+            image_score = overlap_score(pred, ref_text)
+            for l in lang:
+                textract_scores[CODE_TO_LANGUAGE[l]].append(image_score)
+
+        flat_textract_scores = [s for l in textract_scores for s in textract_scores[l]]
+        benchmark_stats["textract"] = {
+            "avg_score": sum(flat_textract_scores) / len(flat_textract_scores),
+            "lang_scores": {l: sum(scores) / len(scores) for l, scores in textract_scores.items()},
+            "time_per_img": textract_time / len(images)
+        }
+        print(len(flat_textract_scores))
+
+        with open(os.path.join(result_path, "textract_scores.json"), "w+") as f:
+            json.dump(textract_scores, f)
+
     with open(os.path.join(result_path, "results.json"), "w+", encoding="utf-8") as f:
         json.dump(benchmark_stats, f)
 
@@ -133,6 +158,10 @@ def main(results_dir: str, max_rows: int, debug: bool, tesseract: bool, langs: s
         table_data.append(
             ["tesseract", benchmark_stats["tesseract"]["time_per_img"], benchmark_stats["tesseract"]["avg_score"]] + [benchmark_stats["tesseract"]["lang_scores"].get(l, 0) for l in key_languages]
         )
+    if textract:
+        table_data.append(
+            ["textract", benchmark_stats["textract"]["time_per_img"], benchmark_stats["textract"]["avg_score"]] + [benchmark_stats["textract"]["lang_scores"][l] for l in key_languages],
+        )
 
     print(tabulate(table_data, headers=table_headers, tablefmt="github"))
     print("Only a few major languages are displayed. See the result path for additional languages.")

benchmark/utils/textract.py (+29, new file)
@@ -0,0 +1,29 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import traceback
+
+from surya.input.processing import slice_bboxes_from_image
+from surya.recognition import RecognitionPredictor
+
+from textractor import Textractor
+
+def textract_ocr(extractor:Textractor, img):
+    try:
+        document = extractor.detect_document_text(file_source=img)
+        return [line.text for line in document.lines]
+    except:
+        traceback.print_exc()
+        return [None]
+
+def textract_ocr_parallel(imgs, cpus=None):
+    extractor = Textractor(profile_name='default')
+    parallel_cores = min(len(imgs), RecognitionPredictor().get_batch_size())
+    if not cpus:
+        cpus = os.cpu_count()
+    parallel_cores = min(parallel_cores, cpus)
+
+    with ThreadPoolExecutor(max_workers=parallel_cores) as executor:
+        textract_text = tqdm(executor.map(textract_ocr, [extractor]*len(imgs), imgs), total=len(imgs), desc="Running textract OCR")
+        textract_text = list(textract_text)
+    return textract_text
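
As a usage sketch, the new helper can also be called directly on a list of PIL images. This is an illustration rather than part of the commit; it assumes AWS credentials exist for the "default" profile and that the surya recognition model can be loaded (the helper sizes its thread pool from RecognitionPredictor). The image path is hypothetical.

# Hypothetical standalone use of the Textract helper.
from PIL import Image

from benchmark.utils.textract import textract_ocr_parallel

imgs = [Image.open("sample_page.png").convert("RGB")]  # hypothetical local test image
line_texts = textract_ocr_parallel(imgs, cpus=4)  # one list of detected line strings per image
print(line_texts[0][:5])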
