
Commit 349f57a
Add demo images
1 parent e0785d1

16 files changed (+49, -44 lines)

README.md (+22, -23)
@@ -8,26 +8,31 @@ Surya is for multilingual document OCR. It can do:
 
 It works on a range of documents (see [usage](#usage) and [benchmarks](#benchmarks) for more details).
 
-![New York Times Article Example](static/images/excerpt.png)
+Detection and OCR example:
 
-Surya is named after the [Hindu sun god](https://en.wikipedia.org/wiki/Surya), who has universal vision.
+| Detection | OCR |
+|---------------------------------------------------------------:|:----------------------------------------------------------------------|
+| ![New York Times Article Detection](static/images/excerpt.png) | ![New York Times Article Recognition](static/images/excerpt_text.png) |
+
+
+Surya is named for the [Hindu sun god](https://en.wikipedia.org/wiki/Surya), who has universal vision.
 
 ## Community
 
 [Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development.
 
 ## Examples
 
-| Name             | Text Detection                      |
-|------------------|-------------------------------------|
-| New York Times   | [Image](static/images/nyt.png)      |
-| Japanese         | [Image](static/images/japanese.png) |
-| Chinese          | [Image](static/images/chinese.png)  |
-| Hindi            | [Image](static/images/hindi.png)    |
-| Presentation     | [Image](static/images/pres.png)     |
-| Scientific Paper | [Image](static/images/paper.png)    |
-| Scanned Document | [Image](static/images/scanned.png)  |
-| Scanned Form     | [Image](static/images/funsd.png)    |
+| Name             | Text Detection                      | OCR                                       |
+|------------------|:-----------------------------------:|------------------------------------------:|
+| New York Times   | [Image](static/images/nyt.png)      | [Image](static/images/nyt_text.png)       |
+| Japanese         | [Image](static/images/japanese.png) | [Image](static/images/japanese_text.png)  |
+| Chinese          | [Image](static/images/chinese.png)  | [Image](static/images/chinese_text.png)   |
+| Hindi            | [Image](static/images/hindi.png)    | [Image](static/images/hindi_text.png)     |
+| Presentation     | [Image](static/images/pres.png)     | [Image](static/images/pres_text.png)      |
+| Scientific Paper | [Image](static/images/paper.png)    | [Image](static/images/paper_text.png)     |
+| Scanned Document | [Image](static/images/scanned.png)  | [Image](static/images/scanned_text.png)   |
+| Scanned Form     | [Image](static/images/funsd.png)    |                                           |
 
 # Installation
 

@@ -78,9 +83,7 @@ Setting the `RECOGNITION_BATCH_SIZE` env var properly will make a big difference
 Depending on your CPU core count, `RECOGNITION_BATCH_SIZE` might make a difference there too - the default CPU batch size is `32`.
 
 
-### From Python
-
-You can also do OCR from code with:
+### From python
 
 ```
 from PIL import Image
@@ -132,9 +135,7 @@ Depending on your CPU core count, `DETECTOR_BATCH_SIZE` might make a difference
 You can adjust `DETECTOR_NMS_THRESHOLD` and `DETECTOR_TEXT_THRESHOLD` if you don't get good results. Try lowering them to detect more text, and vice versa.
 
 
-### From Python
-
-You can also do text detection from code with:
+### From python
 
 ```
 from PIL import Image
@@ -164,11 +165,9 @@ If you want to develop surya, you can install it manually:
 # Limitations
 
 - This is specialized for document OCR. It will likely not work on photos or other images.
-- It is for printed text, not handwriting.
+- It is for printed text, not handwriting (though it may work on some handwriting).
 - The model has trained itself to ignore advertisements.
 - You can find language support for OCR in `surya/languages.py`. Text detection should work with any language.
-- Math will not be detected well with the main detector model. Use `DETECTOR_MODEL_CHECKPOINT=vikp/line_detector_math` for better results.
-
 
 # Benchmarks
 
@@ -193,11 +192,11 @@ Tesseract is CPU-based, and surya is CPU or GPU. I ran the benchmarks on a syst
 
 **Methodology**
 
-Surya predicts line-level bboxes, while tesseract and others predict word-level or character-level. It's also hard to find 100% correct datasets with line-level annotations. Merging bboxes can be noisy, so I chose not to use IoU as the metric for evaluation.
+Surya predicts line-level bboxes, while tesseract and others predict word-level or character-level. It's hard to find 100% correct datasets with line-level annotations. Merging bboxes can be noisy, so I chose not to use IoU as the metric for evaluation.
 
 I instead used coverage, which calculates:
 
-- Precision - how well predicted bboxes cover ground truth bboxes
+- Precision - how well the predicted bboxes cover ground truth bboxes
 - Recall - how well ground truth bboxes cover predicted bboxes
 
 First calculate coverage for each bbox, then add a small penalty for double coverage, since we want the detection to have non-overlapping bboxes. Anything with a coverage of 0.5 or higher is considered a match.
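The coverage metric described in this hunk can be sketched as follows. This is a hypothetical simplification: `bbox_coverage`, `intersection_area`, and the sample boxes are illustrative, not the repo's benchmark code, and the double-coverage penalty mentioned above is omitted.

```python
# Illustrative coverage metric for axis-aligned line bboxes (x1, y1, x2, y2).
def intersection_area(a, b):
    w = min(a[2], b[2]) - max(a[0], b[0])
    h = min(a[3], b[3]) - max(a[1], b[1])
    return max(w, 0) * max(h, 0)

def area(box):
    return (box[2] - box[0]) * (box[3] - box[1])

def bbox_coverage(target, covering):
    # Fraction of `target` covered by the other set of boxes
    # (the double-coverage penalty from the text is omitted for brevity).
    return min(sum(intersection_area(target, b) for b in covering) / area(target), 1.0)

preds = [(0, 0, 50, 10)]   # one predicted line bbox
truth = [(0, 0, 100, 10)]  # one ground truth line bbox

# Precision - how well the predicted bboxes cover ground truth bboxes.
precision = sum(bbox_coverage(t, preds) for t in truth) / len(truth)
# Recall - how well ground truth bboxes cover predicted bboxes.
recall = sum(bbox_coverage(p, truth) for p in preds) / len(preds)
print(precision, recall)  # 0.5 1.0
```

Per the methodology above, anything with a coverage of 0.5 or higher counts as a match, so this example box pair would still match on precision.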

benchmark/recognition.py (+13, -2)
@@ -22,7 +22,7 @@ def main():
     parser = argparse.ArgumentParser(description="Detect bboxes in a PDF.")
     parser.add_argument("--results_dir", type=str, help="Path to JSON file with OCR results.", default=os.path.join(settings.RESULT_DIR, "benchmark"))
     parser.add_argument("--max", type=int, help="Maximum number of pdf pages to OCR.", default=None)
-    parser.add_argument("--debug", action="store_true", help="Run in debug mode.", default=False)
+    parser.add_argument("--debug", type=int, help="Debug level - 1 dumps bad detection info, 2 writes out images.", default=0)
     parser.add_argument("--tesseract", action="store_true", help="Run tesseract instead of surya.", default=False)
     args = parser.parse_args()
 
@@ -54,8 +54,10 @@ def main():
     surya_time = time.time() - start
 
     surya_scores = defaultdict(list)
+    img_surya_scores = []
     for idx, (pred, ref_text, lang) in enumerate(zip(predictions_by_image, line_text, lang_list)):
         image_score = overlap_score(pred["text_lines"], ref_text)
+        img_surya_scores.append(image_score)
         for l in lang:
             surya_scores[CODE_TO_LANGUAGE[l]].append(image_score)
 
@@ -118,7 +120,16 @@ def main():
     print(tabulate(table_data, headers=table_headers, tablefmt="github"))
     print("Only a few major languages are displayed. See the result path for additional languages.")
 
-    if args.debug:
+    if args.debug >= 1:
+        bad_detections = []
+        for idx, (score, lang) in enumerate(zip(flat_surya_scores, lang_list)):
+            if score < .8:
+                bad_detections.append((idx, lang, score))
+        print(f"Found {len(bad_detections)} bad detections. Writing to file...")
+        with open(os.path.join(result_path, "bad_detections.json"), "w+") as f:
+            json.dump(bad_detections, f)
+
+    if args.debug == 2:
         for idx, (image, pred, ref_text, bbox, lang) in enumerate(zip(images, predictions_by_image, line_text, bboxes, lang_list)):
             pred_image_name = f"{'_'.join(lang)}_{idx}_pred.png"
             ref_image_name = f"{'_'.join(lang)}_{idx}_ref.png"
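The switch from a `store_true` flag to integer debug levels can be exercised in isolation. A minimal sketch, where the parser is a stand-in mirroring only the changed flag, not the benchmark's full argument list:

```python
import argparse

# Stand-in parser mirroring just the changed --debug flag.
parser = argparse.ArgumentParser()
parser.add_argument("--debug", type=int, default=0,
                    help="Debug level - 1 dumps bad detection info, 2 writes out images.")

args = parser.parse_args(["--debug", "2"])
# Level gates mirror the diff: >= 1 writes bad_detections.json, == 2 also writes images.
dump_bad_detections = args.debug >= 1
write_images = args.debug == 2
print(dump_bad_detections, write_images)  # True True
```

Note that level 2 implies level 1 here because the first branch tests `>= 1`, so both outputs are produced at `--debug 2`.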

Image files changed:

static/images/chinese_text.png (189 KB)
static/images/excerpt_text.png (115 KB)
static/images/hindi.png (27.5 KB)
static/images/hindi_text.png (182 KB)
static/images/japanese.png (248 KB)
static/images/japanese_text.png (403 KB)
static/images/nyt_text.png (793 KB)
static/images/paper_text.png (389 KB)
static/images/pres.png (-207 KB)
static/images/pres_text.png (148 KB)
static/images/scanned_text.png (300 KB)

surya/benchmark/tesseract.py (+6, -5)
@@ -3,6 +3,7 @@
 import numpy as np
 import pytesseract
 from pytesseract import Output
+from tqdm import tqdm
 
 from surya.input.processing import slice_bboxes_from_image
 from surya.settings import settings
@@ -37,11 +38,12 @@ def tesseract_ocr_parallel(imgs, bboxes, langs: List[str]):
     cpus = os.cpu_count()
     tess_parallel_cores = min(tess_parallel_cores, cpus)
 
-    # Tesseract uses 4 threads per instance
-    tess_parallel = max(tess_parallel_cores // 4, 1)
+    # Tesseract uses up to 4 processes per instance
+    # Divide by 2 because tesseract doesn't seem to saturate all 4 cores with these small images
+    tess_parallel = max(tess_parallel_cores // 2, 1)
 
     with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
-        tess_text = executor.map(tesseract_ocr, imgs, bboxes, langs)
+        tess_text = tqdm(executor.map(tesseract_ocr, imgs, bboxes, langs), total=len(imgs), desc="Running tesseract OCR")
         tess_text = list(tess_text)
     return tess_text
 
@@ -71,7 +73,7 @@ def tesseract_parallel(imgs):
     tess_parallel = max(tess_parallel_cores // 4, 1)
 
     with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
-        tess_bboxes = executor.map(tesseract_bboxes, imgs)
+        tess_bboxes = tqdm(executor.map(tesseract_bboxes, imgs), total=len(imgs), desc="Running tesseract bbox detection")
         tess_bboxes = list(tess_bboxes)
     return tess_bboxes
 
@@ -163,7 +165,6 @@ def tesseract_parallel(imgs):
     "tam": "Tamil",
     "tel": "Telugu",
     "tgk": "Tajik",
-    "tgl": "Tagalog",
     "tha": "Thai",
     "tir": "Tigrinya",
     "tur": "Turkish",
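Wrapping `executor.map` in `tqdm` works because `map` returns a lazy iterator, so the progress bar advances as each result completes rather than all at once at the end. A stdlib-only sketch of the same pattern, where a hand-rolled `progress` generator stands in for tqdm and a thread pool with a dummy `fake_ocr` stands in for the tesseract worker processes:

```python
from concurrent.futures import ThreadPoolExecutor

def progress(iterable, total, desc):
    # tqdm-like wrapper: report each item as the lazy map yields it.
    for i, item in enumerate(iterable, 1):
        print(f"{desc}: {i}/{total}")
        yield item

def fake_ocr(img):
    # Stand-in for the real tesseract_ocr worker.
    return f"text-{img}"

imgs = list(range(4))
with ThreadPoolExecutor(max_workers=2) as executor:
    results = progress(executor.map(fake_ocr, imgs), total=len(imgs), desc="Running OCR")
    # Consume inside the with-block, as the diff does with list(tess_text).
    results = list(results)
print(results)  # ['text-0', 'text-1', 'text-2', 'text-3']
```

`executor.map` preserves input order, which is why the diff can safely wrap it and then call `list` without reordering results.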

surya/input/processing.py (+7, -13)
@@ -1,3 +1,4 @@
+import os
 from typing import List
 
 import numpy as np
@@ -69,37 +70,30 @@ def slice_bboxes_from_image(image: Image.Image, bboxes):
 
 def slice_polys_from_image(image: Image.Image, polys):
     lines = []
-    for poly in polys:
-        lines.append(slice_and_pad_poly(image, poly))
+    for idx, poly in enumerate(polys):
+        lines.append(slice_and_pad_poly(image, poly, idx))
     return lines
 
 
-def slice_and_pad_poly(image: Image.Image, coordinates):
+def slice_and_pad_poly(image: Image.Image, coordinates, idx):
     # Create a mask for the polygon
     mask = Image.new('L', image.size, 0)
 
     # coordinates must be in tuple form for PIL
     coordinates = [(corner[0], corner[1]) for corner in coordinates]
     ImageDraw.Draw(mask).polygon(coordinates, outline=1, fill=1)
+    bbox = mask.getbbox()
     mask = np.array(mask)
 
     # Extract the polygonal area from the image
     polygon_image = np.array(image)
-    polygon_image[~mask] = 0
+    polygon_image[mask == 0] = 0
     polygon_image = Image.fromarray(polygon_image)
 
-    bbox_image = Image.new('L', image.size, 0)
-    ImageDraw.Draw(bbox_image).polygon(coordinates, outline=1, fill=1)
-    bbox = bbox_image.getbbox()
-
     rectangle = Image.new('RGB', (bbox[2] - bbox[0], bbox[3] - bbox[1]), 'white')
 
     # Paste the polygon into the rectangle
-    polygon_center = (bbox[2] + bbox[0]) // 2, (bbox[3] + bbox[1]) // 2
-    rectangle_center = rectangle.width // 2, rectangle.height // 2
-    paste_position = (rectangle_center[0] - polygon_center[0] + bbox[0],
-                      rectangle_center[1] - polygon_center[1] + bbox[1])
-    rectangle.paste(polygon_image.crop(bbox), paste_position)
+    rectangle.paste(polygon_image.crop(bbox), (0, 0))
 
     return rectangle
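The `polygon_image[~mask] = 0` to `polygon_image[mask == 0] = 0` change in this hunk is worth unpacking: `mask` comes from an `'L'` image, so it is uint8, and `~mask` is bitwise NOT (255/254), which numpy treats as integer fancy indices rather than a boolean mask; `mask == 0` produces a true boolean array. A small demonstration with made-up 1-D pixel values (not the repo's code):

```python
import numpy as np

mask = np.array([0, 1, 1, 0], dtype=np.uint8)
print((~mask).tolist())      # [255, 254, 254, 255] - bitwise NOT, not logical negation
print((mask == 0).tolist())  # [True, False, False, True] - a proper boolean mask

img = np.array([10, 20, 30, 40])
# img[~mask] would fancy-index with 255/254 and raise IndexError on this
# small array; the boolean comparison zeroes the masked-out pixels as intended.
img[mask == 0] = 0
print(img.tolist())  # [0, 20, 30, 0]
```

On a full-size image the integer indices may stay in bounds and silently select the wrong rows, which is why the boolean form is the safe fix.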

surya/ocr.py (+1, -1)
@@ -16,7 +16,7 @@ def run_recognition(images: List[Image.Image], langs: List[List[str]], rec_model
     slice_map = []
     all_slices = []
     all_langs = []
-    for idx, (image, lang) in tqdm(enumerate(zip(images, langs)), desc="Slicing images"):
+    for idx, (image, lang) in enumerate(zip(images, langs)):
         if polygons is not None:
             slices = slice_polys_from_image(image, polygons[idx])
         else:
