
Commit e790f50

Benchmark updates
1 parent a7b7700 commit e790f50

6 files changed (+47 -35 lines)

README.md (+10 -12)
@@ -222,7 +222,7 @@ The `results.json` file will contain a json dictionary where the keys are the in

 **Performance tips**

-Setting the `DETECTOR_BATCH_SIZE` env var properly will make a big difference when using a GPU. Each batch item will use `400MB` of VRAM, so very high batch sizes are possible. The default is a batch size `36`, which will use about 16GB of VRAM. Depending on your CPU core count, it might help, too - the default CPU batch size is `6`.
+Setting the `LAYOUT_BATCH_SIZE` env var properly will make a big difference when using a GPU. Each batch item will use `220MB` of VRAM, so very high batch sizes are possible. The default is a batch size `32`, which will use about 7GB of VRAM. Depending on your CPU core count, it might help, too - the default CPU batch size is `4`.

 ### From python

@@ -278,7 +278,6 @@ The `results.json` file will contain a json dictionary where the keys are the in

 Setting the `TABLE_REC_BATCH_SIZE` env var properly will make a big difference when using a GPU. Each batch item will use `150MB` of VRAM, so very high batch sizes are possible. The default is a batch size `64`, which will use about 10GB of VRAM. Depending on your CPU core count, it might help, too - the default CPU batch size is `8`.

-
 # Limitations

 - This is specialized for document OCR. It will likely not work on photos or other images.
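
Both performance tips above work through environment variables. A minimal sketch of setting them from Python (an illustration, not from this commit). Pydantic-style settings read env vars when the settings object is created, so set them before importing surya:

```python
import os

# Example values, not the defaults: each layout batch item uses ~220MB of
# VRAM and each table-rec item ~150MB, so size these to your GPU.
os.environ["LAYOUT_BATCH_SIZE"] = "64"
os.environ["TABLE_REC_BATCH_SIZE"] = "128"
```

The same can be done in a shell: `LAYOUT_BATCH_SIZE=64 python your_script.py`.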
@@ -364,16 +363,15 @@ Then we calculate precision and recall for the whole dataset.

 ## Layout analysis

-![Benchmark chart](static/images/benchmark_layout_chart.png)
-
-| Layout Type | precision | recall |
-| ----------- | --------- | ------ |
-| Image | 0.97 | 0.96 |
-| Table | 0.99 | 0.99 |
-| Text | 0.9 | 0.97 |
-| Title | 0.94 | 0.88 |
+| Layout Type | precision | recall  |
+|-------------|-----------|---------|
+| Image       | 0.91265   | 0.93976 |
+| List        | 0.80849   | 0.86792 |
+| Table       | 0.84957   | 0.96104 |
+| Text        | 0.93019   | 0.94571 |
+| Title       | 0.92102   | 0.95404 |

-Time per image - .4 seconds on GPU (A10).
+Time per image - .13 seconds on GPU (A10).

 **Methodology**

@@ -384,7 +382,7 @@ I benchmarked the layout analysis on [Publaynet](https://github.com/ibm-aur-nlp/

 ## Reading Order

-75% mean accuracy, and .14 seconds per image on an A6000 GPU. See methodology for notes - this benchmark is not a perfect measure of accuracy, and is more useful as a sanity check.
+88% mean accuracy, and .4 seconds per image on an A10 GPU. See methodology for notes - this benchmark is not a perfect measure of accuracy, and is more useful as a sanity check.

 **Methodology**

benchmark/layout.py (+4 -3)

@@ -44,10 +44,11 @@ def main():
     os.makedirs(result_path, exist_ok=True)

     label_alignment = { # First is publaynet, second is surya
-        "Image": [["Figure"], ["Picture", "Figure", "Handwriting"]],
+        "Image": [["Figure"], ["Picture", "Figure"]],
         "Table": [["Table"], ["Table", "Form", "TableOfContents"]],
-        "Text": [["Text", "List"], ["Text", "Formula", "Footnote", "Caption", "ListItem", "TextInlineMath", "Code", "PageFooter"]],
-        "Title": [["Title"], ["SectionHeader", "Title", "PageHeader"]]
+        "Text": [["Text"], ["Text", "Formula", "Footnote", "Caption", "TextInlineMath", "Code", "Handwriting"]],
+        "List": [["List"], ["ListItem"]],
+        "Title": [["Title"], ["SectionHeader", "Title"]]
     }

     page_metrics = collections.OrderedDict()
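
For context, a hypothetical sketch (not part of this commit) of how an alignment table like `label_alignment` can be consumed: each side's dataset-specific labels are collapsed into the shared categories before scoring, so publaynet's `Figure` and surya's `Picture`/`Figure` both count as `Image`.

```python
def align_labels(labeled_boxes, alignment, source_idx):
    """Group bboxes under shared categories.

    labeled_boxes: list of (label, bbox) pairs.
    source_idx: 0 for publaynet labels, 1 for surya labels.
    """
    aligned = {}
    for category, label_sets in alignment.items():
        wanted = set(label_sets[source_idx])
        aligned[category] = [bbox for label, bbox in labeled_boxes if label in wanted]
    return aligned
```

The helper name and signature are illustrative; the benchmark's real loop may structure this differently.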

pyproject.toml (+1 -1)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "surya-ocr"
-version = "0.6.13"
+version = "0.7.0"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 readme = "README.md"

surya/benchmark/metrics.py (+29 -17)

@@ -2,20 +2,9 @@
 from itertools import repeat

 import numpy as np
-from concurrent.futures import ProcessPoolExecutor
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


-def intersection_area(box1, box2):
-    x_left = max(box1[0], box2[0])
-    y_top = max(box1[1], box2[1])
-    x_right = min(box1[2], box2[2])
-    y_bottom = min(box1[3], box2[3])
-
-    if x_right < x_left or y_bottom < y_top:
-        return 0.0
-
-    return (x_right - x_left) * (y_bottom - y_top)
-
 def box_area(box):
     return (box[2] - box[0]) * (box[3] - box[1])

@@ -109,16 +98,39 @@ def calculate_coverage(box, other_boxes, penalize_double=False):
     return covered_pixels_count / box_area


+def intersection_area(box1, box2):
+    x_left = max(box1[0], box2[0])
+    y_top = max(box1[1], box2[1])
+    x_right = min(box1[2], box2[2])
+    y_bottom = min(box1[3], box2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    return (x_right - x_left) * (y_bottom - y_top)
+
+
 def calculate_coverage_fast(box, other_boxes, penalize_double=False):
+    box = np.array(box)
+    other_boxes = np.array(other_boxes)
+
+    # Calculate box area
     box_area = (box[2] - box[0]) * (box[3] - box[1])
     if box_area == 0:
         return 0

-    total_intersect = 0
-    for other_box in other_boxes:
-        total_intersect += intersection_area(box, other_box)
+    x_left = np.maximum(box[0], other_boxes[:, 0])
+    y_top = np.maximum(box[1], other_boxes[:, 1])
+    x_right = np.minimum(box[2], other_boxes[:, 2])
+    y_bottom = np.minimum(box[3], other_boxes[:, 3])
+
+    widths = np.maximum(0, x_right - x_left)
+    heights = np.maximum(0, y_bottom - y_top)
+    intersect_areas = widths * heights
+
+    total_intersect = np.sum(intersect_areas)

-    return min(1, total_intersect / box_area)
+    return min(1.0, total_intersect / box_area)


 def precision_recall(preds, references, threshold=.5, workers=8, penalize_double=True):

@@ -139,7 +151,7 @@ def precision_recall(preds, references, threshold=.5, workers=8, penalize_double
     if penalize_double:
         coverage_func = calculate_coverage

-    with ProcessPoolExecutor(max_workers=workers) as executor:
+    with ThreadPoolExecutor(max_workers=workers) as executor:
         precision_func = partial(coverage_func, penalize_double=penalize_double)
         precision_iou = executor.map(precision_func, preds, repeat(references))
         reference_iou = executor.map(coverage_func, references, repeat(preds))
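
A quick usage sketch of the vectorized `calculate_coverage_fast` above, with made-up boxes: a 10×10 box whose left half and top-right quarter are covered by two predictions.

```python
box = [0, 0, 10, 10]                     # x1, y1, x2, y2
others = [[0, 0, 5, 10], [5, 0, 10, 5]]  # left half (50 px) + top-right quarter (25 px)

print(calculate_coverage_fast(box, others))  # 75 / 100 -> 0.75
```

A plausible reading of the executor change, though the commit message does not say: once the per-box work is a few numpy calls, `ThreadPoolExecutor` avoids the pickling overhead that `ProcessPoolExecutor` pays to ship the box lists between processes.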

surya/layout.py (+1 -1)

@@ -157,7 +157,7 @@ def batch_layout_detection(images: List, model, processor, batch_size=None) -> L
             prediction["pause_tokens"] = last_prediction["pause_tokens"]
             prediction["token"].fill_(model.decoder.config.pause_token_id)
             batch_decoder_input[j, :] = model.decoder.config.pause_token_id
-        elif intersects_other_boxes(prediction["polygon"], [p["polygon"] for p in batch_predictions[j]], thresh=.7):
+        elif intersects_other_boxes(prediction["polygon"], [p["polygon"] for p in batch_predictions[j]], thresh=.4):
            prediction["paused"] = True
            prediction["pause_tokens"] = 1
            prediction["token"].fill_(model.decoder.config.pause_token_id)

surya/settings.py (+2 -1)

@@ -69,8 +69,9 @@ def TORCH_DEVICE_MODEL(self) -> str:
     LAYOUT_IMAGE_SIZE: Dict = {"height": 768, "width": 768}
     LAYOUT_BATCH_SIZE: Optional[int] = None
     LAYOUT_BENCH_DATASET_NAME: str = "vikp/publaynet_bench"
-    LAYOUT_MAX_BOXES: int = 150
+    LAYOUT_MAX_BOXES: int = 100
     COMPILE_LAYOUT: bool = False
+    ORDER_BENCH_DATASET_NAME: str = "vikp/order_bench"

     # Table Rec
     TABLE_REC_MODEL_CHECKPOINT: str = "vikp/surya_tablerec"
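
A small sketch of reading these values back, assuming surya exposes a module-level `settings` object whose fields can be overridden by environment variables of the same name (pydantic-style):

```python
from surya.settings import settings  # assumed import path

print(settings.LAYOUT_MAX_BOXES)          # 100 unless LAYOUT_MAX_BOXES is set
print(settings.ORDER_BENCH_DATASET_NAME)  # "vikp/order_bench"
```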
